summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPaul Kirth <paulkirth@google.com>2024-02-28 22:33:21 +0000
committerPaul Kirth <paulkirth@google.com>2024-02-28 22:33:21 +0000
commit25b6b3817ad385aad4f55c38537b811e61eec608 (patch)
treed58e0596746a2532d6c2a7de701b9982e745fb63
parentb2ea075b59df25e93d21a57da9da146081b3408a (diff)
parent777ac46ddbc318b5d5820d278a2e4dc2213699d8 (diff)
Created using spr 1.3.4 [skip ci]
-rw-r--r--bolt/include/bolt/Core/BinaryContext.h4
-rw-r--r--bolt/include/bolt/Core/DIEBuilder.h7
-rw-r--r--bolt/include/bolt/Core/DebugData.h9
-rw-r--r--bolt/include/bolt/Core/DebugNames.h172
-rw-r--r--bolt/include/bolt/Core/MCPlusBuilder.h21
-rw-r--r--bolt/include/bolt/Rewrite/DWARFRewriter.h7
-rw-r--r--bolt/lib/Core/BinaryContext.cpp2
-rw-r--r--bolt/lib/Core/BinaryEmitter.cpp2
-rw-r--r--bolt/lib/Core/BinaryFunction.cpp2
-rw-r--r--bolt/lib/Core/CMakeLists.txt1
-rw-r--r--bolt/lib/Core/DIEBuilder.cpp31
-rw-r--r--bolt/lib/Core/DebugNames.cpp618
-rw-r--r--bolt/lib/Core/Exceptions.cpp5
-rw-r--r--bolt/lib/Core/MCPlusBuilder.cpp19
-rw-r--r--bolt/lib/Passes/Instrumentation.cpp96
-rw-r--r--bolt/lib/Rewrite/DWARFRewriter.cpp34
-rw-r--r--bolt/lib/Rewrite/LinuxKernelRewriter.cpp14
-rw-r--r--bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp26
-rw-r--r--bolt/test/AArch64/exclusive-instrument.s90
-rw-r--r--bolt/test/X86/Inputs/dwarf5-debug-names-helper.s487
-rw-r--r--bolt/test/X86/Inputs/dwarf5-debug-names-main.s712
-rw-r--r--bolt/test/X86/Inputs/dwarf5-df-debug-names-helper.s326
-rw-r--r--bolt/test/X86/Inputs/dwarf5-df-debug-names-main.s493
-rw-r--r--bolt/test/X86/Inputs/dwarf5-df-types-debug-names-helper.s650
-rw-r--r--bolt/test/X86/Inputs/dwarf5-df-types-debug-names-main.s626
-rw-r--r--bolt/test/X86/Inputs/dwarf5-types-debug-names-helper.s405
-rw-r--r--bolt/test/X86/Inputs/dwarf5-types-debug-names-main.s391
-rw-r--r--bolt/test/X86/dwarf5-debug-names-generate-debug-names.test154
-rw-r--r--bolt/test/X86/dwarf5-debug-names.test263
-rw-r--r--bolt/test/X86/dwarf5-df-debug-names-generate-debug-names.test192
-rw-r--r--bolt/test/X86/dwarf5-df-debug-names.test149
-rw-r--r--bolt/test/X86/dwarf5-df-one-cu-debug-names.test101
-rw-r--r--bolt/test/X86/dwarf5-df-types-debug-names.test234
-rw-r--r--bolt/test/X86/dwarf5-df-types-one-cu-debug-names.test129
-rw-r--r--bolt/test/X86/dwarf5-one-cu-debug-names.test178
-rw-r--r--bolt/test/X86/dwarf5-types-debug-names.test129
-rw-r--r--bolt/test/X86/dwarf5-types-one-cu-debug-names.test98
-rw-r--r--bolt/test/runtime/X86/instrument-wrong-target.s (renamed from bolt/test/runtime/instrument-wrong-target.s)3
-rw-r--r--clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp3
-rw-r--r--clang-tools-extra/docs/ReleaseNotes.rst4
-rw-r--r--clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp15
-rw-r--r--clang/docs/ClangFormatStyleOptions.rst140
-rw-r--r--clang/docs/LanguageExtensions.rst31
-rw-r--r--clang/docs/LibASTMatchersReference.html14
-rw-r--r--clang/docs/ReleaseNotes.rst7
-rwxr-xr-xclang/docs/tools/dump_ast_matchers.py2
-rw-r--r--clang/include/clang/AST/ASTContext.h4
-rw-r--r--clang/include/clang/AST/Expr.h13
-rw-r--r--clang/include/clang/AST/FormatString.h11
-rw-r--r--clang/include/clang/ASTMatchers/ASTMatchers.h16
-rw-r--r--clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h2
-rw-r--r--clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h7
-rw-r--r--clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h32
-rw-r--r--clang/include/clang/Basic/Builtins.td6
-rw-r--r--clang/include/clang/Basic/DiagnosticSemaKinds.td3
-rw-r--r--clang/include/clang/Basic/TargetOSMacros.def5
-rw-r--r--clang/include/clang/Format/Format.h12
-rw-r--r--clang/include/clang/InstallAPI/Context.h22
-rw-r--r--clang/include/clang/InstallAPI/Frontend.h48
-rw-r--r--clang/include/clang/InstallAPI/HeaderFile.h23
-rw-r--r--clang/include/clang/InstallAPI/Visitor.h51
-rw-r--r--clang/lib/APINotes/APINotesReader.cpp8
-rw-r--r--clang/lib/APINotes/APINotesWriter.cpp1
-rw-r--r--clang/lib/AST/ASTContext.cpp36
-rw-r--r--clang/lib/AST/Expr.cpp15
-rw-r--r--clang/lib/AST/FormatString.cpp25
-rw-r--r--clang/lib/AST/Interp/ByteCodeExprGen.cpp85
-rw-r--r--clang/lib/AST/Interp/ByteCodeExprGen.h36
-rw-r--r--clang/lib/AST/Interp/ByteCodeStmtGen.cpp2
-rw-r--r--clang/lib/AST/Interp/EvalEmitter.cpp5
-rw-r--r--clang/lib/AST/Interp/EvaluationResult.cpp2
-rw-r--r--clang/lib/AST/Interp/EvaluationResult.h3
-rw-r--r--clang/lib/AST/Interp/Interp.h21
-rw-r--r--clang/lib/AST/Interp/InterpBuiltin.cpp10
-rw-r--r--clang/lib/AST/Interp/Opcodes.td5
-rw-r--r--clang/lib/AST/Interp/Pointer.h1
-rw-r--r--clang/lib/AST/PrintfFormatString.cpp85
-rw-r--r--clang/lib/Analysis/ThreadSafetyCommon.cpp4
-rw-r--r--clang/lib/Analysis/UnsafeBufferUsage.cpp259
-rw-r--r--clang/lib/Basic/Targets/Mips.h4
-rw-r--r--clang/lib/CodeGen/ABIInfo.cpp52
-rw-r--r--clang/lib/CodeGen/ABIInfo.h10
-rw-r--r--clang/lib/CodeGen/CGBuiltin.cpp3
-rw-r--r--clang/lib/CodeGen/CGCall.cpp17
-rw-r--r--clang/lib/CodeGen/CGExprAgg.cpp13
-rw-r--r--clang/lib/CodeGen/CGExprComplex.cpp14
-rw-r--r--clang/lib/CodeGen/CGExprScalar.cpp32
-rw-r--r--clang/lib/CodeGen/CGStmt.cpp73
-rw-r--r--clang/lib/CodeGen/CodeGenFunction.cpp9
-rw-r--r--clang/lib/CodeGen/CodeGenFunction.h2
-rw-r--r--clang/lib/CodeGen/CodeGenModule.cpp116
-rw-r--r--clang/lib/CodeGen/CodeGenPGO.cpp150
-rw-r--r--clang/lib/CodeGen/CodeGenPGO.h6
-rw-r--r--clang/lib/CodeGen/CodeGenTBAA.cpp44
-rw-r--r--clang/lib/CodeGen/CodeGenTBAA.h7
-rw-r--r--clang/lib/CodeGen/CoverageMappingGen.cpp206
-rw-r--r--clang/lib/CodeGen/Targets/AArch64.cpp40
-rw-r--r--clang/lib/Driver/ToolChain.cpp18
-rw-r--r--clang/lib/Driver/ToolChains/Clang.cpp2
-rw-r--r--clang/lib/Driver/ToolChains/CommonArgs.cpp15
-rw-r--r--clang/lib/Format/Format.cpp5
-rw-r--r--clang/lib/Format/WhitespaceManager.cpp9
-rw-r--r--clang/lib/Format/WhitespaceManager.h3
-rw-r--r--clang/lib/Headers/emmintrin.h121
-rw-r--r--clang/lib/Headers/fmaintrin.h48
-rw-r--r--clang/lib/Headers/mmintrin.h148
-rw-r--r--clang/lib/Headers/prfchwintrin.h16
-rw-r--r--clang/lib/Headers/smmintrin.h20
-rw-r--r--clang/lib/Headers/tmmintrin.h36
-rw-r--r--clang/lib/InstallAPI/CMakeLists.txt3
-rw-r--r--clang/lib/InstallAPI/Frontend.cpp58
-rw-r--r--clang/lib/InstallAPI/Visitor.cpp94
-rw-r--r--clang/lib/Lex/LiteralSupport.cpp9
-rw-r--r--clang/lib/Sema/SemaChecking.cpp33
-rw-r--r--clang/lib/Sema/SemaExpr.cpp5
-rw-r--r--clang/lib/Sema/SemaExprCXX.cpp2
-rw-r--r--clang/lib/Sema/SemaOverload.cpp36
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp8
-rw-r--r--clang/test/AST/Interp/c.c16
-rw-r--r--clang/test/AST/Interp/cxx11.cpp8
-rw-r--r--clang/test/AST/Interp/cxx20.cpp210
-rw-r--r--clang/test/AST/Interp/spaceship.cpp232
-rw-r--r--clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp17
-rw-r--r--clang/test/CodeGen/Mips/inline-asm-constraints.c11
-rw-r--r--clang/test/CodeGen/builtins.c43
-rw-r--r--clang/test/CodeGen/tbaa-struct.cpp24
-rw-r--r--clang/test/CodeGenHLSL/semantics/GroupIndex-codegen.hlsl6
-rw-r--r--clang/test/CoverageMapping/single-byte-counters.cpp169
-rw-r--r--clang/test/Driver/Inputs/resource_dir/lib/aarch64-unknown-linux/libclang_rt.hwasan.a (renamed from clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.asan-i386.a.syms)0
-rw-r--r--clang/test/Driver/Inputs/resource_dir/lib/aarch64-unknown-linux/libclang_rt.hwasan.a.syms (renamed from clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.asan-x86_64.a.syms)0
-rw-r--r--clang/test/Driver/Inputs/resource_dir/lib/i386-unknown-linux/libclang_rt.asan.a (renamed from clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.hwasan-aarch64.a.syms)0
-rw-r--r--clang/test/Driver/Inputs/resource_dir/lib/i386-unknown-linux/libclang_rt.asan.a.syms (renamed from clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.hwasan-x86_64.a.syms)0
-rw-r--r--clang/test/Driver/Inputs/resource_dir/lib/i686-unknown-linux/clang_rt.crtbegin.o (renamed from clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.msan-x86_64.a.syms)0
-rw-r--r--clang/test/Driver/Inputs/resource_dir/lib/i686-unknown-linux/clang_rt.crtend.o (renamed from clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.msan_cxx-x86_64.a.syms)0
-rw-r--r--clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/clang_rt.crtbegin.o (renamed from clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.tsan-x86_64.a.syms)0
-rw-r--r--clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/clang_rt.crtend.o (renamed from clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.tsan_cxx-x86_64.a.syms)0
-rw-r--r--clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.asan.a (renamed from clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.ubsan-i386.a.syms)0
-rw-r--r--clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.asan.a.syms (renamed from clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.ubsan-x86_64.a.syms)0
-rw-r--r--clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.hwasan.a (renamed from clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.ubsan_cxx-i386.a.syms)0
-rw-r--r--clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.hwasan.a.syms (renamed from clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.ubsan_cxx-x86_64.a.syms)0
-rw-r--r--clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan.a0
-rw-r--r--clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan.a.syms0
-rw-r--r--clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan_cxx.a0
-rw-r--r--clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan_cxx.a.syms0
-rw-r--r--clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan.a0
-rw-r--r--clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan.a.syms0
-rw-r--r--clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan_cxx.a0
-rw-r--r--clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan_cxx.a.syms0
-rw-r--r--clang/test/Driver/arch-specific-libdir.c2
-rw-r--r--clang/test/Driver/arm-compiler-rt.c2
-rw-r--r--clang/test/Driver/baremetal-sysroot.cpp2
-rw-r--r--clang/test/Driver/baremetal.cpp38
-rw-r--r--clang/test/Driver/basic-block-address-map.c7
-rw-r--r--clang/test/Driver/compiler-rt-unwind.c8
-rw-r--r--clang/test/Driver/coverage-ld.c4
-rw-r--r--clang/test/Driver/fdefine-target-os-macros.c46
-rw-r--r--clang/test/Driver/fuchsia.c6
-rw-r--r--clang/test/Driver/instrprof-ld.c10
-rw-r--r--clang/test/Driver/linux-ld.c16
-rw-r--r--clang/test/Driver/print-libgcc-file-name-clangrt.c2
-rw-r--r--clang/test/Driver/sanitizer-ld.c110
-rw-r--r--clang/test/Frontend/fixed_point_declarations.c18
-rw-r--r--clang/test/InstallAPI/basic.test5
-rw-r--r--clang/test/InstallAPI/variables.test63
-rw-r--r--clang/test/Sema/builtin-popcountg.c14
-rw-r--r--clang/test/Sema/builtins-elementwise-math.c4
-rw-r--r--clang/test/Sema/format-fixed-point.c148
-rw-r--r--clang/test/Sema/inline-asm-validate-mips.c9
-rw-r--r--clang/test/Sema/warn-compare-enum-types-mismatch.c42
-rw-r--r--clang/test/Sema/warn-conditional-enum-types-mismatch.c (renamed from clang/test/Sema/warn-conditional-emum-types-mismatch.c)2
-rw-r--r--clang/test/Sema/warn-overlap.c4
-rw-r--r--clang/test/SemaCXX/gh53815.cpp21
-rw-r--r--clang/test/SemaHLSL/VectorOverloadResolution.hlsl44
-rw-r--r--clang/test/SemaTemplate/ms-lookup-template-base-classes.cpp7
-rw-r--r--clang/tools/clang-installapi/CMakeLists.txt1
-rw-r--r--clang/tools/clang-installapi/ClangInstallAPI.cpp76
-rw-r--r--clang/tools/clang-installapi/Options.cpp29
-rw-r--r--clang/tools/clang-installapi/Options.h8
-rw-r--r--clang/tools/clang-offload-packager/ClangOffloadPackager.cpp2
-rw-r--r--clang/unittests/Format/FormatTestTableGen.cpp14
-rw-r--r--compiler-rt/CMakeLists.txt8
-rw-r--r--compiler-rt/include/profile/InstrProfData.inc54
-rw-r--r--compiler-rt/lib/builtins/CMakeLists.txt2
-rw-r--r--compiler-rt/lib/hwasan/hwasan_report.cpp1
-rw-r--r--compiler-rt/lib/hwasan/hwasan_thread_list.cpp1
-rw-r--r--compiler-rt/lib/hwasan/hwasan_thread_list.h1
-rw-r--r--compiler-rt/lib/profile/InstrProfiling.h38
-rw-r--r--compiler-rt/lib/profile/InstrProfilingBuffer.c89
-rw-r--r--compiler-rt/lib/profile/InstrProfilingFile.c11
-rw-r--r--compiler-rt/lib/profile/InstrProfilingInternal.h8
-rw-r--r--compiler-rt/lib/profile/InstrProfilingMerge.c23
-rw-r--r--compiler-rt/lib/profile/InstrProfilingPlatformAIX.c14
-rw-r--r--compiler-rt/lib/profile/InstrProfilingPlatformDarwin.c23
-rw-r--r--compiler-rt/lib/profile/InstrProfilingPlatformLinux.c21
-rw-r--r--compiler-rt/lib/profile/InstrProfilingPlatformOther.c15
-rw-r--r--compiler-rt/lib/profile/InstrProfilingPlatformWindows.c19
-rw-r--r--compiler-rt/lib/profile/InstrProfilingWriter.c37
-rw-r--r--compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp1
-rw-r--r--compiler-rt/lib/scudo/standalone/combined.h8
-rw-r--r--compiler-rt/test/builtins/Unit/ctor_dtor.c10
-rw-r--r--compiler-rt/test/profile/instrprof-basic.c12
-rw-r--r--compiler-rt/test/profile/instrprof-write-buffer-internal.c6
-rw-r--r--flang/include/flang/Common/float128.h23
-rw-r--r--flang/include/flang/ISO_Fortran_binding.h8
-rw-r--r--flang/include/flang/Lower/AbstractConverter.h6
-rw-r--r--flang/include/flang/Lower/PFTBuilder.h5
-rw-r--r--flang/include/flang/Optimizer/Builder/FIRBuilder.h3
-rw-r--r--flang/include/flang/Optimizer/Dialect/FIROps.td27
-rw-r--r--flang/include/flang/Runtime/reduction.h17
-rw-r--r--flang/lib/Lower/Bridge.cpp153
-rw-r--r--flang/lib/Lower/ConvertVariable.cpp7
-rw-r--r--flang/lib/Lower/HostAssociations.cpp64
-rw-r--r--flang/lib/Lower/OpenMP/DataSharingProcessor.cpp108
-rw-r--r--flang/lib/Lower/OpenMP/DataSharingProcessor.h38
-rw-r--r--flang/lib/Lower/OpenMP/OpenMP.cpp91
-rw-r--r--flang/lib/Lower/OpenMP/Utils.cpp6
-rw-r--r--flang/lib/Lower/OpenMP/Utils.h1
-rw-r--r--flang/lib/Optimizer/Builder/FIRBuilder.cpp8
-rw-r--r--flang/lib/Optimizer/Builder/IntrinsicCall.cpp58
-rw-r--r--flang/lib/Optimizer/Builder/MutableBox.cpp2
-rw-r--r--flang/lib/Optimizer/Builder/Runtime/Reduction.cpp25
-rw-r--r--flang/lib/Optimizer/Dialect/FIROps.cpp97
-rw-r--r--flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp9
-rw-r--r--flang/runtime/Float128Math/CMakeLists.txt52
-rw-r--r--flang/runtime/Float128Math/acos.cpp22
-rw-r--r--flang/runtime/Float128Math/acosh.cpp22
-rw-r--r--flang/runtime/Float128Math/asin.cpp22
-rw-r--r--flang/runtime/Float128Math/asinh.cpp22
-rw-r--r--flang/runtime/Float128Math/atan.cpp22
-rw-r--r--flang/runtime/Float128Math/atan2.cpp23
-rw-r--r--flang/runtime/Float128Math/atanh.cpp22
-rw-r--r--flang/runtime/Float128Math/cabs.cpp12
-rw-r--r--flang/runtime/Float128Math/ceil.cpp22
-rw-r--r--flang/runtime/Float128Math/cos.cpp22
-rw-r--r--flang/runtime/Float128Math/cosh.cpp22
-rw-r--r--flang/runtime/Float128Math/erf.cpp22
-rw-r--r--flang/runtime/Float128Math/erfc.cpp22
-rw-r--r--flang/runtime/Float128Math/exp.cpp22
-rw-r--r--flang/runtime/Float128Math/floor.cpp22
-rw-r--r--flang/runtime/Float128Math/hypot.cpp23
-rw-r--r--flang/runtime/Float128Math/j0.cpp22
-rw-r--r--flang/runtime/Float128Math/j1.cpp22
-rw-r--r--flang/runtime/Float128Math/jn.cpp22
-rw-r--r--flang/runtime/Float128Math/lgamma.cpp22
-rw-r--r--flang/runtime/Float128Math/llround.cpp22
-rw-r--r--flang/runtime/Float128Math/log.cpp22
-rw-r--r--flang/runtime/Float128Math/log10.cpp22
-rw-r--r--flang/runtime/Float128Math/lround.cpp22
-rw-r--r--flang/runtime/Float128Math/math-entries.h157
-rw-r--r--flang/runtime/Float128Math/norm2.cpp59
-rw-r--r--flang/runtime/Float128Math/pow.cpp23
-rw-r--r--flang/runtime/Float128Math/round.cpp26
-rw-r--r--flang/runtime/Float128Math/sinh.cpp22
-rw-r--r--flang/runtime/Float128Math/tan.cpp22
-rw-r--r--flang/runtime/Float128Math/tanh.cpp22
-rw-r--r--flang/runtime/Float128Math/tgamma.cpp22
-rw-r--r--flang/runtime/Float128Math/trunc.cpp26
-rw-r--r--flang/runtime/Float128Math/y0.cpp22
-rw-r--r--flang/runtime/Float128Math/y1.cpp22
-rw-r--r--flang/runtime/Float128Math/yn.cpp22
-rw-r--r--flang/runtime/complex-reduction.c49
-rw-r--r--flang/runtime/complex-reduction.h13
-rw-r--r--flang/runtime/extrema.cpp107
-rw-r--r--flang/runtime/product.cpp6
-rw-r--r--flang/runtime/reduction-templates.h115
-rw-r--r--flang/runtime/sum.cpp3
-rw-r--r--flang/runtime/tools.h11
-rw-r--r--flang/test/Driver/linker-flags.f9034
-rw-r--r--flang/test/Integration/OpenMP/copyprivate.f9097
-rw-r--r--flang/test/Integration/iso-fortran-binding.cpp33
-rw-r--r--flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf51
-rw-r--r--flang/test/Lower/CUDA/cuda-mod.cuf15
-rw-r--r--flang/test/Lower/CUDA/cuda-module-use.cuf25
-rw-r--r--flang/test/Lower/HLFIR/allocatable-end-of-scope-dealloc.f902
-rw-r--r--flang/test/Lower/HLFIR/bindc_internal_proc.f904
-rw-r--r--flang/test/Lower/HLFIR/internal-procedures-2.f902
-rw-r--r--flang/test/Lower/HLFIR/internal-procedures-polymorphic.f9081
-rw-r--r--flang/test/Lower/HLFIR/internal-procedures.f9012
-rw-r--r--flang/test/Lower/Intrinsics/acos_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/acosh_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/aint_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/anint_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/asin_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/asinh_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/atan2_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/atan_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/atanh_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/bessel_j0_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/bessel_j1_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/bessel_jn_real16.f9010
-rw-r--r--flang/test/Lower/Intrinsics/bessel_y0_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/bessel_y1_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/bessel_yn_real16.f9010
-rw-r--r--flang/test/Lower/Intrinsics/cabs_real16.f9010
-rw-r--r--flang/test/Lower/Intrinsics/ceiling_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/cos_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/cosh_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/erf_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/erfc_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/exp_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/floor_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/gamma_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/hypot_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/log10_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/log_gamma_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/log_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/missing-math-runtime.f9012
-rw-r--r--flang/test/Lower/Intrinsics/nint_real16.f9013
-rw-r--r--flang/test/Lower/Intrinsics/norm2.f9016
-rw-r--r--flang/test/Lower/Intrinsics/pow_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/powi_real16.f9022
-rw-r--r--flang/test/Lower/Intrinsics/random.f904
-rw-r--r--flang/test/Lower/Intrinsics/sin_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/sinh_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/sqrt_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/tan_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/tanh_real16.f909
-rw-r--r--flang/test/Lower/Intrinsics/ubound01.f902
-rw-r--r--flang/test/Lower/OpenACC/acc-routine04.f902
-rw-r--r--flang/test/Lower/OpenMP/FIR/delayed-privatization-firstprivate.f9029
-rw-r--r--flang/test/Lower/OpenMP/FIR/delayed-privatization-private.f9038
-rw-r--r--flang/test/Lower/OpenMP/FIR/threadprivate-use-association-2.f902
-rw-r--r--flang/test/Lower/OpenMP/delayed-privatization-firstprivate.f9029
-rw-r--r--flang/test/Lower/OpenMP/delayed-privatization-private-firstprivate.f9034
-rw-r--r--flang/test/Lower/OpenMP/delayed-privatization-private.f9028
-rw-r--r--flang/test/Lower/OpenMP/delayed-privatization-reduction.f9030
-rw-r--r--flang/test/Lower/OpenMP/threadprivate-commonblock-use.f902
-rw-r--r--flang/test/Lower/OpenMP/threadprivate-use-association-2-hlfir.f902
-rw-r--r--flang/test/Lower/PowerPC/ppc-vector-types.f9010
-rw-r--r--flang/test/Lower/array-temp.f902
-rw-r--r--flang/test/Lower/dummy-arguments.f902
-rw-r--r--flang/test/Lower/dummy-procedure-character.f904
-rw-r--r--flang/test/Lower/equivalence-with-host-assoc.f9016
-rw-r--r--flang/test/Lower/explicit-interface-results-2.f9012
-rw-r--r--flang/test/Lower/forall/array-constructor.f908
-rw-r--r--flang/test/Lower/forall/character-1.f902
-rw-r--r--flang/test/Lower/global-initialization.f909
-rw-r--r--flang/test/Lower/host-associated-functions.f9016
-rw-r--r--flang/test/Lower/host-associated-globals.f9010
-rw-r--r--flang/test/Lower/host-associated.f9064
-rw-r--r--flang/test/Lower/module-and-internal-proc.f904
-rw-r--r--flang/test/Lower/parent-component.f9018
-rw-r--r--flang/test/Lower/polymorphic.f904
-rw-r--r--flang/test/Lower/program-units-fir-mangling.f9018
-rw-r--r--flang/test/lit.cfg.py14
-rw-r--r--flang/test/lit.site.cfg.py.in1
-rw-r--r--libc/benchmarks/automemcpy/lib/CMakeLists.txt3
-rw-r--r--libc/cmake/modules/LLVMLibCCompileOptionRules.cmake1
-rw-r--r--libc/cmake/modules/LLVMLibCObjectRules.cmake6
-rw-r--r--libc/cmake/modules/LLVMLibCTestRules.cmake12
-rw-r--r--libc/cmake/modules/compiler_features/check_fixed_point.cpp2
-rw-r--r--libc/config/config.json4
-rw-r--r--libc/config/gpu/entrypoints.txt2
-rw-r--r--libc/config/linux/aarch64/entrypoints.txt6
-rw-r--r--libc/config/linux/arm/entrypoints.txt3
-rw-r--r--libc/config/linux/riscv/entrypoints.txt6
-rw-r--r--libc/config/linux/x86_64/entrypoints.txt27
-rw-r--r--libc/docs/configure.rst1
-rw-r--r--libc/docs/dev/printf_behavior.rst12
-rw-r--r--libc/docs/dev/undefined_behavior.rst3
-rw-r--r--libc/docs/math/index.rst12
-rw-r--r--libc/docs/math/stdfix.rst2
-rw-r--r--libc/docs/stdbit.rst36
-rw-r--r--libc/fuzzing/stdio/CMakeLists.txt14
-rw-r--r--libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp133
-rw-r--r--libc/include/llvm-libc-macros/CMakeLists.txt2
-rw-r--r--libc/include/llvm-libc-macros/math-macros.h13
-rw-r--r--libc/include/llvm-libc-macros/stdbit-macros.h62
-rw-r--r--libc/include/llvm-libc-types/float128.h2
-rw-r--r--libc/spec/spec.td1
-rw-r--r--libc/spec/stdc.td29
-rw-r--r--libc/spec/stdc_ext.td8
-rw-r--r--libc/src/__support/CPP/limits.h2
-rw-r--r--libc/src/__support/CPP/type_traits/is_fixed_point.h2
-rw-r--r--libc/src/__support/FPUtil/ManipulationFunctions.h86
-rw-r--r--libc/src/__support/FPUtil/dyadic_float.h5
-rw-r--r--libc/src/__support/HashTable/table.h2
-rw-r--r--libc/src/__support/RPC/rpc_util.h17
-rw-r--r--libc/src/__support/fixed_point/CMakeLists.txt13
-rw-r--r--libc/src/__support/fixed_point/fx_bits.h2
-rw-r--r--libc/src/__support/fixed_point/fx_rep.h2
-rw-r--r--libc/src/__support/fixed_point/sqrt.h129
-rw-r--r--libc/src/__support/macros/properties/float.h4
-rw-r--r--libc/src/__support/memory_size.h6
-rw-r--r--libc/src/__support/threads/CMakeLists.txt6
-rw-r--r--libc/src/__support/threads/sleep.h34
-rw-r--r--libc/src/math/CMakeLists.txt33
-rw-r--r--libc/src/math/amdgpu/CMakeLists.txt (renamed from libc/src/math/gpu/vendor/CMakeLists.txt)563
-rw-r--r--libc/src/math/amdgpu/acos.cpp18
-rw-r--r--libc/src/math/amdgpu/acosf.cpp18
-rw-r--r--libc/src/math/amdgpu/acosh.cpp18
-rw-r--r--libc/src/math/amdgpu/acoshf.cpp18
-rw-r--r--libc/src/math/amdgpu/asin.cpp18
-rw-r--r--libc/src/math/amdgpu/asinf.cpp18
-rw-r--r--libc/src/math/amdgpu/asinh.cpp18
-rw-r--r--libc/src/math/amdgpu/asinhf.cpp18
-rw-r--r--libc/src/math/amdgpu/atan.cpp18
-rw-r--r--libc/src/math/amdgpu/atan2.cpp20
-rw-r--r--libc/src/math/amdgpu/atan2f.cpp20
-rw-r--r--libc/src/math/amdgpu/atanf.cpp18
-rw-r--r--libc/src/math/amdgpu/atanh.cpp18
-rw-r--r--libc/src/math/amdgpu/atanhf.cpp18
-rw-r--r--libc/src/math/amdgpu/ceil.cpp (renamed from libc/src/math/gpu/ceil.cpp)0
-rw-r--r--libc/src/math/amdgpu/ceilf.cpp (renamed from libc/src/math/gpu/ceilf.cpp)0
-rw-r--r--libc/src/math/amdgpu/copysign.cpp (renamed from libc/src/math/gpu/copysign.cpp)0
-rw-r--r--libc/src/math/amdgpu/copysignf.cpp (renamed from libc/src/math/gpu/copysignf.cpp)0
-rw-r--r--libc/src/math/amdgpu/cos.cpp18
-rw-r--r--libc/src/math/amdgpu/cosf.cpp18
-rw-r--r--libc/src/math/amdgpu/cosh.cpp18
-rw-r--r--libc/src/math/amdgpu/coshf.cpp18
-rw-r--r--libc/src/math/amdgpu/declarations.h (renamed from libc/src/math/gpu/vendor/amdgpu/declarations.h)2
-rw-r--r--libc/src/math/amdgpu/erf.cpp18
-rw-r--r--libc/src/math/amdgpu/erff.cpp18
-rw-r--r--libc/src/math/amdgpu/exp.cpp (renamed from libc/src/math/gpu/vendor/exp.cpp)4
-rw-r--r--libc/src/math/amdgpu/exp10.cpp18
-rw-r--r--libc/src/math/amdgpu/exp10f.cpp18
-rw-r--r--libc/src/math/amdgpu/exp2.cpp18
-rw-r--r--libc/src/math/amdgpu/exp2f.cpp18
-rw-r--r--libc/src/math/amdgpu/expf.cpp (renamed from libc/src/math/gpu/vendor/expf.cpp)4
-rw-r--r--libc/src/math/amdgpu/expm1.cpp18
-rw-r--r--libc/src/math/amdgpu/expm1f.cpp18
-rw-r--r--libc/src/math/amdgpu/fabs.cpp (renamed from libc/src/math/gpu/fabs.cpp)0
-rw-r--r--libc/src/math/amdgpu/fabsf.cpp (renamed from libc/src/math/gpu/fabsf.cpp)0
-rw-r--r--libc/src/math/amdgpu/fdim.cpp20
-rw-r--r--libc/src/math/amdgpu/fdimf.cpp20
-rw-r--r--libc/src/math/amdgpu/floor.cpp (renamed from libc/src/math/gpu/floor.cpp)0
-rw-r--r--libc/src/math/amdgpu/floorf.cpp (renamed from libc/src/math/gpu/floorf.cpp)0
-rw-r--r--libc/src/math/amdgpu/fma.cpp (renamed from libc/src/math/gpu/fma.cpp)0
-rw-r--r--libc/src/math/amdgpu/fmaf.cpp (renamed from libc/src/math/gpu/fmaf.cpp)0
-rw-r--r--libc/src/math/amdgpu/fmax.cpp25
-rw-r--r--libc/src/math/amdgpu/fmaxf.cpp25
-rw-r--r--libc/src/math/amdgpu/fmin.cpp25
-rw-r--r--libc/src/math/amdgpu/fminf.cpp25
-rw-r--r--libc/src/math/amdgpu/fmod.cpp (renamed from libc/src/math/gpu/fmod.cpp)0
-rw-r--r--libc/src/math/amdgpu/fmodf.cpp (renamed from libc/src/math/gpu/fmodf.cpp)0
-rw-r--r--libc/src/math/amdgpu/frexp.cpp20
-rw-r--r--libc/src/math/amdgpu/frexpf.cpp20
-rw-r--r--libc/src/math/amdgpu/hypot.cpp20
-rw-r--r--libc/src/math/amdgpu/hypotf.cpp20
-rw-r--r--libc/src/math/amdgpu/ilogb.cpp18
-rw-r--r--libc/src/math/amdgpu/ilogbf.cpp18
-rw-r--r--libc/src/math/amdgpu/ldexp.cpp20
-rw-r--r--libc/src/math/amdgpu/ldexpf.cpp20
-rw-r--r--libc/src/math/amdgpu/llrint.cpp (renamed from libc/src/math/gpu/vendor/llrint.cpp)4
-rw-r--r--libc/src/math/amdgpu/llrintf.cpp (renamed from libc/src/math/gpu/vendor/llrintf.cpp)4
-rw-r--r--libc/src/math/amdgpu/llround.cpp (renamed from libc/src/math/gpu/llround.cpp)0
-rw-r--r--libc/src/math/amdgpu/llroundf.cpp (renamed from libc/src/math/gpu/llroundf.cpp)0
-rw-r--r--libc/src/math/amdgpu/log.cpp18
-rw-r--r--libc/src/math/amdgpu/log10.cpp18
-rw-r--r--libc/src/math/amdgpu/log10f.cpp18
-rw-r--r--libc/src/math/amdgpu/log1p.cpp18
-rw-r--r--libc/src/math/amdgpu/log1pf.cpp18
-rw-r--r--libc/src/math/amdgpu/log2.cpp18
-rw-r--r--libc/src/math/amdgpu/log2f.cpp18
-rw-r--r--libc/src/math/amdgpu/logb.cpp18
-rw-r--r--libc/src/math/amdgpu/logbf.cpp18
-rw-r--r--libc/src/math/amdgpu/logf.cpp18
-rw-r--r--libc/src/math/amdgpu/lrint.cpp20
-rw-r--r--libc/src/math/amdgpu/lrintf.cpp20
-rw-r--r--libc/src/math/amdgpu/lround.cpp (renamed from libc/src/math/gpu/lround.cpp)0
-rw-r--r--libc/src/math/amdgpu/lroundf.cpp (renamed from libc/src/math/gpu/lroundf.cpp)0
-rw-r--r--libc/src/math/amdgpu/modf.cpp (renamed from libc/src/math/gpu/modf.cpp)0
-rw-r--r--libc/src/math/amdgpu/modff.cpp (renamed from libc/src/math/gpu/modff.cpp)0
-rw-r--r--libc/src/math/amdgpu/nearbyint.cpp (renamed from libc/src/math/gpu/nearbyint.cpp)0
-rw-r--r--libc/src/math/amdgpu/nearbyintf.cpp (renamed from libc/src/math/gpu/nearbyintf.cpp)0
-rw-r--r--libc/src/math/amdgpu/nextafter.cpp20
-rw-r--r--libc/src/math/amdgpu/nextafterf.cpp20
-rw-r--r--libc/src/math/amdgpu/platform.h (renamed from libc/src/math/gpu/vendor/amdgpu/platform.h)0
-rw-r--r--libc/src/math/amdgpu/pow.cpp (renamed from libc/src/math/gpu/vendor/pow.cpp)4
-rw-r--r--libc/src/math/amdgpu/powf.cpp (renamed from libc/src/math/gpu/vendor/powf.cpp)4
-rw-r--r--libc/src/math/amdgpu/remainder.cpp (renamed from libc/src/math/gpu/remainder.cpp)0
-rw-r--r--libc/src/math/amdgpu/remainderf.cpp (renamed from libc/src/math/gpu/remainderf.cpp)0
-rw-r--r--libc/src/math/amdgpu/remquo.cpp23
-rw-r--r--libc/src/math/amdgpu/remquof.cpp23
-rw-r--r--libc/src/math/amdgpu/rint.cpp (renamed from libc/src/math/gpu/rint.cpp)0
-rw-r--r--libc/src/math/amdgpu/rintf.cpp (renamed from libc/src/math/gpu/rintf.cpp)0
-rw-r--r--libc/src/math/amdgpu/round.cpp (renamed from libc/src/math/gpu/round.cpp)0
-rw-r--r--libc/src/math/amdgpu/roundf.cpp (renamed from libc/src/math/gpu/roundf.cpp)0
-rw-r--r--libc/src/math/amdgpu/scalbn.cpp20
-rw-r--r--libc/src/math/amdgpu/scalbnf.cpp20
-rw-r--r--libc/src/math/amdgpu/sin.cpp18
-rw-r--r--libc/src/math/amdgpu/sincos.cpp20
-rw-r--r--libc/src/math/amdgpu/sincosf.cpp (renamed from libc/src/math/gpu/vendor/sincosf.cpp)4
-rw-r--r--libc/src/math/amdgpu/sinf.cpp18
-rw-r--r--libc/src/math/amdgpu/sinh.cpp (renamed from libc/src/math/gpu/sinh.cpp)6
-rw-r--r--libc/src/math/amdgpu/sinhf.cpp18
-rw-r--r--libc/src/math/amdgpu/sqrt.cpp (renamed from libc/src/math/gpu/sqrt.cpp)0
-rw-r--r--libc/src/math/amdgpu/sqrtf.cpp (renamed from libc/src/math/gpu/sqrtf.cpp)0
-rw-r--r--libc/src/math/amdgpu/tan.cpp (renamed from libc/src/math/gpu/tan.cpp)6
-rw-r--r--libc/src/math/amdgpu/tanf.cpp18
-rw-r--r--libc/src/math/amdgpu/tanh.cpp (renamed from libc/src/math/gpu/tanh.cpp)6
-rw-r--r--libc/src/math/amdgpu/tanhf.cpp18
-rw-r--r--libc/src/math/amdgpu/tgamma.cpp18
-rw-r--r--libc/src/math/amdgpu/tgammaf.cpp18
-rw-r--r--libc/src/math/amdgpu/trunc.cpp (renamed from libc/src/math/gpu/trunc.cpp)0
-rw-r--r--libc/src/math/amdgpu/truncf.cpp (renamed from libc/src/math/gpu/truncf.cpp)0
-rw-r--r--libc/src/math/generic/CMakeLists.txt98
-rw-r--r--libc/src/math/generic/ilogb.cpp2
-rw-r--r--libc/src/math/generic/ilogbf.cpp2
-rw-r--r--libc/src/math/generic/ilogbf128.cpp19
-rw-r--r--libc/src/math/generic/ilogbl.cpp4
-rw-r--r--libc/src/math/generic/llogb.cpp17
-rw-r--r--libc/src/math/generic/llogbf.cpp17
-rw-r--r--libc/src/math/generic/llogbf128.cpp19
-rw-r--r--libc/src/math/generic/llogbl.cpp19
-rw-r--r--libc/src/math/generic/logbf.cpp2
-rw-r--r--libc/src/math/generic/logbf128.cpp17
-rw-r--r--libc/src/math/gpu/CMakeLists.txt384
-rw-r--r--libc/src/math/gpu/vendor/amdgpu/amdgpu.h127
-rw-r--r--libc/src/math/gpu/vendor/common.h22
-rw-r--r--libc/src/math/ilogbf128.h20
-rw-r--r--libc/src/math/llogb.h20
-rw-r--r--libc/src/math/llogbf.h20
-rw-r--r--libc/src/math/llogbf128.h20
-rw-r--r--libc/src/math/llogbl.h20
-rw-r--r--libc/src/math/logbf128.h20
-rw-r--r--libc/src/math/nvptx/CMakeLists.txt1179
-rw-r--r--libc/src/math/nvptx/acos.cpp (renamed from libc/src/math/gpu/vendor/acos.cpp)4
-rw-r--r--libc/src/math/nvptx/acosf.cpp (renamed from libc/src/math/gpu/vendor/acosf.cpp)4
-rw-r--r--libc/src/math/nvptx/acosh.cpp (renamed from libc/src/math/gpu/vendor/acosh.cpp)4
-rw-r--r--libc/src/math/nvptx/acoshf.cpp (renamed from libc/src/math/gpu/vendor/acoshf.cpp)4
-rw-r--r--libc/src/math/nvptx/asin.cpp (renamed from libc/src/math/gpu/vendor/asin.cpp)4
-rw-r--r--libc/src/math/nvptx/asinf.cpp (renamed from libc/src/math/gpu/vendor/asinf.cpp)4
-rw-r--r--libc/src/math/nvptx/asinh.cpp (renamed from libc/src/math/gpu/vendor/asinh.cpp)4
-rw-r--r--libc/src/math/nvptx/asinhf.cpp (renamed from libc/src/math/gpu/vendor/asinhf.cpp)4
-rw-r--r--libc/src/math/nvptx/atan.cpp (renamed from libc/src/math/gpu/vendor/atan.cpp)4
-rw-r--r--libc/src/math/nvptx/atan2.cpp (renamed from libc/src/math/gpu/vendor/atan2.cpp)4
-rw-r--r--libc/src/math/nvptx/atan2f.cpp (renamed from libc/src/math/gpu/vendor/atan2f.cpp)4
-rw-r--r--libc/src/math/nvptx/atanf.cpp (renamed from libc/src/math/gpu/vendor/atanf.cpp)4
-rw-r--r--libc/src/math/nvptx/atanh.cpp (renamed from libc/src/math/gpu/vendor/atanh.cpp)4
-rw-r--r--libc/src/math/nvptx/atanhf.cpp (renamed from libc/src/math/gpu/vendor/atanhf.cpp)4
-rw-r--r--libc/src/math/nvptx/ceil.cpp16
-rw-r--r--libc/src/math/nvptx/ceilf.cpp16
-rw-r--r--libc/src/math/nvptx/copysign.cpp18
-rw-r--r--libc/src/math/nvptx/copysignf.cpp18
-rw-r--r--libc/src/math/nvptx/cos.cpp (renamed from libc/src/math/gpu/vendor/cos.cpp)4
-rw-r--r--libc/src/math/nvptx/cosf.cpp (renamed from libc/src/math/gpu/vendor/cosf.cpp)4
-rw-r--r--libc/src/math/nvptx/cosh.cpp (renamed from libc/src/math/gpu/vendor/cosh.cpp)4
-rw-r--r--libc/src/math/nvptx/coshf.cpp (renamed from libc/src/math/gpu/vendor/coshf.cpp)4
-rw-r--r--libc/src/math/nvptx/declarations.h (renamed from libc/src/math/gpu/vendor/nvptx/declarations.h)0
-rw-r--r--libc/src/math/nvptx/erf.cpp (renamed from libc/src/math/gpu/vendor/erf.cpp)4
-rw-r--r--libc/src/math/nvptx/erff.cpp (renamed from libc/src/math/gpu/vendor/erff.cpp)4
-rw-r--r--libc/src/math/nvptx/exp.cpp18
-rw-r--r--libc/src/math/nvptx/exp10.cpp (renamed from libc/src/math/gpu/vendor/exp10.cpp)4
-rw-r--r--libc/src/math/nvptx/exp10f.cpp (renamed from libc/src/math/gpu/vendor/exp10f.cpp)4
-rw-r--r--libc/src/math/nvptx/exp2.cpp (renamed from libc/src/math/gpu/vendor/exp2.cpp)4
-rw-r--r--libc/src/math/nvptx/exp2f.cpp (renamed from libc/src/math/gpu/vendor/exp2f.cpp)4
-rw-r--r--libc/src/math/nvptx/expf.cpp18
-rw-r--r--libc/src/math/nvptx/expm1.cpp (renamed from libc/src/math/gpu/vendor/expm1.cpp)4
-rw-r--r--libc/src/math/nvptx/expm1f.cpp (renamed from libc/src/math/gpu/vendor/expm1f.cpp)4
-rw-r--r--libc/src/math/nvptx/fabs.cpp16
-rw-r--r--libc/src/math/nvptx/fabsf.cpp16
-rw-r--r--libc/src/math/nvptx/fdim.cpp (renamed from libc/src/math/gpu/vendor/fdim.cpp)4
-rw-r--r--libc/src/math/nvptx/fdimf.cpp (renamed from libc/src/math/gpu/vendor/fdimf.cpp)4
-rw-r--r--libc/src/math/nvptx/floor.cpp16
-rw-r--r--libc/src/math/nvptx/floorf.cpp16
-rw-r--r--libc/src/math/nvptx/fma.cpp18
-rw-r--r--libc/src/math/nvptx/fmaf.cpp18
-rw-r--r--libc/src/math/nvptx/fmax.cpp25
-rw-r--r--libc/src/math/nvptx/fmaxf.cpp25
-rw-r--r--libc/src/math/nvptx/fmin.cpp25
-rw-r--r--libc/src/math/nvptx/fminf.cpp25
-rw-r--r--libc/src/math/nvptx/fmod.cpp (renamed from libc/src/math/gpu/fmaxf.cpp)8
-rw-r--r--libc/src/math/nvptx/fmodf.cpp (renamed from libc/src/math/gpu/fmin.cpp)8
-rw-r--r--libc/src/math/nvptx/frexp.cpp (renamed from libc/src/math/gpu/vendor/frexp.cpp)4
-rw-r--r--libc/src/math/nvptx/frexpf.cpp (renamed from libc/src/math/gpu/vendor/frexpf.cpp)4
-rw-r--r--libc/src/math/nvptx/hypot.cpp (renamed from libc/src/math/gpu/vendor/hypot.cpp)4
-rw-r--r--libc/src/math/nvptx/hypotf.cpp (renamed from libc/src/math/gpu/vendor/hypotf.cpp)4
-rw-r--r--libc/src/math/nvptx/ilogb.cpp (renamed from libc/src/math/gpu/vendor/ilogb.cpp)4
-rw-r--r--libc/src/math/nvptx/ilogbf.cpp (renamed from libc/src/math/gpu/vendor/ilogbf.cpp)4
-rw-r--r--libc/src/math/nvptx/ldexp.cpp (renamed from libc/src/math/gpu/vendor/ldexp.cpp)4
-rw-r--r--libc/src/math/nvptx/ldexpf.cpp (renamed from libc/src/math/gpu/vendor/ldexpf.cpp)4
-rw-r--r--libc/src/math/nvptx/llrint.cpp18
-rw-r--r--libc/src/math/nvptx/llrintf.cpp18
-rw-r--r--libc/src/math/nvptx/llround.cpp (renamed from libc/src/math/gpu/fminf.cpp)8
-rw-r--r--libc/src/math/nvptx/llroundf.cpp18
-rw-r--r--libc/src/math/nvptx/log.cpp (renamed from libc/src/math/gpu/vendor/log.cpp)4
-rw-r--r--libc/src/math/nvptx/log10.cpp (renamed from libc/src/math/gpu/vendor/log10.cpp)4
-rw-r--r--libc/src/math/nvptx/log10f.cpp (renamed from libc/src/math/gpu/vendor/log10f.cpp)4
-rw-r--r--libc/src/math/nvptx/log1p.cpp (renamed from libc/src/math/gpu/vendor/log1p.cpp)4
-rw-r--r--libc/src/math/nvptx/log1pf.cpp (renamed from libc/src/math/gpu/vendor/log1pf.cpp)4
-rw-r--r--libc/src/math/nvptx/log2.cpp (renamed from libc/src/math/gpu/vendor/log2.cpp)4
-rw-r--r--libc/src/math/nvptx/log2f.cpp (renamed from libc/src/math/gpu/vendor/log2f.cpp)4
-rw-r--r--libc/src/math/nvptx/logb.cpp (renamed from libc/src/math/gpu/vendor/logb.cpp)4
-rw-r--r--libc/src/math/nvptx/logbf.cpp (renamed from libc/src/math/gpu/vendor/logbf.cpp)4
-rw-r--r--libc/src/math/nvptx/logf.cpp (renamed from libc/src/math/gpu/vendor/logf.cpp)4
-rw-r--r--libc/src/math/nvptx/lrint.cpp (renamed from libc/src/math/gpu/vendor/lrint.cpp)4
-rw-r--r--libc/src/math/nvptx/lrintf.cpp (renamed from libc/src/math/gpu/vendor/lrintf.cpp)4
-rw-r--r--libc/src/math/nvptx/lround.cpp16
-rw-r--r--libc/src/math/nvptx/lroundf.cpp16
-rw-r--r--libc/src/math/nvptx/modf.cpp (renamed from libc/src/math/gpu/fmax.cpp)8
-rw-r--r--libc/src/math/nvptx/modff.cpp18
-rw-r--r--libc/src/math/nvptx/nearbyint.cpp18
-rw-r--r--libc/src/math/nvptx/nearbyintf.cpp18
-rw-r--r--libc/src/math/nvptx/nextafter.cpp (renamed from libc/src/math/gpu/vendor/nextafter.cpp)4
-rw-r--r--libc/src/math/nvptx/nextafterf.cpp (renamed from libc/src/math/gpu/vendor/nextafterf.cpp)4
-rw-r--r--libc/src/math/nvptx/nvptx.h (renamed from libc/src/math/gpu/vendor/nvptx/nvptx.h)0
-rw-r--r--libc/src/math/nvptx/pow.cpp18
-rw-r--r--libc/src/math/nvptx/powf.cpp18
-rw-r--r--libc/src/math/nvptx/remainder.cpp18
-rw-r--r--libc/src/math/nvptx/remainderf.cpp18
-rw-r--r--libc/src/math/nvptx/remquo.cpp (renamed from libc/src/math/gpu/vendor/remquo.cpp)4
-rw-r--r--libc/src/math/nvptx/remquof.cpp (renamed from libc/src/math/gpu/vendor/remquof.cpp)4
-rw-r--r--libc/src/math/nvptx/rint.cpp16
-rw-r--r--libc/src/math/nvptx/rintf.cpp16
-rw-r--r--libc/src/math/nvptx/round.cpp16
-rw-r--r--libc/src/math/nvptx/roundf.cpp16
-rw-r--r--libc/src/math/nvptx/scalbn.cpp (renamed from libc/src/math/gpu/vendor/scalbn.cpp)4
-rw-r--r--libc/src/math/nvptx/scalbnf.cpp (renamed from libc/src/math/gpu/vendor/scalbnf.cpp)4
-rw-r--r--libc/src/math/nvptx/sin.cpp (renamed from libc/src/math/gpu/vendor/sin.cpp)4
-rw-r--r--libc/src/math/nvptx/sincos.cpp (renamed from libc/src/math/gpu/vendor/sincos.cpp)4
-rw-r--r--libc/src/math/nvptx/sincosf.cpp20
-rw-r--r--libc/src/math/nvptx/sinf.cpp (renamed from libc/src/math/gpu/vendor/sinf.cpp)4
-rw-r--r--libc/src/math/nvptx/sinh.cpp (renamed from libc/src/math/gpu/vendor/sinh.cpp)4
-rw-r--r--libc/src/math/nvptx/sinhf.cpp (renamed from libc/src/math/gpu/vendor/sinhf.cpp)4
-rw-r--r--libc/src/math/nvptx/sqrt.cpp16
-rw-r--r--libc/src/math/nvptx/sqrtf.cpp16
-rw-r--r--libc/src/math/nvptx/tan.cpp (renamed from libc/src/math/gpu/vendor/tan.cpp)4
-rw-r--r--libc/src/math/nvptx/tanf.cpp (renamed from libc/src/math/gpu/vendor/tanf.cpp)4
-rw-r--r--libc/src/math/nvptx/tanh.cpp (renamed from libc/src/math/gpu/vendor/tanh.cpp)4
-rw-r--r--libc/src/math/nvptx/tanhf.cpp (renamed from libc/src/math/gpu/vendor/tanhf.cpp)4
-rw-r--r--libc/src/math/nvptx/tgamma.cpp (renamed from libc/src/math/gpu/vendor/tgamma.cpp)4
-rw-r--r--libc/src/math/nvptx/tgammaf.cpp (renamed from libc/src/math/gpu/vendor/tgammaf.cpp)4
-rw-r--r--libc/src/math/nvptx/trunc.cpp16
-rw-r--r--libc/src/math/nvptx/truncf.cpp16
-rw-r--r--libc/src/stdbit/CMakeLists.txt3
-rw-r--r--libc/src/stdbit/stdc_count_ones_uc.cpp20
-rw-r--r--libc/src/stdbit/stdc_count_ones_uc.h18
-rw-r--r--libc/src/stdbit/stdc_count_ones_ui.cpp20
-rw-r--r--libc/src/stdbit/stdc_count_ones_ui.h18
-rw-r--r--libc/src/stdbit/stdc_count_ones_ul.cpp20
-rw-r--r--libc/src/stdbit/stdc_count_ones_ul.h18
-rw-r--r--libc/src/stdbit/stdc_count_ones_ull.cpp20
-rw-r--r--libc/src/stdbit/stdc_count_ones_ull.h18
-rw-r--r--libc/src/stdbit/stdc_count_ones_us.cpp20
-rw-r--r--libc/src/stdbit/stdc_count_ones_us.h18
-rw-r--r--libc/src/stdbit/stdc_count_zeros_uc.cpp20
-rw-r--r--libc/src/stdbit/stdc_count_zeros_uc.h18
-rw-r--r--libc/src/stdbit/stdc_count_zeros_ui.cpp20
-rw-r--r--libc/src/stdbit/stdc_count_zeros_ui.h18
-rw-r--r--libc/src/stdbit/stdc_count_zeros_ul.cpp20
-rw-r--r--libc/src/stdbit/stdc_count_zeros_ul.h18
-rw-r--r--libc/src/stdbit/stdc_count_zeros_ull.cpp20
-rw-r--r--libc/src/stdbit/stdc_count_zeros_ull.h18
-rw-r--r--libc/src/stdbit/stdc_count_zeros_us.cpp20
-rw-r--r--libc/src/stdbit/stdc_count_zeros_us.h18
-rw-r--r--libc/src/stdbit/stdc_has_single_bit_uc.cpp20
-rw-r--r--libc/src/stdbit/stdc_has_single_bit_uc.h18
-rw-r--r--libc/src/stdbit/stdc_has_single_bit_ui.cpp20
-rw-r--r--libc/src/stdbit/stdc_has_single_bit_ui.h18
-rw-r--r--libc/src/stdbit/stdc_has_single_bit_ul.cpp20
-rw-r--r--libc/src/stdbit/stdc_has_single_bit_ul.h18
-rw-r--r--libc/src/stdbit/stdc_has_single_bit_ull.cpp20
-rw-r--r--libc/src/stdbit/stdc_has_single_bit_ull.h18
-rw-r--r--libc/src/stdbit/stdc_has_single_bit_us.cpp20
-rw-r--r--libc/src/stdbit/stdc_has_single_bit_us.h18
-rw-r--r--libc/src/stdfix/CMakeLists.txt15
-rw-r--r--libc/src/stdfix/abshk.h2
-rw-r--r--libc/src/stdfix/abshr.h2
-rw-r--r--libc/src/stdfix/absk.h2
-rw-r--r--libc/src/stdfix/abslk.h2
-rw-r--r--libc/src/stdfix/abslr.h2
-rw-r--r--libc/src/stdfix/absr.h2
-rw-r--r--libc/src/stdfix/roundhk.h2
-rw-r--r--libc/src/stdfix/roundhr.h2
-rw-r--r--libc/src/stdfix/roundk.h2
-rw-r--r--libc/src/stdfix/roundlk.h2
-rw-r--r--libc/src/stdfix/roundlr.h2
-rw-r--r--libc/src/stdfix/roundr.h2
-rw-r--r--libc/src/stdfix/rounduhk.h2
-rw-r--r--libc/src/stdfix/rounduhr.h2
-rw-r--r--libc/src/stdfix/rounduk.h2
-rw-r--r--libc/src/stdfix/roundulk.h2
-rw-r--r--libc/src/stdfix/roundulr.h2
-rw-r--r--libc/src/stdfix/roundur.h2
-rw-r--r--libc/src/stdfix/sqrtuhk.cpp19
-rw-r--r--libc/src/stdfix/sqrtuhk.h20
-rw-r--r--libc/src/stdfix/sqrtuhr.cpp19
-rw-r--r--libc/src/stdfix/sqrtuhr.h20
-rw-r--r--libc/src/stdfix/sqrtuk.cpp19
-rw-r--r--libc/src/stdfix/sqrtuk.h20
-rw-r--r--libc/src/stdfix/sqrtulr.cpp19
-rw-r--r--libc/src/stdfix/sqrtulr.h20
-rw-r--r--libc/src/stdfix/sqrtur.cpp19
-rw-r--r--libc/src/stdfix/sqrtur.h20
-rw-r--r--libc/src/stdio/printf_core/CMakeLists.txt4
-rw-r--r--libc/src/stdio/printf_core/converter.cpp8
-rw-r--r--libc/src/stdio/printf_core/converter_atlas.h5
-rw-r--r--libc/src/stdio/printf_core/converter_utils.h3
-rw-r--r--libc/src/stdio/printf_core/core_structs.h24
-rw-r--r--libc/src/stdio/printf_core/fixed_converter.h309
-rw-r--r--libc/src/stdio/printf_core/float_dec_converter.h3
-rw-r--r--libc/src/stdio/printf_core/parser.h65
-rw-r--r--libc/src/stdio/printf_core/printf_config.h7
-rw-r--r--libc/src/sys/epoll/epoll_pwait.h4
-rw-r--r--libc/src/sys/epoll/epoll_pwait2.h6
-rw-r--r--libc/src/sys/epoll/epoll_wait.h2
-rw-r--r--libc/src/sys/epoll/linux/epoll_pwait.cpp4
-rw-r--r--libc/src/sys/epoll/linux/epoll_pwait2.cpp6
-rw-r--r--libc/src/sys/epoll/linux/epoll_wait.cpp4
-rw-r--r--libc/test/UnitTest/CMakeLists.txt3
-rw-r--r--libc/test/UnitTest/LibcTest.cpp2
-rw-r--r--libc/test/include/stdbit_test.cpp41
-rw-r--r--libc/test/include/stdckdint_test.cpp2
-rw-r--r--libc/test/integration/startup/CMakeLists.txt1
-rw-r--r--libc/test/integration/startup/gpu/rpc_interface_test.cpp2
-rw-r--r--libc/test/integration/startup/gpu/rpc_stream_test.cpp2
-rw-r--r--libc/test/integration/startup/gpu/rpc_test.cpp2
-rw-r--r--libc/test/src/__support/CMakeLists.txt23
-rw-r--r--libc/test/src/__support/fixed_point/fx_bits_test.cpp2
-rw-r--r--libc/test/src/__support/memory_size_test.cpp7
-rw-r--r--libc/test/src/math/CMakeLists.txt281
-rw-r--r--libc/test/src/math/differential_testing/CMakeLists.txt1
-rw-r--r--libc/test/src/math/smoke/CMakeLists.txt481
-rw-r--r--libc/test/src/math/smoke/ILogbTest.h117
-rw-r--r--libc/test/src/math/smoke/LogbTest.h8
-rw-r--r--libc/test/src/math/smoke/ilogb_test.cpp25
-rw-r--r--libc/test/src/math/smoke/ilogbf128_test.cpp13
-rw-r--r--libc/test/src/math/smoke/ilogbf_test.cpp25
-rw-r--r--libc/test/src/math/smoke/ilogbl_test.cpp25
-rw-r--r--libc/test/src/math/smoke/llogb_test.cpp13
-rw-r--r--libc/test/src/math/smoke/llogbf128_test.cpp13
-rw-r--r--libc/test/src/math/smoke/llogbf_test.cpp13
-rw-r--r--libc/test/src/math/smoke/llogbl_test.cpp13
-rw-r--r--libc/test/src/math/smoke/logbf128_test.cpp13
-rw-r--r--libc/test/src/stdbit/CMakeLists.txt3
-rw-r--r--libc/test/src/stdbit/stdc_count_ones_uc_test.cpp21
-rw-r--r--libc/test/src/stdbit/stdc_count_ones_ui_test.cpp21
-rw-r--r--libc/test/src/stdbit/stdc_count_ones_ul_test.cpp21
-rw-r--r--libc/test/src/stdbit/stdc_count_ones_ull_test.cpp21
-rw-r--r--libc/test/src/stdbit/stdc_count_ones_us_test.cpp21
-rw-r--r--libc/test/src/stdbit/stdc_count_zeros_uc_test.cpp21
-rw-r--r--libc/test/src/stdbit/stdc_count_zeros_ui_test.cpp21
-rw-r--r--libc/test/src/stdbit/stdc_count_zeros_ul_test.cpp21
-rw-r--r--libc/test/src/stdbit/stdc_count_zeros_ull_test.cpp21
-rw-r--r--libc/test/src/stdbit/stdc_count_zeros_us_test.cpp21
-rw-r--r--libc/test/src/stdbit/stdc_has_single_bit_uc_test.cpp20
-rw-r--r--libc/test/src/stdbit/stdc_has_single_bit_ui_test.cpp20
-rw-r--r--libc/test/src/stdbit/stdc_has_single_bit_ul_test.cpp20
-rw-r--r--libc/test/src/stdbit/stdc_has_single_bit_ull_test.cpp20
-rw-r--r--libc/test/src/stdbit/stdc_has_single_bit_us_test.cpp20
-rw-r--r--libc/test/src/stdfix/CMakeLists.txt22
-rw-r--r--libc/test/src/stdfix/SqrtTest.h66
-rw-r--r--libc/test/src/stdfix/sqrtuhk_test.cpp13
-rw-r--r--libc/test/src/stdfix/sqrtuhr_test.cpp13
-rw-r--r--libc/test/src/stdfix/sqrtuk_test.cpp13
-rw-r--r--libc/test/src/stdfix/sqrtulr_test.cpp13
-rw-r--r--libc/test/src/stdfix/sqrtur_test.cpp13
-rw-r--r--libc/test/src/stdio/CMakeLists.txt3
-rw-r--r--libc/test/src/stdio/sprintf_test.cpp211
-rw-r--r--libc/test/src/stdlib/CMakeLists.txt50
-rw-r--r--libc/test/src/time/CMakeLists.txt26
-rw-r--r--libc/utils/LibcTableGenUtil/CMakeLists.txt2
-rw-r--r--libc/utils/gpu/loader/Loader.h2
-rw-r--r--libcxx/cmake/caches/AndroidNDK.cmake4
-rw-r--r--libcxx/docs/ReleaseNotes/19.rst10
-rw-r--r--libcxx/include/__atomic/aliases.h2
-rw-r--r--libcxx/include/__atomic/cxx_atomic_impl.h291
-rw-r--r--libcxx/include/__availability54
-rw-r--r--libcxx/include/__config3
-rw-r--r--libcxx/include/scoped_allocator22
-rw-r--r--libcxx/lib/abi/CHANGELOG.TXT65
-rw-r--r--libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist98
-rw-r--r--libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist98
-rw-r--r--libcxx/modules/modules.json.in2
-rw-r--r--libcxx/modules/std/atomic.inc2
-rw-r--r--libcxx/src/CMakeLists.txt9
-rw-r--r--libcxx/test/CMakeLists.txt5
-rw-r--r--libcxx/test/std/utilities/allocator.adaptor/base-is-uglified.compile.pass.cpp27
-rwxr-xr-xlibcxx/utils/ci/run-buildbot2
-rw-r--r--libcxxabi/lib/cxxabiv1.exp38
-rw-r--r--libcxxabi/lib/fundamental-types.exp153
-rw-r--r--libcxxabi/lib/itanium-base.exp325
-rw-r--r--libcxxabi/lib/itanium-exceptions.exp (renamed from libcxxabi/lib/exceptions.exp)1
-rw-r--r--libcxxabi/lib/new-delete.exp1
-rw-r--r--libcxxabi/lib/std-exceptions.exp (renamed from libcxx/lib/libc++abi.exp)350
-rw-r--r--libcxxabi/lib/std-misc.exp9
-rw-r--r--libcxxabi/src/CMakeLists.txt38
-rw-r--r--libcxxabi/src/demangle/ItaniumDemangle.h2
-rw-r--r--libcxxabi/test/CMakeLists.txt5
-rw-r--r--libcxxabi/test/test_demangle.pass.cpp7
-rw-r--r--libcxxabi/test/uncaught_exception.pass.cpp7
-rw-r--r--lld/ELF/Writer.cpp5
-rw-r--r--lld/MachO/Driver.cpp19
-rw-r--r--lld/MachO/Options.td2
-rw-r--r--lld/test/ELF/allow-shlib-undefined.s16
-rw-r--r--lld/test/ELF/unresolved-symbols.s2
-rw-r--r--lld/test/ELF/wrap-shlib-undefined.s2
-rw-r--r--lld/test/MachO/lc-build-version.s7
-rw-r--r--lld/test/MachO/platform-version.s2
-rw-r--r--lldb/bindings/interface/SBTargetExtensions.i2
-rw-r--r--lldb/include/lldb/Interpreter/CommandObject.h20
-rw-r--r--lldb/include/lldb/Interpreter/Options.h2
-rw-r--r--lldb/include/lldb/Utility/ArchSpec.h5
-rw-r--r--lldb/source/Commands/CommandObjectApropos.cpp14
-rw-r--r--lldb/source/Commands/CommandObjectBreakpoint.cpp173
-rw-r--r--lldb/source/Commands/CommandObjectBreakpointCommand.cpp42
-rw-r--r--lldb/source/Commands/CommandObjectCommands.cpp119
-rw-r--r--lldb/source/Commands/CommandObjectDWIMPrint.cpp3
-rw-r--r--lldb/source/Commands/CommandObjectExpression.cpp14
-rw-r--r--lldb/source/Commands/CommandObjectFrame.cpp59
-rw-r--r--lldb/source/Commands/CommandObjectHelp.cpp13
-rw-r--r--lldb/source/Commands/CommandObjectLog.cpp56
-rw-r--r--lldb/source/Commands/CommandObjectPlatform.cpp80
-rw-r--r--lldb/source/Commands/CommandObjectPlugin.cpp14
-rw-r--r--lldb/source/Commands/CommandObjectProcess.cpp50
-rw-r--r--lldb/source/Commands/CommandObjectQuit.cpp3
-rw-r--r--lldb/source/Commands/CommandObjectRegister.cpp22
-rw-r--r--lldb/source/Commands/CommandObjectSession.cpp4
-rw-r--r--lldb/source/Commands/CommandObjectSettings.cpp42
-rw-r--r--lldb/source/Commands/CommandObjectTarget.cpp107
-rw-r--r--lldb/source/Commands/CommandObjectThread.cpp90
-rw-r--r--lldb/source/Commands/CommandObjectThreadUtil.cpp6
-rw-r--r--lldb/source/Commands/CommandObjectTrace.cpp9
-rw-r--r--lldb/source/Commands/CommandObjectType.cpp113
-rw-r--r--lldb/source/Commands/CommandObjectWatchpoint.cpp68
-rw-r--r--lldb/source/Commands/CommandObjectWatchpointCommand.cpp42
-rw-r--r--lldb/source/Host/common/Editline.cpp5
-rw-r--r--lldb/source/Interpreter/CommandObject.cpp39
-rw-r--r--lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp14
-rw-r--r--lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp28
-rw-r--r--lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp15
-rw-r--r--lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp6
-rw-r--r--lldb/source/Target/Target.cpp8
-rw-r--r--lldb/source/Utility/ArchSpec.cpp17
-rw-r--r--lldb/test/API/macosx/arm64e-attach/Makefile7
-rw-r--r--lldb/test/API/macosx/arm64e-attach/TestArm64eAttach.py35
-rw-r--r--lldb/test/API/macosx/arm64e-attach/main.c2
-rw-r--r--lldb/test/API/repl/clang/TestClangREPL.py32
-rw-r--r--lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py2
-rw-r--r--lldb/test/API/tools/lldb-server/TestNonStop.py6
-rw-r--r--lldb/tools/debugserver/source/CMakeLists.txt4
-rw-r--r--lldb/tools/debugserver/source/DNB.cpp8
-rw-r--r--lldb/tools/debugserver/source/DNB.h2
-rw-r--r--lldb/tools/debugserver/source/MacOSX/MachProcess.h2
-rw-r--r--lldb/tools/debugserver/source/MacOSX/MachProcess.mm17
-rw-r--r--lldb/tools/debugserver/source/RNBRemote.cpp106
-rw-r--r--lldb/tools/lldb-dap/LLDBUtils.cpp12
-rw-r--r--llvm/CMakeLists.txt2
-rw-r--r--llvm/cmake/modules/AddLLVM.cmake17
-rw-r--r--llvm/docs/CMake.rst6
-rw-r--r--llvm/docs/CommandGuide/llvm-ar.rst5
-rw-r--r--llvm/docs/CommandGuide/llvm-exegesis.rst8
-rw-r--r--llvm/docs/CommandGuide/llvm-objcopy.rst13
-rw-r--r--llvm/docs/CommandGuide/llvm-objdump.rst16
-rw-r--r--llvm/docs/CommandGuide/llvm-readobj.rst11
-rw-r--r--llvm/docs/GettingInvolved.rst4
-rw-r--r--llvm/docs/LangRef.rst2
-rw-r--r--llvm/docs/Packaging.rst73
-rw-r--r--llvm/docs/ReleaseNotes.rst8
-rw-r--r--llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h3
-rw-r--r--llvm/include/llvm/BinaryFormat/DXContainer.h2
-rw-r--r--llvm/include/llvm/BinaryFormat/DXContainerConstants.def74
-rw-r--r--llvm/include/llvm/BinaryFormat/Dwarf.h22
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h1
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h4
-rw-r--r--llvm/include/llvm/CodeGen/MachineLoopInfo.h5
-rw-r--r--llvm/include/llvm/CodeGen/MachineScheduler.h19
-rw-r--r--llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h12
-rw-r--r--llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h7
-rw-r--r--llvm/include/llvm/DebugInfo/DIContext.h1
-rw-r--r--llvm/include/llvm/Demangle/ItaniumDemangle.h2
-rw-r--r--llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h6
-rw-r--r--llvm/include/llvm/IR/BasicBlock.h9
-rw-r--r--llvm/include/llvm/IR/DebugProgramInstruction.h7
-rw-r--r--llvm/include/llvm/IR/IntrinsicsSPIRV.td1
-rw-r--r--llvm/include/llvm/IR/Module.h12
-rw-r--r--llvm/include/llvm/IR/PrintPasses.h19
-rw-r--r--llvm/include/llvm/LTO/LTO.h2
-rw-r--r--llvm/include/llvm/ObjCopy/ELF/ELFConfig.h3
-rw-r--r--llvm/include/llvm/Object/Archive.h2
-rw-r--r--llvm/include/llvm/Object/DXContainer.h8
-rw-r--r--llvm/include/llvm/ObjectYAML/DXContainerYAML.h14
-rw-r--r--llvm/include/llvm/Passes/CodeGenPassBuilder.h3
-rw-r--r--llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h79
-rw-r--r--llvm/include/llvm/ProfileData/InstrProf.h17
-rw-r--r--llvm/include/llvm/ProfileData/InstrProfData.inc54
-rw-r--r--llvm/include/llvm/ProfileData/InstrProfReader.h13
-rw-r--r--llvm/include/llvm/ProfileData/InstrProfWriter.h4
-rw-r--r--llvm/include/llvm/Support/BlockFrequency.h4
-rw-r--r--llvm/include/llvm/Support/TypeSize.h2
-rw-r--r--llvm/include/llvm/TargetParser/AArch64TargetParser.h2
-rw-r--r--llvm/lib/Analysis/BlockFrequencyInfo.cpp2
-rw-r--r--llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp15
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp4
-rw-r--r--llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp74
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp68
-rw-r--r--llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp2
-rw-r--r--llvm/lib/CodeGen/MachineLICM.cpp19
-rw-r--r--llvm/lib/CodeGen/MachineLoopInfo.cpp6
-rw-r--r--llvm/lib/CodeGen/MachineScheduler.cpp130
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp46
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp54
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h15
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp12
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp20
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp6
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp29
-rw-r--r--llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp33
-rw-r--r--llvm/lib/IR/AsmWriter.cpp81
-rw-r--r--llvm/lib/IR/BasicBlock.cpp65
-rw-r--r--llvm/lib/IR/DebugProgramInstruction.cpp6
-rw-r--r--llvm/lib/IR/IRPrintingPasses.cpp32
-rw-r--r--llvm/lib/IR/Module.cpp22
-rw-r--r--llvm/lib/IR/Verifier.cpp115
-rw-r--r--llvm/lib/IRPrinter/IRPrintingPasses.cpp29
-rw-r--r--llvm/lib/LTO/LTO.cpp15
-rw-r--r--llvm/lib/LTO/LTOCodeGenerator.cpp14
-rw-r--r--llvm/lib/LTO/ThinLTOCodeGenerator.cpp13
-rw-r--r--llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp4
-rw-r--r--llvm/lib/Object/Archive.cpp4
-rw-r--r--llvm/lib/Object/ArchiveWriter.cpp2
-rw-r--r--llvm/lib/Object/DXContainer.cpp8
-rw-r--r--llvm/lib/ObjectYAML/DXContainerYAML.cpp14
-rw-r--r--llvm/lib/ProfileData/Coverage/CoverageMapping.cpp83
-rw-r--r--llvm/lib/ProfileData/InstrProf.cpp11
-rw-r--r--llvm/lib/ProfileData/InstrProfReader.cpp44
-rw-r--r--llvm/lib/ProfileData/InstrProfWriter.cpp43
-rw-r--r--llvm/lib/Support/BlockFrequency.cpp17
-rw-r--r--llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp35
-rw-r--r--llvm/lib/Target/AArch64/AArch64CallingConvention.td3
-rw-r--r--llvm/lib/Target/AArch64/AArch64FastISel.cpp3
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp9
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.td48
-rw-r--r--llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp21
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedExynosM4.td4
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedExynosM5.td4
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp3
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp24
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp21
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp215
-rw-r--r--llvm/lib/Target/AMDGPU/DSInstructions.td4
-rw-r--r--llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp12
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRegPressure.cpp96
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRegPressure.h16
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp16
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSubtarget.h6
-rw-r--r--llvm/lib/Target/AMDGPU/SIFoldOperands.cpp15
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.h2
-rw-r--r--llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp47
-rw-r--r--llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/SOPInstructions.td1
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp14
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h13
-rw-r--r--llvm/lib/Target/AMDGPU/VOP1Instructions.td6
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3Instructions.td2
-rw-r--r--llvm/lib/Target/AMDGPU/VOPInstructions.td1
-rw-r--r--llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp57
-rw-r--r--llvm/lib/Target/ARM/ARMInstrFormats.td31
-rw-r--r--llvm/lib/Target/ARM/ARMInstrVFP.td64
-rw-r--r--llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp76
-rw-r--r--llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp23
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp32
-rw-r--r--llvm/lib/Target/DirectX/DXILShaderFlags.cpp2
-rw-r--r--llvm/lib/Target/DirectX/DXILShaderFlags.h6
-rw-r--r--llvm/lib/Target/Hexagon/CMakeLists.txt1
-rw-r--r--llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp56
-rw-r--r--llvm/lib/Target/Hexagon/HexagonInstrInfo.h2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp689
-rw-r--r--llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp13
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h12
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp25
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp12
-rw-r--r--llvm/lib/Target/Mips/MipsISelLowering.cpp10
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp88
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp18
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstrInfo.td9
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp100
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp4
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp8
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp4
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td2
-rw-r--r--llvm/lib/Target/WebAssembly/CMakeLists.txt1
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssembly.h2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp7
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp5
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyRefTypeMem2Local.cpp91
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp26
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp10
-rw-r--r--llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp2
-rw-r--r--llvm/lib/Target/X86/X86FrameLowering.cpp67
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp94
-rw-r--r--llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp2
-rw-r--r--llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/ConstraintElimination.cpp6
-rw-r--r--llvm/lib/Transforms/Scalar/SROA.cpp195
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp76
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp4
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.cpp35
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h34
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp36
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanValue.h12
-rw-r--r--llvm/test/Analysis/CostModel/AArch64/ext-rhadd.ll28
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll191
-rw-r--r--llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll36
-rw-r--r--llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll12
-rw-r--r--llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll16
-rw-r--r--llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll85
-rw-r--r--llvm/test/CodeGen/AArch64/load.ll318
-rw-r--r--llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll65
-rw-r--r--llvm/test/CodeGen/AArch64/store.ll342
-rw-r--r--llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/sve-expand-div.ll16
-rw-r--r--llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll12
-rw-r--r--llvm/test/CodeGen/AArch64/sve-hadd.ll162
-rw-r--r--llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll10
-rw-r--r--llvm/test/CodeGen/AArch64/sve-int-arith.ll36
-rw-r--r--llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll42
-rw-r--r--llvm/test/CodeGen/AArch64/sve-knownbits.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-pred-selectop.ll12
-rw-r--r--llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll70
-rw-r--r--llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll16
-rw-r--r--llvm/test/CodeGen/AArch64/sve-splat-sext.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll64
-rw-r--r--llvm/test/CodeGen/AArch64/vecreduce-add.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll27
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-no-rtn.ll73
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-rtn.ll77
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-no-rtn.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll59
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll32
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.3d.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.d.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.a16.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll38
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.id.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll596
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll632
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll5
-rw-r--r--llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir66
-rw-r--r--llvm/test/CodeGen/AMDGPU/accvgpr-spill-scc-clobber.mir18
-rw-r--r--llvm/test/CodeGen/AMDGPU/agpr-copy-no-vgprs.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/agpr-copy-propagation.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/agpr-copy-sgpr-no-vgprs.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/agpr-to-agpr-copy.mir44
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/bf16.ll516
-rw-r--r--llvm/test/CodeGen/AMDGPU/blender-coalescer-verifier-error-empty-subrange.mir16
-rw-r--r--llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir8
-rw-r--r--llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/clamp.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/coalesce-identity-copies-undef-subregs.mir50
-rw-r--r--llvm/test/CodeGen/AMDGPU/coalesce-into-dead-subreg-copies.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/coalesce-liveout-undef-copy.mir8
-rw-r--r--llvm/test/CodeGen/AMDGPU/coalescer-remat-dead-use.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/coalescer-removepartial-extend-undef-subrange.mir14
-rw-r--r--llvm/test/CodeGen/AMDGPU/coalescer-subranges-prune-kill-copy.mir6
-rw-r--r--llvm/test/CodeGen/AMDGPU/coalescing-subreg-was-undef-but-became-def.mir8
-rw-r--r--llvm/test/CodeGen/AMDGPU/coalescing_makes_lanes_undef.mir10
-rw-r--r--llvm/test/CodeGen/AMDGPU/collapse-endcf2.mir26
-rw-r--r--llvm/test/CodeGen/AMDGPU/combine-sreg64-inits.mir50
-rw-r--r--llvm/test/CodeGen/AMDGPU/commute-vop3.mir1
-rw-r--r--llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll94
-rw-r--r--llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir57
-rw-r--r--llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir38
-rw-r--r--llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir16
-rw-r--r--llvm/test/CodeGen/AMDGPU/dead_bundle.mir6
-rw-r--r--llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir12
-rw-r--r--llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir20
-rw-r--r--llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll52
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat-scratch.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll85
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir6
-rw-r--r--llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir28
-rw-r--r--llvm/test/CodeGen/AMDGPU/gws-hazards.mir12
-rw-r--r--llvm/test/CodeGen/AMDGPU/high-bits-zeroed-16-bit-ops.mir96
-rw-r--r--llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll15
-rw-r--r--llvm/test/CodeGen/AMDGPU/liveout-implicit-def-subreg-redef-blender-verifier-error.mir24
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll3
-rw-r--r--llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir1
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir6
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir42
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir16
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir874
-rw-r--r--llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir62
-rw-r--r--llvm/test/CodeGen/AMDGPU/merge-s-load.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir306
-rw-r--r--llvm/test/CodeGen/AMDGPU/mode-register-fptrunc.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/move-to-valu-lshlrev.mir1
-rw-r--r--llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll3
-rw-r--r--llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir6
-rw-r--r--llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir40
-rw-r--r--llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir34
-rw-r--r--llvm/test/CodeGen/AMDGPU/omod.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/opt-exec-masking-pre-ra-update-liveness-wave32.mir38
-rw-r--r--llvm/test/CodeGen/AMDGPU/opt-exec-masking-pre-ra-update-liveness.mir20
-rw-r--r--llvm/test/CodeGen/AMDGPU/optimize-exec-mask-pre-ra-loop-phi.mir28
-rw-r--r--llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/partial-forwarding-hazards.mir7
-rw-r--r--llvm/test/CodeGen/AMDGPU/pei-build-av-spill.mir168
-rw-r--r--llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir7
-rw-r--r--llvm/test/CodeGen/AMDGPU/pei-build-spill.mir126
-rw-r--r--llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/regalloc-fast-dont-drop-subreg-index-issue61134.mir14
-rw-r--r--llvm/test/CodeGen/AMDGPU/regcoalesce-cannot-join-failures.mir36
-rw-r--r--llvm/test/CodeGen/AMDGPU/regcoalesce-keep-valid-lanes-implicit-def-bug39602.mir16
-rw-r--r--llvm/test/CodeGen/AMDGPU/regcoalescer-resolve-lane-conflict-by-subranges.mir19
-rw-r--r--llvm/test/CodeGen/AMDGPU/remat-dead-subreg.mir16
-rw-r--r--llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir28
-rw-r--r--llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir34
-rw-r--r--llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir48
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-barrier.mir42
-rw-r--r--llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir57
-rw-r--r--llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir278
-rw-r--r--llvm/test/CodeGen/AMDGPU/shrink-instructions-flags.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir16
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir13
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-fold-reg-sequence.mir18
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir10
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-agpr.mir42
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir20
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-regpressure-less.mir353
-rw-r--r--llvm/test/CodeGen/AMDGPU/split-liverange-overlapping-copies.mir42
-rw-r--r--llvm/test/CodeGen/AMDGPU/split-mbb-lis-subrange.mir8
-rw-r--r--llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir207
-rw-r--r--llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir802
-rw-r--r--llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir28
-rw-r--r--llvm/test/CodeGen/AMDGPU/trans-forwarding-hazards.mir8
-rw-r--r--llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll222
-rw-r--r--llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/twoaddr-regsequence.mir6
-rw-r--r--llvm/test/CodeGen/AMDGPU/undef-subreg-use-after-coalesce.mir18
-rw-r--r--llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir30
-rw-r--r--llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/vgpr-remat.mir10
-rw-r--r--llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir39
-rw-r--r--llvm/test/CodeGen/AMDGPU/vopd-combine.mir30
-rw-r--r--llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll26
-rw-r--r--llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir43
-rw-r--r--llvm/test/CodeGen/AMDGPU/waitcnt-vinterp.mir18
-rw-r--r--llvm/test/CodeGen/AMDGPU/wqm-terminators.mir6
-rw-r--r--llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir4
-rw-r--r--llvm/test/CodeGen/ARM/vlldm-vlstm-uops.mir9
-rw-r--r--llvm/test/CodeGen/Generic/expand-vp-load-store.ll8
-rw-r--r--llvm/test/CodeGen/Hexagon/post-inc-vec.mir413
-rw-r--r--llvm/test/CodeGen/Hexagon/post_inc_store.mir168
-rw-r--r--llvm/test/CodeGen/Hexagon/postincopt-crash.mir58
-rw-r--r--llvm/test/CodeGen/Hexagon/postincopt-dcfetch.mir19
-rw-r--r--llvm/test/CodeGen/Hexagon/valid-offset-loadbsw4.mir32
-rw-r--r--llvm/test/CodeGen/LoongArch/lasx/intrinsic-permi.ll30
-rw-r--r--llvm/test/CodeGen/Mips/inlineasm-constraints-softfloat.ll48
-rw-r--r--llvm/test/CodeGen/RISCV/fpclamptosat.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/misched-postra-direction.mir53
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll12
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll8
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll16
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll860
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll559
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll265
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll82
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll52
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll12
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/pr61561.ll10
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/pr63459.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll14
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll8
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/stepvector.ll10
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll38
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll46
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll12
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll88
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll12
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll26
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll26
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfwadd-vp.ll10
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vmadd-sdnode.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vnmsub-sdnode.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll16
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll88
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vsext-vp-mask.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll14
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vsitofp-vp-mask.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll26
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vtrunc-vp-mask.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll12
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vuitofp-vp-mask.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll26
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vwadd-mask-sdnode.ll12
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vwadd-vp.ll12
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vwsll-vp.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll10
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vzext-vp-mask.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll14
-rw-r--r--llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr.ll54
-rw-r--r--llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr_spec_const.ll110
-rw-r--r--llvm/test/CodeGen/SPIRV/freeze.ll37
-rw-r--r--llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll4
-rw-r--r--llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir2
-rw-r--r--llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll2
-rw-r--r--llvm/test/CodeGen/WebAssembly/multivalue.ll10
-rw-r--r--llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll2
-rw-r--r--llvm/test/CodeGen/WebAssembly/ref-type-mem2local.ll57
-rw-r--r--llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll47
-rw-r--r--llvm/test/CodeGen/X86/extract-concat.ll19
-rw-r--r--llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll54
-rw-r--r--llvm/test/CodeGen/X86/kshift.ll12
-rw-r--r--llvm/test/CodeGen/X86/pr46455.ll5
-rw-r--r--llvm/test/CodeGen/X86/setcc-lowering.ll97
-rw-r--r--llvm/test/CodeGen/X86/vec_anyext.ll23
-rw-r--r--llvm/test/CodeGen/X86/vec_cast.ll2
-rw-r--r--llvm/test/CodeGen/X86/x86-32-intrcc.ll10
-rw-r--r--llvm/test/CodeGen/X86/x86-64-intrcc-uintr.ll8
-rw-r--r--llvm/test/CodeGen/X86/x86-64-intrcc.ll3
-rw-r--r--llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll7
-rw-r--r--llvm/test/DebugInfo/Generic/inline-dbg-values.ll8
-rwxr-xr-xllvm/test/DebugInfo/dpvalue-print-nocrash.ll3
-rw-r--r--llvm/test/DebugInfo/print-non-instruction-debug-info.ll86
-rw-r--r--llvm/test/Instrumentation/AddressSanitizer/do-not-instrument-globals-windows.ll10
-rw-r--r--llvm/test/Instrumentation/InstrProfiling/coverage.ll8
-rw-r--r--llvm/test/MC/AMDGPU/gfx1011_err.s2
-rw-r--r--llvm/test/MC/AMDGPU/gfx1030_err.s2
-rw-r--r--llvm/test/MC/AMDGPU/gfx10_err_pos.s10
-rw-r--r--llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_alias.s396
-rw-r--r--llvm/test/MC/AMDGPU/gfx11_asm_mubuf_alias.s3392
-rw-r--r--llvm/test/MC/AMDGPU/gfx11_asm_smem_alias.s421
-rw-r--r--llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias.s4
-rw-r--r--llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s22
-rw-r--r--llvm/test/MC/AMDGPU/gfx12_asm_sop2_alias.s210
-rw-r--r--llvm/test/MC/AMDGPU/gfx12_asm_sopk_alias.s2
-rw-r--r--llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_alias.s427
-rw-r--r--llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_alias.s2151
-rw-r--r--llvm/test/MC/AMDGPU/gfx12_asm_vflat_alias.s31
-rw-r--r--llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s4
-rw-r--r--llvm/test/MC/AMDGPU/gfx940_asm_features.s10
-rw-r--r--llvm/test/MC/AMDGPU/gfx940_err.s12
-rw-r--r--llvm/test/MC/AMDGPU/sopk-err.s173
-rw-r--r--llvm/test/MC/AMDGPU/sopk.s49
-rw-r--r--llvm/test/MC/ARM/thumbv8m.s8
-rw-r--r--llvm/test/MC/ARM/vlstm-vlldm-8.1m.s11
-rw-r--r--llvm/test/MC/ARM/vlstm-vlldm-8m.s17
-rw-r--r--llvm/test/MC/ARM/vlstm-vlldm-diag.s61
-rw-r--r--llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.1.main.txt11
-rw-r--r--llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.main.txt17
-rw-r--r--llvm/test/MC/Mips/cpsetup.s47
-rw-r--r--llvm/test/TableGen/HwModeEncodeDecode3.td168
-rw-r--r--llvm/test/TableGen/ReadAdvanceInvalidWrite.td29
-rw-r--r--llvm/test/Transforms/AggressiveInstCombine/vector-or-load.ll4
-rw-r--r--llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll18
-rw-r--r--llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll6
-rw-r--r--llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll24
-rw-r--r--llvm/test/Transforms/ConstraintElimination/sext-signed-predicates.ll (renamed from llvm/test/Transforms/ConstraintElimination/sext.ll)82
-rw-r--r--llvm/test/Transforms/ConstraintElimination/sext-unsigned-predicates.ll136
-rw-r--r--llvm/test/Transforms/ConstraintElimination/zext-for-per-formula-reasoning.ll4
-rw-r--r--llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll32
-rw-r--r--llvm/test/Transforms/IROutliner/outlining-debug-statements.ll7
-rw-r--r--llvm/test/Transforms/InstCombine/AArch64/VectorUtils_heuristics.ll4
-rw-r--r--llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-sdiv.ll18
-rw-r--r--llvm/test/Transforms/InstCombine/add.ll8
-rw-r--r--llvm/test/Transforms/InstCombine/bitcast.ll2
-rw-r--r--llvm/test/Transforms/InstCombine/div.ll6
-rw-r--r--llvm/test/Transforms/InstCombine/fdiv.ll2
-rw-r--r--llvm/test/Transforms/InstCombine/fmul.ll6
-rw-r--r--llvm/test/Transforms/InstCombine/icmp-vec.ll4
-rw-r--r--llvm/test/Transforms/InstCombine/intrinsics.ll4
-rw-r--r--llvm/test/Transforms/InstCombine/load-store-forward.ll24
-rw-r--r--llvm/test/Transforms/InstCombine/logical-select.ll6
-rw-r--r--llvm/test/Transforms/InstCombine/masked_intrinsics.ll6
-rw-r--r--llvm/test/Transforms/InstCombine/mul-masked-bits.ll2
-rw-r--r--llvm/test/Transforms/InstCombine/rem-mul-shl.ll8
-rw-r--r--llvm/test/Transforms/InstCombine/select.ll16
-rw-r--r--llvm/test/Transforms/InstCombine/shift.ll12
-rw-r--r--llvm/test/Transforms/InstCombine/sub.ll4
-rw-r--r--llvm/test/Transforms/InstCombine/udiv-simplify.ll6
-rw-r--r--llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll4
-rw-r--r--llvm/test/Transforms/InstCombine/vec_shuffle.ll4
-rw-r--r--llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll2
-rw-r--r--llvm/test/Transforms/InstSimplify/ConstProp/vscale.ll2
-rw-r--r--llvm/test/Transforms/InstSimplify/cmp-vec-fast-path.ll52
-rw-r--r--llvm/test/Transforms/InstSimplify/fp-nan.ll20
-rw-r--r--llvm/test/Transforms/InstSimplify/shift.ll2
-rw-r--r--llvm/test/Transforms/InstSimplify/vscale-inseltpoison.ll2
-rw-r--r--llvm/test/Transforms/InstSimplify/vscale.ll2
-rw-r--r--llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll3
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/small-size.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll159
-rw-r--r--llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll90
-rw-r--r--llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll4
-rw-r--r--llvm/test/Transforms/MemProfContextDisambiguation/inlined3.ll193
-rw-r--r--llvm/test/Transforms/PGOProfile/Inputs/thinlto_indirect_call_promotion.profrawbin528 -> 544 bytes
-rw-r--r--llvm/test/Transforms/PGOProfile/comdat_internal.ll4
-rw-r--r--llvm/test/Transforms/SROA/phi-and-select.ll20
-rw-r--r--llvm/test/Transforms/SROA/phi-gep.ll141
-rw-r--r--llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll220
-rw-r--r--llvm/test/tools/llvm-exegesis/X86/latency/loop-register.s12
-rw-r--r--llvm/test/tools/llvm-mca/AArch64/Exynos/float-divide-multiply.s12
-rw-r--r--llvm/test/tools/llvm-objcopy/ELF/set-symbol-visibility.test311
-rw-r--r--llvm/test/tools/llvm-objdump/X86/elf-pgoanalysismap.yaml56
-rw-r--r--llvm/test/tools/llvm-profdata/Inputs/c-general.profrawbin2016 -> 2032 bytes
-rw-r--r--llvm/test/tools/llvm-profdata/Inputs/compressed.profrawbin1968 -> 1984 bytes
-rw-r--r--llvm/test/tools/llvm-profdata/Inputs/thinlto_indirect_call_promotion.profrawbin0 -> 528 bytes
-rw-r--r--llvm/test/tools/llvm-profdata/binary-ids-padding.test6
-rw-r--r--llvm/test/tools/llvm-profdata/large-binary-id-size.test4
-rw-r--r--llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test6
-rw-r--r--llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test6
-rw-r--r--llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test6
-rw-r--r--llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test4
-rw-r--r--llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test2
-rw-r--r--llvm/test/tools/llvm-profdata/raw-32-bits-be.test11
-rw-r--r--llvm/test/tools/llvm-profdata/raw-32-bits-le.test10
-rw-r--r--llvm/test/tools/llvm-profdata/raw-64-bits-be.test10
-rw-r--r--llvm/test/tools/llvm-profdata/raw-64-bits-le.test10
-rw-r--r--llvm/test/tools/llvm-profdata/raw-two-profiles.test8
-rw-r--r--llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test28
-rw-r--r--llvm/tools/llvm-ar/llvm-ar.cpp16
-rw-r--r--llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp14
-rw-r--r--llvm/tools/llvm-exegesis/lib/BenchmarkResult.h2
-rw-r--r--llvm/tools/llvm-exegesis/lib/SnippetFile.cpp20
-rw-r--r--llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp14
-rw-r--r--llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h3
-rw-r--r--llvm/tools/llvm-exegesis/lib/Target.h7
-rw-r--r--llvm/tools/llvm-exegesis/lib/X86/Target.cpp18
-rw-r--r--llvm/tools/llvm-exegesis/llvm-exegesis.cpp37
-rw-r--r--llvm/tools/llvm-objcopy/CommonOpts.td27
-rw-r--r--llvm/tools/llvm-objcopy/ObjcopyOptions.cpp42
-rw-r--r--llvm/tools/llvm-objcopy/ObjcopyOpts.td86
-rw-r--r--llvm/tools/llvm-objdump/ObjdumpOpts.td4
-rw-r--r--llvm/tools/llvm-objdump/llvm-objdump.cpp29
-rw-r--r--llvm/tools/llvm-readobj/ELFDumper.cpp35
-rw-r--r--llvm/tools/llvm-readobj/ObjDumper.h4
-rw-r--r--llvm/tools/llvm-readobj/Opts.td1
-rw-r--r--llvm/tools/llvm-readobj/llvm-readobj.cpp8
-rw-r--r--llvm/tools/obj2yaml/dxcontainer2yaml.cpp4
-rw-r--r--llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp80
-rw-r--r--llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp153
-rw-r--r--llvm/unittests/IR/DebugInfoTest.cpp13
-rw-r--r--llvm/unittests/Support/TypeSizeTest.cpp1
-rw-r--r--llvm/unittests/Target/ARM/MachineInstrTest.cpp2
-rw-r--r--llvm/unittests/tools/llvm-exegesis/X86/SnippetFileTest.cpp19
-rw-r--r--llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp16
-rw-r--r--llvm/utils/TableGen/CodeEmitterGen.cpp8
-rw-r--r--llvm/utils/TableGen/CodeGenHwModes.h5
-rw-r--r--llvm/utils/TableGen/CodeGenInstruction.h2
-rw-r--r--llvm/utils/TableGen/CodeGenSchedule.cpp9
-rw-r--r--llvm/utils/TableGen/DecoderEmitter.cpp10
-rw-r--r--llvm/utils/TableGen/SubtargetEmitter.cpp5
-rw-r--r--llvm/utils/gn/secondary/bolt/lib/Core/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn2
-rw-r--r--llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn1
-rw-r--r--mlir/CMakeLists.txt2
-rw-r--r--mlir/docs/Dialects/GPU.md1
-rw-r--r--mlir/docs/PassManagement.md3
-rw-r--r--mlir/docs/PatternRewriter.md2
-rw-r--r--mlir/examples/CMakeLists.txt1
-rw-r--r--mlir/examples/transform-opt/CMakeLists.txt26
-rw-r--r--mlir/examples/transform-opt/README.md40
-rw-r--r--mlir/examples/transform-opt/mlir-transform-opt.cpp389
-rw-r--r--mlir/include/mlir-c/Dialect/SparseTensor.h9
-rw-r--r--mlir/include/mlir/Config/mlir-config.h.cmake4
-rw-r--r--mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td6
-rw-r--r--mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td9
-rw-r--r--mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h10
-rw-r--r--mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td16
-rw-r--r--mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td43
-rw-r--r--mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h10
-rw-r--r--mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h47
-rw-r--r--mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td5
-rw-r--r--mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorStorageLayout.h14
-rw-r--r--mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h8
-rw-r--r--mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h3
-rw-r--r--mlir/include/mlir/Dialect/Transform/Transforms/Passes.td24
-rw-r--r--mlir/include/mlir/IR/PatternMatch.h2
-rw-r--r--mlir/include/mlir/IR/Value.h2
-rw-r--r--mlir/include/mlir/InitAllPasses.h3
-rw-r--r--mlir/include/mlir/Transforms/DialectConversion.h3
-rw-r--r--mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp2
-rw-r--r--mlir/lib/Conversion/AffineToStandard/CMakeLists.txt3
-rw-r--r--mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp83
-rw-r--r--mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp12
-rw-r--r--mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp3
-rw-r--r--mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp3
-rw-r--r--mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp57
-rw-r--r--mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp98
-rw-r--r--mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp5
-rw-r--r--mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp3
-rw-r--r--mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp25
-rw-r--r--mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp114
-rw-r--r--mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp3
-rw-r--r--mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp5
-rw-r--r--mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp2
-rw-r--r--mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp25
-rw-r--r--mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp5
-rw-r--r--mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp2
-rw-r--r--mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp8
-rw-r--r--mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp95
-rw-r--r--mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp44
-rw-r--r--mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp126
-rw-r--r--mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp74
-rw-r--r--mlir/lib/Target/LLVM/NVVM/Target.cpp9
-rw-r--r--mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp198
-rw-r--r--mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp45
-rw-r--r--mlir/lib/Target/LLVMIR/ModuleTranslation.cpp76
-rw-r--r--mlir/lib/Transforms/Utils/DialectConversion.cpp140
-rw-r--r--mlir/test/CAPI/sparse_tensor.c4
-rw-r--r--mlir/test/CMakeLists.txt1
-rw-r--r--mlir/test/Conversion/AffineToStandard/lower-affine.mlir54
-rw-r--r--mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir113
-rw-r--r--mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir10
-rw-r--r--mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir50
-rw-r--r--mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir46
-rw-r--r--mlir/test/Dialect/Arith/expand-ops.mlir45
-rw-r--r--mlir/test/Dialect/Linalg/flatten-elementwise.mlir99
-rw-r--r--mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir45
-rw-r--r--mlir/test/Dialect/SparseTensor/codegen.mlir34
-rw-r--r--mlir/test/Dialect/SparseTensor/invalid_encoding.mlir6
-rw-r--r--mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir11
-rw-r--r--mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir2
-rw-r--r--mlir/test/Dialect/Transform/include/test-interpreter-external-concurrent-source.mlir25
-rw-r--r--mlir/test/Dialect/Transform/include/test-interpreter-external-source.mlir14
-rw-r--r--mlir/test/Dialect/Transform/multi-arg-top-level-ops.mlir63
-rw-r--r--mlir/test/Dialect/Transform/multi-arg-top-level-params.mlir39
-rw-r--r--mlir/test/Dialect/Transform/multi-arg-top-level-values.mlir32
-rw-r--r--mlir/test/Dialect/Transform/test-interpreter-debug.mlir58
-rw-r--r--mlir/test/Dialect/Transform/test-interpreter-external-concurrent.mlir4
-rw-r--r--mlir/test/Dialect/Transform/test-interpreter-external.mlir4
-rw-r--r--mlir/test/Dialect/Transform/test-interpreter.mlir1
-rw-r--r--mlir/test/Dialect/Transform/test-pass-application.mlir78
-rw-r--r--mlir/test/Dialect/Transform/test-pattern-application.mlir384
-rw-r--r--mlir/test/Dialect/Transform/test-pdl-extension.mlir68
-rw-r--r--mlir/test/Dialect/Transform/transform-state-extension.mlir102
-rw-r--r--mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir42
-rw-r--r--mlir/test/Dialect/Vector/vector-transforms.mlir45
-rw-r--r--mlir/test/Dialect/Vector/vector-warp-distribute.mlir25
-rw-r--r--mlir/test/Examples/transform-opt/empty.mlir12
-rw-r--r--mlir/test/Examples/transform-opt/external-decl.mlir18
-rw-r--r--mlir/test/Examples/transform-opt/external-def.mlir8
-rw-r--r--mlir/test/Examples/transform-opt/pass.mlir19
-rw-r--r--mlir/test/Examples/transform-opt/self-contained.mlir21
-rw-r--r--mlir/test/Examples/transform-opt/syntax-error.mlir5
-rw-r--r--mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-compare-results-i16.mlir2
-rw-r--r--mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir2
-rw-r--r--mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir10
-rw-r--r--mlir/test/Integration/Dialect/Vector/CPU/ArmSME/Emulated/test-setArmSVLBits.mlir8
-rw-r--r--mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir16
-rw-r--r--mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-load-vertical.mlir8
-rw-r--r--mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-multi-tile-transpose.mlir8
-rw-r--r--mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir16
-rw-r--r--mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir16
-rw-r--r--mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir12
-rw-r--r--mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir2
-rw-r--r--mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transpose.mlir8
-rw-r--r--mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir4
-rw-r--r--mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir8
-rw-r--r--mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir12
-rw-r--r--mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-contraction.mlir18
-rw-r--r--mlir/test/Integration/Dialect/Vector/CPU/test-print-str.mlir4
-rw-r--r--mlir/test/Target/LLVMIR/attribute-alias-scopes.mlir44
-rw-r--r--mlir/test/Target/LLVMIR/openmp-llvm.mlir37
-rw-r--r--mlir/test/Target/LLVMIR/openmp-private.mlir142
-rw-r--r--mlir/test/Target/LLVMIR/rocdl.mlir9
-rw-r--r--mlir/test/lib/Dialect/Test/TestPatterns.cpp1
-rw-r--r--mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp6
-rw-r--r--mlir/test/lit.cfg.py1
-rw-r--r--mlir/test/mlir-cpu-runner/expand-arith-ops.mlir47
-rw-r--r--mlir/test/python/dialects/sparse_tensor/dialect.py8
-rw-r--r--mlir/unittests/Dialect/SparseTensor/MergerTest.cpp58
-rw-r--r--mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp7
-rw-r--r--openmp/libompd/src/CMakeLists.txt3
-rw-r--r--openmp/libompd/src/omp-icv.cpp2
-rw-r--r--openmp/libomptarget/DeviceRTL/src/State.cpp2
-rw-r--r--openmp/runtime/src/kmp.h5
-rw-r--r--openmp/runtime/src/kmp_runtime.cpp5
-rw-r--r--openmp/runtime/test/barrier/llvm-issue-80664.c37
-rw-r--r--openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp3
-rw-r--r--openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp3
-rw-r--r--utils/bazel/llvm-project-overlay/libc/BUILD.bazel14
-rw-r--r--utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl1
-rw-r--r--utils/bazel/llvm-project-overlay/mlir/BUILD.bazel12
-rw-r--r--utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel19
1504 files changed, 38169 insertions, 20410 deletions
diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h
index 30336c4e3a74..741b1a36af86 100644
--- a/bolt/include/bolt/Core/BinaryContext.h
+++ b/bolt/include/bolt/Core/BinaryContext.h
@@ -997,6 +997,10 @@ public:
return getUniqueSectionByName(".gdb_index");
}
+ ErrorOr<BinarySection &> getDebugNamesSection() const {
+ return getUniqueSectionByName(".debug_names");
+ }
+
/// @}
/// Register \p TargetFunction as a fragment of \p Function if checks pass:
diff --git a/bolt/include/bolt/Core/DIEBuilder.h b/bolt/include/bolt/Core/DIEBuilder.h
index f0db924e2ccb..4debf9363354 100644
--- a/bolt/include/bolt/Core/DIEBuilder.h
+++ b/bolt/include/bolt/Core/DIEBuilder.h
@@ -16,6 +16,7 @@
#define BOLT_CORE_DIE_BUILDER_H
#include "bolt/Core/BinaryContext.h"
+#include "bolt/Core/DebugNames.h"
#include "llvm/CodeGen/DIE.h"
#include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
#include "llvm/DebugInfo/DWARF/DWARFDie.h"
@@ -127,6 +128,7 @@ private:
DWARFUnit *SkeletonCU{nullptr};
uint64_t UnitSize{0};
llvm::DenseSet<uint64_t> AllProcessed;
+ DWARF5AcceleratorTable &DebugNamesTable;
/// Returns current state of the DIEBuilder
State &getState() { return *BuilderState.get(); }
@@ -206,8 +208,8 @@ private:
/// Update references once the layout is finalized.
void updateReferences();
- /// Update the Offset and Size of DIE.
- uint32_t computeDIEOffset(const DWARFUnit &CU, DIE &Die, uint32_t &CurOffset);
+ /// Update the Offset and Size of DIE, populate DebugNames table.
+ uint32_t finalizeDIEs(DWARFUnit &CU, DIE &Die, uint32_t &CurOffset);
void registerUnit(DWARFUnit &DU, bool NeedSort);
@@ -269,6 +271,7 @@ private:
public:
DIEBuilder(BinaryContext &BC, DWARFContext *DwarfContext,
+ DWARF5AcceleratorTable &DebugNamesTable,
DWARFUnit *SkeletonCU = nullptr);
/// Returns enum to what we are currently processing.
diff --git a/bolt/include/bolt/Core/DebugData.h b/bolt/include/bolt/Core/DebugData.h
index 48b813a4ca11..7d10b208dc83 100644
--- a/bolt/include/bolt/Core/DebugData.h
+++ b/bolt/include/bolt/Core/DebugData.h
@@ -439,6 +439,8 @@ public:
/// Update Str offset in .debug_str in .debug_str_offsets.
void updateAddressMap(uint32_t Index, uint32_t Address);
+ /// Get offset for given index in original .debug_str_offsets section.
+ uint64_t getOffset(uint32_t Index) const { return StrOffsets[Index]; }
/// Writes out current sections entry into .debug_str_offsets.
void finalizeSection(DWARFUnit &Unit, DIEBuilder &DIEBldr);
@@ -463,7 +465,7 @@ private:
std::unique_ptr<DebugStrOffsetsBufferVector> StrOffsetsBuffer;
std::unique_ptr<raw_svector_ostream> StrOffsetsStream;
std::map<uint32_t, uint32_t> IndexToAddressMap;
- std::vector<uint32_t> StrOffsets;
+ SmallVector<uint32_t, 5> StrOffsets;
std::unordered_map<uint64_t, uint64_t> ProcessedBaseOffsets;
bool StrOffsetSectionWasModified = false;
};
@@ -484,11 +486,12 @@ public:
/// Returns False if no strings were added to .debug_str.
bool isInitialized() const { return !StrBuffer->empty(); }
+ /// Initializes Buffer and Stream.
+ void initialize();
+
private:
/// Mutex used for parallel processing of debug info.
std::mutex WriterMutex;
- /// Initializes Buffer and Stream.
- void initialize();
/// Creates internal data structures.
void create();
std::unique_ptr<DebugStrBufferVector> StrBuffer;
diff --git a/bolt/include/bolt/Core/DebugNames.h b/bolt/include/bolt/Core/DebugNames.h
new file mode 100644
index 000000000000..84c448aa1354
--- /dev/null
+++ b/bolt/include/bolt/Core/DebugNames.h
@@ -0,0 +1,172 @@
+//===- bolt/Core/DebugNames.h - Debug names support -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains declaration of classes required for generation of
+// .debug_names section.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BOLT_CORE_DEBUG_NAMES_H
+#define BOLT_CORE_DEBUG_NAMES_H
+
+#include "DebugData.h"
+#include "llvm/CodeGen/AccelTable.h"
+
+namespace llvm {
+namespace bolt {
+class BOLTDWARF5AccelTableData : public DWARF5AccelTableData {
+public:
+ BOLTDWARF5AccelTableData(const uint64_t DieOffset,
+ const std::optional<uint64_t> DefiningParentOffset,
+ const unsigned DieTag, const unsigned UnitID,
+ const bool IsTU,
+ const std::optional<unsigned> SecondUnitID)
+ : DWARF5AccelTableData(DieOffset, DefiningParentOffset, DieTag, UnitID,
+ IsTU),
+ SecondUnitID(SecondUnitID) {}
+
+ uint64_t getDieOffset() const { return DWARF5AccelTableData::getDieOffset(); }
+ unsigned getDieTag() const { return DWARF5AccelTableData::getDieTag(); }
+ unsigned getUnitID() const { return DWARF5AccelTableData::getUnitID(); }
+ bool isTU() const { return DWARF5AccelTableData::isTU(); }
+ std::optional<unsigned> getSecondUnitID() const { return SecondUnitID; }
+
+private:
+ std::optional<unsigned> SecondUnitID;
+};
+
+class DWARF5AcceleratorTable {
+public:
+ DWARF5AcceleratorTable(const bool CreateDebugNames, BinaryContext &BC,
+ DebugStrWriter &MainBinaryStrWriter);
+ ~DWARF5AcceleratorTable() {
+ for (DebugNamesAbbrev *Abbrev : AbbreviationsVector)
+ Abbrev->~DebugNamesAbbrev();
+ }
+ /// Add DWARF5 Accelerator table entry.
+  /// Input is the DWARFUnit being processed, the DIE that belongs to it, and
+  /// the potential DWO ID if the Unit comes from a DWO section.
+ void addAccelTableEntry(DWARFUnit &Unit, const DIE &Die,
+ const std::optional<uint64_t> &DWOID);
+ /// Set current unit being processed.
+ void setCurrentUnit(DWARFUnit &Unit, const uint64_t UnitStartOffset);
+ /// Emit Accelerator table.
+ void emitAccelTable();
+  /// Returns true if the table was created.
+ bool isCreated() const { return NeedToCreate; }
+ /// Returns buffer containing the accelerator table.
+ std::unique_ptr<DebugBufferVector> releaseBuffer() {
+ return std::move(FullTableBuffer);
+ }
+
+private:
+ BinaryContext &BC;
+ bool NeedToCreate = false;
+ BumpPtrAllocator Allocator;
+ DebugStrWriter &MainBinaryStrWriter;
+ StringRef StrSection;
+ uint64_t CurrentUnitOffset = 0;
+ const DWARFUnit *CurrentUnit = nullptr;
+ std::unordered_map<uint32_t, uint32_t> AbbrevTagToIndexMap;
+
+ /// Represents a group of entries with identical name (and hence, hash value).
+ struct HashData {
+ uint64_t StrOffset;
+ uint32_t HashValue;
+ uint32_t EntryOffset;
+ std::vector<BOLTDWARF5AccelTableData *> Values;
+ };
+ using HashList = std::vector<HashData *>;
+ using BucketList = std::vector<HashList>;
+ /// Contains all the offsets of CUs.
+ SmallVector<uint32_t, 1> CUList;
+ /// Contains all the offsets of local TUs.
+ SmallVector<uint32_t, 1> LocalTUList;
+ /// Contains all the type hashes for split dwarf TUs.
+ SmallVector<uint64_t, 1> ForeignTUList;
+ using StringEntries =
+ MapVector<std::string, HashData, llvm::StringMap<unsigned>>;
+ StringEntries Entries;
+ /// FoldingSet that uniques the abbreviations.
+ FoldingSet<DebugNamesAbbrev> AbbreviationsSet;
+ /// Vector containing DebugNames abbreviations for iteration in order.
+ SmallVector<DebugNamesAbbrev *, 5> AbbreviationsVector;
+ /// The bump allocator to use when creating DIEAbbrev objects in the uniqued
+ /// storage container.
+ BumpPtrAllocator Alloc;
+ uint32_t BucketCount = 0;
+ uint32_t UniqueHashCount = 0;
+ uint32_t AbbrevTableSize = 0;
+ uint32_t CUIndexEncodingSize = 4;
+ uint32_t TUIndexEncodingSize = 4;
+ uint32_t AugmentationStringSize = 0;
+ dwarf::Form CUIndexForm = dwarf::DW_FORM_data4;
+ dwarf::Form TUIndexForm = dwarf::DW_FORM_data4;
+
+ BucketList Buckets;
+
+ std::unique_ptr<DebugBufferVector> FullTableBuffer;
+ std::unique_ptr<raw_svector_ostream> FullTableStream;
+ std::unique_ptr<DebugBufferVector> StrBuffer;
+ std::unique_ptr<raw_svector_ostream> StrStream;
+ std::unique_ptr<DebugBufferVector> EntriesBuffer;
+ std::unique_ptr<raw_svector_ostream> Entriestream;
+ std::unique_ptr<DebugBufferVector> AugStringBuffer;
+ std::unique_ptr<raw_svector_ostream> AugStringtream;
+ llvm::DenseMap<llvm::hash_code, uint64_t> StrCacheToOffsetMap;
+ // Contains DWO ID to CUList Index.
+ llvm::DenseMap<uint64_t, uint32_t> CUOffsetsToPatch;
+ /// Adds Unit to either CUList, LocalTUList or ForeignTUList.
+  /// Input is the Unit being processed, and the DWO ID if the Unit comes
+ /// from a DWO section.
+ void addUnit(DWARFUnit &Unit, const std::optional<uint64_t> &DWOID);
+  /// Returns the buckets of the .debug_names table.
+ ArrayRef<HashList> getBuckets() const { return Buckets; }
+ /// Get encoding for a given attribute.
+ std::optional<DWARF5AccelTable::UnitIndexAndEncoding>
+ getIndexForEntry(const BOLTDWARF5AccelTableData &Value) const;
+ /// Get encoding for a given attribute for second index.
+ /// Returns nullopt if there is no second index.
+ std::optional<DWARF5AccelTable::UnitIndexAndEncoding>
+ getSecondIndexForEntry(const BOLTDWARF5AccelTableData &Value) const;
+ /// Uniquify Entries.
+ void finalize();
+ /// Computes bucket count.
+ void computeBucketCount();
+ /// Populate Abbreviations Map.
+ void populateAbbrevsMap();
+ /// Write Entries.
+ void writeEntries();
+ /// Write an Entry.
+ void writeEntry(const BOLTDWARF5AccelTableData &Entry);
+ /// Write augmentation_string for BOLT.
+ void writeAugmentationString();
+ /// Emit out Header for DWARF5 Accelerator table.
+ void emitHeader() const;
+ /// Emit out CU list.
+ void emitCUList() const;
+ /// Emit out TU List. Combination of LocalTUList and ForeignTUList.
+ void emitTUList() const;
+ /// Emit buckets.
+ void emitBuckets() const;
+ /// Emit hashes for hash table.
+ void emitHashes() const;
+ /// Emit string offsets for hash table.
+ void emitStringOffsets() const;
+ /// Emit Entry Offsets for hash table.
+ void emitOffsets() const;
+ /// Emit abbreviation table.
+ void emitAbbrevs();
+ /// Emit entries.
+ void emitData();
+ /// Emit augmentation string.
+ void emitAugmentationString() const;
+};
+} // namespace bolt
+} // namespace llvm
+#endif
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index 687b49a3cbda..6bb76d1b917d 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -620,7 +620,17 @@ public:
return Info->get(Inst.getOpcode()).mayStore();
}
- virtual bool isAArch64Exclusive(const MCInst &Inst) const {
+ virtual bool isAArch64ExclusiveLoad(const MCInst &Inst) const {
+ llvm_unreachable("not implemented");
+ return false;
+ }
+
+ virtual bool isAArch64ExclusiveStore(const MCInst &Inst) const {
+ llvm_unreachable("not implemented");
+ return false;
+ }
+
+ virtual bool isAArch64ExclusiveClear(const MCInst &Inst) const {
llvm_unreachable("not implemented");
return false;
}
@@ -1173,11 +1183,16 @@ public:
bool clearOffset(MCInst &Inst) const;
/// Return the label of \p Inst, if available.
- MCSymbol *getLabel(const MCInst &Inst) const;
+ MCSymbol *getInstLabel(const MCInst &Inst) const;
+
+ /// Set the label of \p Inst or return the existing label for the instruction.
+ /// This label will be emitted right before \p Inst is emitted to MCStreamer.
+ MCSymbol *getOrCreateInstLabel(MCInst &Inst, const Twine &Name,
+ MCContext *Ctx) const;
/// Set the label of \p Inst. This label will be emitted right before \p Inst
/// is emitted to MCStreamer.
- bool setLabel(MCInst &Inst, MCSymbol *Label) const;
+ void setInstLabel(MCInst &Inst, MCSymbol *Label) const;
/// Get instruction size specified via annotation.
std::optional<uint32_t> getSize(const MCInst &Inst) const;
diff --git a/bolt/include/bolt/Rewrite/DWARFRewriter.h b/bolt/include/bolt/Rewrite/DWARFRewriter.h
index ba6775f99ce6..20972f3d0b85 100644
--- a/bolt/include/bolt/Rewrite/DWARFRewriter.h
+++ b/bolt/include/bolt/Rewrite/DWARFRewriter.h
@@ -11,6 +11,7 @@
#include "bolt/Core/DIEBuilder.h"
#include "bolt/Core/DebugData.h"
+#include "bolt/Core/DebugNames.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/DIE.h"
#include "llvm/DWP/DWP.h"
@@ -140,8 +141,10 @@ private:
const std::list<DWARFUnit *> &CUs);
/// Finalize debug sections in the main binary.
- void finalizeDebugSections(DIEBuilder &DIEBlder, DIEStreamer &Streamer,
- raw_svector_ostream &ObjOS, CUOffsetMap &CUMap);
+ void finalizeDebugSections(DIEBuilder &DIEBlder,
+ DWARF5AcceleratorTable &DebugNamesTable,
+ DIEStreamer &Streamer, raw_svector_ostream &ObjOS,
+ CUOffsetMap &CUMap);
/// Patches the binary for DWARF address ranges (e.g. in functions and lexical
/// blocks) to be updated.
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index d544ece13a83..b29ebbbfa18c 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -1967,7 +1967,7 @@ void BinaryContext::printInstruction(raw_ostream &OS, const MCInst &Instruction,
OS << " # Offset: " << *Offset;
if (std::optional<uint32_t> Size = MIB->getSize(Instruction))
OS << " # Size: " << *Size;
- if (MCSymbol *Label = MIB->getLabel(Instruction))
+ if (MCSymbol *Label = MIB->getInstLabel(Instruction))
OS << " # Label: " << *Label;
MIB->printAnnotations(Instruction, OS);
diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp
index d4b668c1d7e7..97d19b75200f 100644
--- a/bolt/lib/Core/BinaryEmitter.cpp
+++ b/bolt/lib/Core/BinaryEmitter.cpp
@@ -489,7 +489,7 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, FunctionFragment &FF,
if (!EmitCodeOnly) {
// A symbol to be emitted before the instruction to mark its location.
- MCSymbol *InstrLabel = BC.MIB->getLabel(Instr);
+ MCSymbol *InstrLabel = BC.MIB->getInstLabel(Instr);
if (opts::UpdateDebugSections && BF.getDWARFUnit()) {
LastLocSeen = emitLineInfo(BF, Instr.getLoc(), LastLocSeen,
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index 54f2f9d972a4..00df42c11e22 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -1424,7 +1424,7 @@ add_instruction:
InstrMapType::iterator II = Instructions.find(Offset);
assert(II != Instructions.end() && "reference to non-existing instruction");
- BC.MIB->setLabel(II->second, Label);
+ BC.MIB->setInstLabel(II->second, Label);
}
// Reset symbolizer for the disassembler.
diff --git a/bolt/lib/Core/CMakeLists.txt b/bolt/lib/Core/CMakeLists.txt
index c913179ebcc5..441df9fe0846 100644
--- a/bolt/lib/Core/CMakeLists.txt
+++ b/bolt/lib/Core/CMakeLists.txt
@@ -20,6 +20,7 @@ add_llvm_library(LLVMBOLTCore
BinaryFunctionProfile.cpp
BinarySection.cpp
DebugData.cpp
+ DebugNames.cpp
DIEBuilder.cpp
DynoStats.cpp
Exceptions.cpp
diff --git a/bolt/lib/Core/DIEBuilder.cpp b/bolt/lib/Core/DIEBuilder.cpp
index e6104b81bf6c..42287978a857 100644
--- a/bolt/lib/Core/DIEBuilder.cpp
+++ b/bolt/lib/Core/DIEBuilder.cpp
@@ -19,20 +19,17 @@
#include "llvm/DebugInfo/DWARF/DWARFTypeUnit.h"
#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h"
-#include "llvm/ObjectYAML/DWARFYAML.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/LEB128.h"
-#include "llvm/Support/ThreadPool.h"
-#include "llvm/Support/YAMLTraits.h"
#include <algorithm>
#include <cstdint>
#include <memory>
#include <mutex>
-#include <string>
+#include <optional>
#include <unordered_map>
#include <utility>
#include <vector>
@@ -179,8 +176,10 @@ void DIEBuilder::constructFromUnit(DWARFUnit &DU) {
}
DIEBuilder::DIEBuilder(BinaryContext &BC, DWARFContext *DwarfContext,
+ DWARF5AcceleratorTable &DebugNamesTable,
DWARFUnit *SkeletonCU)
- : BC(BC), DwarfContext(DwarfContext), SkeletonCU(SkeletonCU) {}
+ : BC(BC), DwarfContext(DwarfContext), SkeletonCU(SkeletonCU),
+ DebugNamesTable(DebugNamesTable) {}
static unsigned int getCUNum(DWARFContext *DwarfContext, bool IsDWO) {
unsigned int CUNum = IsDWO ? DwarfContext->getNumDWOCompileUnits()
@@ -378,18 +377,20 @@ getUnitForOffset(DIEBuilder &Builder, DWARFContext &DWCtx,
return nullptr;
}
-uint32_t DIEBuilder::computeDIEOffset(const DWARFUnit &CU, DIE &Die,
- uint32_t &CurOffset) {
+uint32_t DIEBuilder::finalizeDIEs(DWARFUnit &CU, DIE &Die,
+ uint32_t &CurOffset) {
getState().DWARFDieAddressesParsed.erase(Die.getOffset());
uint32_t CurSize = 0;
Die.setOffset(CurOffset);
+ DebugNamesTable.addAccelTableEntry(
+ CU, Die, SkeletonCU ? SkeletonCU->getDWOId() : std::nullopt);
for (DIEValue &Val : Die.values())
CurSize += Val.sizeOf(CU.getFormParams());
CurSize += getULEB128Size(Die.getAbbrevNumber());
CurOffset += CurSize;
for (DIE &Child : Die.children()) {
- uint32_t ChildSize = computeDIEOffset(CU, Child, CurOffset);
+ uint32_t ChildSize = finalizeDIEs(CU, Child, CurOffset);
CurSize += ChildSize;
}
// for children end mark.
@@ -404,12 +405,12 @@ uint32_t DIEBuilder::computeDIEOffset(const DWARFUnit &CU, DIE &Die,
}
void DIEBuilder::finish() {
- auto computeOffset = [&](const DWARFUnit &CU,
- uint64_t &UnitStartOffset) -> void {
+ auto finalizeCU = [&](DWARFUnit &CU, uint64_t &UnitStartOffset) -> void {
DIE *UnitDIE = getUnitDIEbyUnit(CU);
uint32_t HeaderSize = CU.getHeaderSize();
uint32_t CurOffset = HeaderSize;
- computeDIEOffset(CU, *UnitDIE, CurOffset);
+ DebugNamesTable.setCurrentUnit(CU, UnitStartOffset);
+ finalizeDIEs(CU, *UnitDIE, CurOffset);
DWARFUnitInfo &CurUnitInfo = getUnitInfoByDwarfUnit(CU);
CurUnitInfo.UnitOffset = UnitStartOffset;
@@ -420,18 +421,18 @@ void DIEBuilder::finish() {
// It's processed first when CU is registered so will be at the begginnig of
// the vector.
uint64_t TypeUnitStartOffset = 0;
- for (const DWARFUnit *CU : getState().DUList) {
+ for (DWARFUnit *CU : getState().DUList) {
// We process DWARF$ types first.
if (!(CU->getVersion() < 5 && CU->isTypeUnit()))
break;
- computeOffset(*CU, TypeUnitStartOffset);
+ finalizeCU(*CU, TypeUnitStartOffset);
}
- for (const DWARFUnit *CU : getState().DUList) {
+ for (DWARFUnit *CU : getState().DUList) {
// Skipping DWARF4 types.
if (CU->getVersion() < 5 && CU->isTypeUnit())
continue;
- computeOffset(*CU, UnitSize);
+ finalizeCU(*CU, UnitSize);
}
if (opts::Verbosity >= 1) {
if (!getState().DWARFDieAddressesParsed.empty())
diff --git a/bolt/lib/Core/DebugNames.cpp b/bolt/lib/Core/DebugNames.cpp
new file mode 100644
index 000000000000..1a7792afbbd9
--- /dev/null
+++ b/bolt/lib/Core/DebugNames.cpp
@@ -0,0 +1,618 @@
+//===- bolt/Rewrite/DebugNames.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "bolt/Core/DebugNames.h"
+#include "bolt/Core/BinaryContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFExpression.h"
+#include "llvm/DebugInfo/DWARF/DWARFTypeUnit.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/LEB128.h"
+#include <cstdint>
+
+namespace llvm {
+namespace bolt {
+DWARF5AcceleratorTable::DWARF5AcceleratorTable(
+ const bool CreateDebugNames, BinaryContext &BC,
+ DebugStrWriter &MainBinaryStrWriter)
+ : BC(BC), MainBinaryStrWriter(MainBinaryStrWriter) {
+ NeedToCreate = CreateDebugNames || BC.getDebugNamesSection();
+ if (!NeedToCreate)
+ return;
+ FullTableBuffer = std::make_unique<DebugStrBufferVector>();
+ FullTableStream = std::make_unique<raw_svector_ostream>(*FullTableBuffer);
+ StrBuffer = std::make_unique<DebugStrBufferVector>();
+ StrStream = std::make_unique<raw_svector_ostream>(*StrBuffer);
+ EntriesBuffer = std::make_unique<DebugStrBufferVector>();
+ Entriestream = std::make_unique<raw_svector_ostream>(*EntriesBuffer);
+ AugStringBuffer = std::make_unique<DebugStrBufferVector>();
+ AugStringtream = std::make_unique<raw_svector_ostream>(*AugStringBuffer);
+
+ // Binary has split-dwarf CUs.
+  // Even though for non-skeleton-cu all names are in .debug_str.dwo section,
+ // for the .debug_names contributions they are in .debug_str section.
+ if (BC.getNumDWOCUs()) {
+ DataExtractor StrData(BC.DwCtx->getDWARFObj().getStrSection(),
+ BC.DwCtx->isLittleEndian(), 0);
+ uint64_t Offset = 0;
+ uint64_t StrOffset = 0;
+ while (StrData.isValidOffset(Offset)) {
+ Error Err = Error::success();
+ const char *CStr = StrData.getCStr(&Offset, &Err);
+ if (Err) {
+ NeedToCreate = false;
+ BC.errs() << "BOLT-WARNING: [internal-dwarf-error]: Could not extract "
+ "string from .debug_str section at offset: "
+ << Twine::utohexstr(StrOffset) << ".\n";
+ return;
+ }
+ auto R = StrCacheToOffsetMap.try_emplace(
+ llvm::hash_value(llvm::StringRef(CStr)), StrOffset);
+ if (!R.second)
+ BC.errs()
+ << "BOLT-WARNING: [internal-dwarf-error]: collision occured on "
+ << CStr << " at offset : 0x" << Twine::utohexstr(StrOffset)
+ << ". Previous string offset is: 0x"
+ << Twine::utohexstr(R.first->second) << ".\n";
+ StrOffset = Offset;
+ }
+ }
+}
+
+void DWARF5AcceleratorTable::setCurrentUnit(DWARFUnit &Unit,
+ const uint64_t UnitStartOffset) {
+ CurrentUnit = nullptr;
+ CurrentUnitOffset = UnitStartOffset;
+ std::optional<uint64_t> DWOID = Unit.getDWOId();
+  // We process skeleton CUs after their DWO Units.
+ // Patching offset in CU list to correct one.
+ if (!Unit.isDWOUnit() && DWOID) {
+ auto Iter = CUOffsetsToPatch.find(*DWOID);
+ // Check in case no entries were added from non skeleton DWO section.
+ if (Iter != CUOffsetsToPatch.end())
+ CUList[Iter->second] = UnitStartOffset;
+ }
+}
+
+void DWARF5AcceleratorTable::addUnit(DWARFUnit &Unit,
+ const std::optional<uint64_t> &DWOID) {
+ constexpr uint32_t BADCUOFFSET = 0xBADBAD;
+ StrSection = Unit.getStringSection();
+ if (Unit.isTypeUnit()) {
+ if (DWOID) {
+      // We are adding an entry for a DWO TU. The DWO CU might not have any
+ // so need to add it to the list pre-emptively.
+ auto Iter = CUOffsetsToPatch.insert({*DWOID, CUList.size()});
+ if (Iter.second)
+ CUList.push_back(BADCUOFFSET);
+ ForeignTUList.push_back(cast<DWARFTypeUnit>(&Unit)->getTypeHash());
+ } else {
+ LocalTUList.push_back(CurrentUnitOffset);
+ }
+ } else {
+ if (DWOID) {
+ // This is a path for split dwarf without type units.
+ // We process DWO Units before Skeleton CU. So at this point we don't know
+      // the offset of Skeleton CU. Adding CUList index to a map to patch later
+ // with the correct offset.
+ auto Iter = CUOffsetsToPatch.insert({*DWOID, CUList.size()});
+ if (Iter.second)
+ CUList.push_back(BADCUOFFSET);
+ } else {
+ CUList.push_back(CurrentUnitOffset);
+ }
+ }
+}
+
+// Returns true if DW_TAG_variable should be included in .debug_names based on
+// section 6.1.1.1 of the DWARF5 spec.
+static bool shouldIncludeVariable(const DWARFUnit &Unit, const DIE &Die) {
+ if (Die.findAttribute(dwarf::Attribute::DW_AT_declaration))
+ return false;
+ const DIEValue LocAttrInfo =
+ Die.findAttribute(dwarf::Attribute::DW_AT_location);
+ if (!LocAttrInfo)
+ return false;
+ if (!(doesFormBelongToClass(LocAttrInfo.getForm(), DWARFFormValue::FC_Exprloc,
+ Unit.getVersion()) ||
+ doesFormBelongToClass(LocAttrInfo.getForm(), DWARFFormValue::FC_Block,
+ Unit.getVersion())))
+ return false;
+ std::vector<uint8_t> Sblock;
+ auto constructVect =
+ [&](const DIEValueList::const_value_range &Iter) -> void {
+ for (const DIEValue &Val : Iter)
+ Sblock.push_back(Val.getDIEInteger().getValue());
+ };
+ if (doesFormBelongToClass(LocAttrInfo.getForm(), DWARFFormValue::FC_Exprloc,
+ Unit.getVersion()))
+ constructVect(LocAttrInfo.getDIELoc().values());
+ else
+ constructVect(LocAttrInfo.getDIEBlock().values());
+ ArrayRef<uint8_t> Expr = ArrayRef<uint8_t>(Sblock);
+ DataExtractor Data(StringRef((const char *)Expr.data(), Expr.size()),
+ Unit.getContext().isLittleEndian(), 0);
+ DWARFExpression LocExpr(Data, Unit.getAddressByteSize(),
+ Unit.getFormParams().Format);
+ for (const DWARFExpression::Operation &Expr : LocExpr)
+ if (Expr.getCode() == dwarf::DW_OP_addrx ||
+ Expr.getCode() == dwarf::DW_OP_form_tls_address)
+ return true;
+ return false;
+}
+
+/// Returns name offset in String Offset section.
+static uint64_t getNameOffset(BinaryContext &BC, DWARFUnit &Unit,
+ const uint64_t Index) {
+ const DWARFSection &StrOffsetsSection = Unit.getStringOffsetSection();
+ const std::optional<StrOffsetsContributionDescriptor> &Contr =
+ Unit.getStringOffsetsTableContribution();
+ if (!Contr) {
+ BC.errs() << "BOLT-WARNING: [internal-dwarf-warning]: Could not get "
+ "StringOffsetsTableContribution for unit at offset: "
+ << Twine::utohexstr(Unit.getOffset()) << ".\n";
+ return 0;
+ }
+
+ const uint8_t DwarfOffsetByteSize = Contr->getDwarfOffsetByteSize();
+ return support::endian::read32le(StrOffsetsSection.Data.data() + Contr->Base +
+ Index * DwarfOffsetByteSize);
+}
+
+void DWARF5AcceleratorTable::addAccelTableEntry(
+ DWARFUnit &Unit, const DIE &Die, const std::optional<uint64_t> &DWOID) {
+ if (Unit.getVersion() < 5 || !NeedToCreate)
+ return;
+ std::string NameToUse = "";
+ auto canProcess = [&](const DIE &Die) -> bool {
+ switch (Die.getTag()) {
+ case dwarf::DW_TAG_base_type:
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_enumeration_type:
+ case dwarf::DW_TAG_imported_declaration:
+ case dwarf::DW_TAG_pointer_type:
+ case dwarf::DW_TAG_structure_type:
+ case dwarf::DW_TAG_typedef:
+ case dwarf::DW_TAG_unspecified_type:
+ if (Die.findAttribute(dwarf::Attribute::DW_AT_name))
+ return true;
+ return false;
+ case dwarf::DW_TAG_namespace:
+      // According to the DWARF5 spec, namespaces without DW_AT_name need to have
+ // "(anonymous namespace)"
+ if (!Die.findAttribute(dwarf::Attribute::DW_AT_name))
+ NameToUse = "(anonymous namespace)";
+ return true;
+ case dwarf::DW_TAG_inlined_subroutine:
+ case dwarf::DW_TAG_label:
+ case dwarf::DW_TAG_subprogram:
+ if (Die.findAttribute(dwarf::Attribute::DW_AT_low_pc) ||
+ Die.findAttribute(dwarf::Attribute::DW_AT_high_pc) ||
+ Die.findAttribute(dwarf::Attribute::DW_AT_ranges) ||
+ Die.findAttribute(dwarf::Attribute::DW_AT_entry_pc))
+ return true;
+ return false;
+ case dwarf::DW_TAG_variable:
+ return shouldIncludeVariable(Unit, Die);
+ default:
+ break;
+ }
+ return false;
+ };
+
+ auto getUnitID = [&](const DWARFUnit &Unit, bool &IsTU,
+ uint32_t &DieTag) -> uint32_t {
+ IsTU = Unit.isTypeUnit();
+ DieTag = Die.getTag();
+ if (IsTU) {
+ if (DWOID)
+ return ForeignTUList.size() - 1;
+ return LocalTUList.size() - 1;
+ }
+ return CUList.size() - 1;
+ };
+
+ if (!canProcess(Die))
+ return;
+
+  // Adds a Unit to either CU, LocalTU or ForeignTU list the first time we
+ // encounter it.
+ // Invoking it here so that we don't add Units that don't have any entries.
+ if (&Unit != CurrentUnit) {
+ CurrentUnit = &Unit;
+ addUnit(Unit, DWOID);
+ }
+
+ auto addEntry = [&](DIEValue ValName) -> void {
+ if ((!ValName || ValName.getForm() == dwarf::DW_FORM_string) &&
+ NameToUse.empty())
+ return;
+ std::string Name = "";
+ uint64_t NameIndexOffset = 0;
+ if (NameToUse.empty()) {
+ NameIndexOffset = ValName.getDIEInteger().getValue();
+ if (ValName.getForm() != dwarf::DW_FORM_strp)
+ NameIndexOffset = getNameOffset(BC, Unit, NameIndexOffset);
+      // Relies on strings being '\0'-terminated.
+ Name = std::string(&StrSection.data()[NameIndexOffset]);
+ } else {
+ Name = NameToUse;
+ }
+ auto &It = Entries[Name];
+ if (It.Values.empty()) {
+ if (DWOID && NameToUse.empty()) {
+ // For DWO Unit the offset is in the .debug_str.dwo section.
+ // Need to find offset for the name in the .debug_str section.
+ llvm::hash_code Hash = llvm::hash_value(llvm::StringRef(Name));
+ auto ItCache = StrCacheToOffsetMap.find(Hash);
+ if (ItCache == StrCacheToOffsetMap.end())
+ NameIndexOffset = MainBinaryStrWriter.addString(Name);
+ else
+ NameIndexOffset = ItCache->second;
+ }
+ if (!NameToUse.empty())
+ NameIndexOffset = MainBinaryStrWriter.addString(Name);
+ It.StrOffset = NameIndexOffset;
+      // This is the same hash function used in DWARF5AccelTableData.
+ It.HashValue = caseFoldingDjbHash(Name);
+ }
+
+ bool IsTU = false;
+ uint32_t DieTag = 0;
+ uint32_t UnitID = getUnitID(Unit, IsTU, DieTag);
+ std::optional<unsigned> SecondIndex = std::nullopt;
+ if (IsTU && DWOID) {
+ auto Iter = CUOffsetsToPatch.find(*DWOID);
+ if (Iter == CUOffsetsToPatch.end())
+ BC.errs() << "BOLT-WARNING: [internal-dwarf-warning]: Could not find "
+ "DWO ID in CU offsets for second Unit Index "
+ << Name << ". For DIE at offset: "
+ << Twine::utohexstr(CurrentUnitOffset + Die.getOffset())
+ << ".\n";
+ SecondIndex = Iter->second;
+ }
+ It.Values.push_back(new (Allocator) BOLTDWARF5AccelTableData(
+ Die.getOffset(), std::nullopt, DieTag, UnitID, IsTU, SecondIndex));
+ };
+
+ addEntry(Die.findAttribute(dwarf::Attribute::DW_AT_name));
+ addEntry(Die.findAttribute(dwarf::Attribute::DW_AT_linkage_name));
+ return;
+}
+
+/// Algorithm from llvm implementation.
+void DWARF5AcceleratorTable::computeBucketCount() {
+ // First get the number of unique hashes.
+ std::vector<uint32_t> Uniques;
+ Uniques.reserve(Entries.size());
+ for (const auto &E : Entries)
+ Uniques.push_back(E.second.HashValue);
+ array_pod_sort(Uniques.begin(), Uniques.end());
+ std::vector<uint32_t>::iterator P =
+ std::unique(Uniques.begin(), Uniques.end());
+
+ UniqueHashCount = std::distance(Uniques.begin(), P);
+
+ if (UniqueHashCount > 1024)
+ BucketCount = UniqueHashCount / 4;
+ else if (UniqueHashCount > 16)
+ BucketCount = UniqueHashCount / 2;
+ else
+ BucketCount = std::max<uint32_t>(UniqueHashCount, 1);
+}
+
+/// Bucket code as in: AccelTableBase::finalize()
+void DWARF5AcceleratorTable::finalize() {
+ if (!NeedToCreate)
+ return;
+ // Figure out how many buckets we need, then compute the bucket contents and
+ // the final ordering. The hashes and offsets can be emitted by walking these
+ // data structures.
+ computeBucketCount();
+
+ // Compute bucket contents and final ordering.
+ Buckets.resize(BucketCount);
+ for (auto &E : Entries) {
+ uint32_t Bucket = E.second.HashValue % BucketCount;
+ Buckets[Bucket].push_back(&E.second);
+ }
+
+ // Sort the contents of the buckets by hash value so that hash collisions end
+ // up together. Stable sort makes testing easier and doesn't cost much more.
+ for (HashList &Bucket : Buckets) {
+ llvm::stable_sort(Bucket, [](const HashData *LHS, const HashData *RHS) {
+ return LHS->HashValue < RHS->HashValue;
+ });
+ for (HashData *H : Bucket)
+ llvm::stable_sort(H->Values, [](const BOLTDWARF5AccelTableData *LHS,
+ const BOLTDWARF5AccelTableData *RHS) {
+ return LHS->getDieOffset() < RHS->getDieOffset();
+ });
+ }
+
+ CUIndexForm = DIEInteger::BestForm(/*IsSigned*/ false, CUList.size() - 1);
+ TUIndexForm = DIEInteger::BestForm(
+ /*IsSigned*/ false, LocalTUList.size() + ForeignTUList.size() - 1);
+ const dwarf::FormParams FormParams{5, 4, dwarf::DwarfFormat::DWARF32, false};
+ CUIndexEncodingSize = *dwarf::getFixedFormByteSize(CUIndexForm, FormParams);
+ TUIndexEncodingSize = *dwarf::getFixedFormByteSize(TUIndexForm, FormParams);
+}
+
+std::optional<DWARF5AccelTable::UnitIndexAndEncoding>
+DWARF5AcceleratorTable::getIndexForEntry(
+ const BOLTDWARF5AccelTableData &Value) const {
+ if (Value.isTU())
+ return {{Value.getUnitID(), {dwarf::DW_IDX_type_unit, TUIndexForm}}};
+ if (CUList.size() > 1)
+ return {{Value.getUnitID(), {dwarf::DW_IDX_compile_unit, CUIndexForm}}};
+ return std::nullopt;
+}
+
+std::optional<DWARF5AccelTable::UnitIndexAndEncoding>
+DWARF5AcceleratorTable::getSecondIndexForEntry(
+ const BOLTDWARF5AccelTableData &Value) const {
+ if (Value.isTU() && CUList.size() > 1 && Value.getSecondUnitID())
+ return {
+ {*Value.getSecondUnitID(), {dwarf::DW_IDX_compile_unit, CUIndexForm}}};
+ return std::nullopt;
+}
+
+void DWARF5AcceleratorTable::populateAbbrevsMap() {
+ for (auto &Bucket : getBuckets()) {
+ for (DWARF5AcceleratorTable::HashData *Hash : Bucket) {
+ for (BOLTDWARF5AccelTableData *Value : Hash->Values) {
+ const std::optional<DWARF5AccelTable::UnitIndexAndEncoding> EntryRet =
+ getIndexForEntry(*Value);
+ // For entries that need to refer to the foreign type units and to
+ // the CU.
+ const std::optional<DWARF5AccelTable::UnitIndexAndEncoding>
+ SecondEntryRet = getSecondIndexForEntry(*Value);
+ DebugNamesAbbrev Abbrev(Value->getDieTag());
+ if (EntryRet)
+ Abbrev.addAttribute(EntryRet->Encoding);
+ if (SecondEntryRet)
+ Abbrev.addAttribute(SecondEntryRet->Encoding);
+ Abbrev.addAttribute({dwarf::DW_IDX_die_offset, dwarf::DW_FORM_ref4});
+ FoldingSetNodeID ID;
+ Abbrev.Profile(ID);
+ void *InsertPos;
+ if (DebugNamesAbbrev *Existing =
+ AbbreviationsSet.FindNodeOrInsertPos(ID, InsertPos)) {
+ Value->setAbbrevNumber(Existing->getNumber());
+ continue;
+ }
+ DebugNamesAbbrev *NewAbbrev =
+ new (Alloc) DebugNamesAbbrev(std::move(Abbrev));
+ AbbreviationsVector.push_back(NewAbbrev);
+ NewAbbrev->setNumber(AbbreviationsVector.size());
+ AbbreviationsSet.InsertNode(NewAbbrev, InsertPos);
+ Value->setAbbrevNumber(NewAbbrev->getNumber());
+ }
+ }
+ }
+}
+
+void DWARF5AcceleratorTable::writeEntry(const BOLTDWARF5AccelTableData &Entry) {
+ const std::optional<DWARF5AccelTable::UnitIndexAndEncoding> EntryRet =
+ getIndexForEntry(Entry);
+  // For foreign type (FTU) units that need to refer to the FTU and to the CU.
+ const std::optional<DWARF5AccelTable::UnitIndexAndEncoding> SecondEntryRet =
+ getSecondIndexForEntry(Entry);
+ const unsigned AbbrevIndex = Entry.getAbbrevNumber() - 1;
+ assert(AbbrevIndex < AbbreviationsVector.size() &&
+ "Entry abbrev index is outside of abbreviations vector range.");
+ const DebugNamesAbbrev *Abbrev = AbbreviationsVector[AbbrevIndex];
+ encodeULEB128(Entry.getAbbrevNumber(), *Entriestream);
+ auto writeIndex = [&](uint32_t Index, uint32_t IndexSize) -> void {
+ switch (IndexSize) {
+ default:
+ llvm_unreachable("Unsupported Index Size!");
+ break;
+ case 1:
+ support::endian::write(*Entriestream, static_cast<uint8_t>(Index),
+ llvm::endianness::little);
+ break;
+ case 2:
+ support::endian::write(*Entriestream, static_cast<uint16_t>(Index),
+ llvm::endianness::little);
+ break;
+ case 4:
+ support::endian::write(*Entriestream, static_cast<uint32_t>(Index),
+ llvm::endianness::little);
+ break;
+ };
+ };
+
+ for (const DebugNamesAbbrev::AttributeEncoding &AttrEnc :
+ Abbrev->getAttributes()) {
+ switch (AttrEnc.Index) {
+ default: {
+ llvm_unreachable("Unexpected index attribute!");
+ break;
+ }
+ case dwarf::DW_IDX_compile_unit: {
+ const unsigned CUIndex =
+ SecondEntryRet ? SecondEntryRet->Index : EntryRet->Index;
+ writeIndex(CUIndex, CUIndexEncodingSize);
+ break;
+ }
+ case dwarf::DW_IDX_type_unit: {
+ writeIndex(EntryRet->Index, TUIndexEncodingSize);
+ break;
+ }
+ case dwarf::DW_IDX_die_offset: {
+ assert(AttrEnc.Form == dwarf::DW_FORM_ref4);
+ support::endian::write(*Entriestream,
+ static_cast<uint32_t>(Entry.getDieOffset()),
+ llvm::endianness::little);
+ break;
+ }
+ }
+ }
+}
+
+void DWARF5AcceleratorTable::writeEntries() {
+ for (auto &Bucket : getBuckets()) {
+ for (DWARF5AcceleratorTable::HashData *Hash : Bucket) {
+ Hash->EntryOffset = EntriesBuffer->size();
+ for (const BOLTDWARF5AccelTableData *Value : Hash->Values) {
+ writeEntry(*Value);
+ }
+ support::endian::write(*Entriestream, static_cast<uint8_t>(0),
+ llvm::endianness::little);
+ }
+ }
+}
+
+void DWARF5AcceleratorTable::writeAugmentationString() {
+ // String needs to be multiple of 4 bytes.
+ *AugStringtream << "BOLT";
+ AugmentationStringSize = AugStringBuffer->size();
+}
+
+/// Calculates size of .debug_names header without Length field.
+static constexpr uint32_t getDebugNamesHeaderSize() {
+ constexpr uint16_t VersionLength = sizeof(uint16_t);
+ constexpr uint16_t PaddingLength = sizeof(uint16_t);
+ constexpr uint32_t CompUnitCountLength = sizeof(uint32_t);
+ constexpr uint32_t LocalTypeUnitCountLength = sizeof(uint32_t);
+ constexpr uint32_t ForeignTypeUnitCountLength = sizeof(uint32_t);
+ constexpr uint32_t BucketCountLength = sizeof(uint32_t);
+ constexpr uint32_t NameCountLength = sizeof(uint32_t);
+ constexpr uint32_t AbbrevTableSizeLength = sizeof(uint32_t);
+ constexpr uint32_t AugmentationStringSizeLenght = sizeof(uint32_t);
+ return VersionLength + PaddingLength + CompUnitCountLength +
+ LocalTypeUnitCountLength + ForeignTypeUnitCountLength +
+ BucketCountLength + NameCountLength + AbbrevTableSizeLength +
+ AugmentationStringSizeLenght;
+}
+
+void DWARF5AcceleratorTable::emitHeader() const {
+ constexpr uint32_t HeaderSize = getDebugNamesHeaderSize();
+ // Header Length
+ support::endian::write(*FullTableStream,
+ static_cast<uint32_t>(HeaderSize + StrBuffer->size() +
+ AugmentationStringSize),
+ llvm::endianness::little);
+ // Version
+ support::endian::write(*FullTableStream, static_cast<uint16_t>(5),
+ llvm::endianness::little);
+ // Padding
+ support::endian::write(*FullTableStream, static_cast<uint16_t>(0),
+ llvm::endianness::little);
+ // Compilation Unit Count
+ support::endian::write(*FullTableStream, static_cast<uint32_t>(CUList.size()),
+ llvm::endianness::little);
+ // Local Type Unit Count
+ support::endian::write(*FullTableStream,
+ static_cast<uint32_t>(LocalTUList.size()),
+ llvm::endianness::little);
+ // Foreign Type Unit Count
+ support::endian::write(*FullTableStream,
+ static_cast<uint32_t>(ForeignTUList.size()),
+ llvm::endianness::little);
+ // Bucket Count
+ support::endian::write(*FullTableStream, static_cast<uint32_t>(BucketCount),
+ llvm::endianness::little);
+ // Name Count
+ support::endian::write(*FullTableStream,
+ static_cast<uint32_t>(Entries.size()),
+ llvm::endianness::little);
+ // Abbrev Table Size
+ support::endian::write(*FullTableStream,
+ static_cast<uint32_t>(AbbrevTableSize),
+ llvm::endianness::little);
+ // Augmentation String Size
+ support::endian::write(*FullTableStream,
+ static_cast<uint32_t>(AugmentationStringSize),
+ llvm::endianness::little);
+
+ emitAugmentationString();
+ FullTableStream->write(StrBuffer->data(), StrBuffer->size());
+}
+
+void DWARF5AcceleratorTable::emitCUList() const {
+ for (const uint32_t CUID : CUList)
+ support::endian::write(*StrStream, CUID, llvm::endianness::little);
+}
+void DWARF5AcceleratorTable::emitTUList() const {
+ for (const uint32_t TUID : LocalTUList)
+ support::endian::write(*StrStream, TUID, llvm::endianness::little);
+
+ for (const uint64_t TUID : ForeignTUList)
+ support::endian::write(*StrStream, TUID, llvm::endianness::little);
+}
+void DWARF5AcceleratorTable::emitBuckets() const {
+ uint32_t Index = 1;
+ for (const auto &Bucket : enumerate(getBuckets())) {
+ const uint32_t TempIndex = Bucket.value().empty() ? 0 : Index;
+ support::endian::write(*StrStream, TempIndex, llvm::endianness::little);
+ Index += Bucket.value().size();
+ }
+}
+void DWARF5AcceleratorTable::emitHashes() const {
+ for (const auto &Bucket : getBuckets()) {
+ for (const DWARF5AcceleratorTable::HashData *Hash : Bucket)
+ support::endian::write(*StrStream, Hash->HashValue,
+ llvm::endianness::little);
+ }
+}
+void DWARF5AcceleratorTable::emitStringOffsets() const {
+ for (const auto &Bucket : getBuckets()) {
+ for (const DWARF5AcceleratorTable::HashData *Hash : Bucket)
+ support::endian::write(*StrStream, static_cast<uint32_t>(Hash->StrOffset),
+ llvm::endianness::little);
+ }
+}
+void DWARF5AcceleratorTable::emitOffsets() const {
+ for (const auto &Bucket : getBuckets()) {
+ for (const DWARF5AcceleratorTable::HashData *Hash : Bucket)
+ support::endian::write(*StrStream,
+ static_cast<uint32_t>(Hash->EntryOffset),
+ llvm::endianness::little);
+ }
+}
+void DWARF5AcceleratorTable::emitAbbrevs() {
+ const uint32_t AbbrevTableStart = StrBuffer->size();
+ for (const auto *Abbrev : AbbreviationsVector) {
+ encodeULEB128(Abbrev->getNumber(), *StrStream);
+ encodeULEB128(Abbrev->getDieTag(), *StrStream);
+ for (const auto &AttrEnc : Abbrev->getAttributes()) {
+ encodeULEB128(AttrEnc.Index, *StrStream);
+ encodeULEB128(AttrEnc.Form, *StrStream);
+ }
+ encodeULEB128(0, *StrStream);
+ encodeULEB128(0, *StrStream);
+ }
+ encodeULEB128(0, *StrStream);
+ AbbrevTableSize = StrBuffer->size() - AbbrevTableStart;
+}
+void DWARF5AcceleratorTable::emitData() {
+ StrStream->write(EntriesBuffer->data(), EntriesBuffer->size());
+}
+void DWARF5AcceleratorTable::emitAugmentationString() const {
+ FullTableStream->write(AugStringBuffer->data(), AugStringBuffer->size());
+}
+void DWARF5AcceleratorTable::emitAccelTable() {
+ if (!NeedToCreate)
+ return;
+ finalize();
+ populateAbbrevsMap();
+ writeEntries();
+ writeAugmentationString();
+ emitCUList();
+ emitTUList();
+ emitBuckets();
+ emitHashes();
+ emitStringOffsets();
+ emitOffsets();
+ emitAbbrevs();
+ emitData();
+ emitHeader();
+}
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/lib/Core/Exceptions.cpp b/bolt/lib/Core/Exceptions.cpp
index 54618aeb95cc..82bddf76d5b8 100644
--- a/bolt/lib/Core/Exceptions.cpp
+++ b/bolt/lib/Core/Exceptions.cpp
@@ -408,12 +408,11 @@ void BinaryFunction::updateEHRanges() {
// Same symbol is used for the beginning and the end of the range.
MCSymbol *EHSymbol;
- if (MCSymbol *InstrLabel = BC.MIB->getLabel(Instr)) {
+ if (MCSymbol *InstrLabel = BC.MIB->getInstLabel(Instr)) {
EHSymbol = InstrLabel;
} else {
std::unique_lock<llvm::sys::RWMutex> Lock(BC.CtxMutex);
- EHSymbol = BC.Ctx->createNamedTempSymbol("EH");
- BC.MIB->setLabel(Instr, EHSymbol);
+ EHSymbol = BC.MIB->getOrCreateInstLabel(Instr, "EH", BC.Ctx.get());
}
// At this point we could be in one of the following states:
diff --git a/bolt/lib/Core/MCPlusBuilder.cpp b/bolt/lib/Core/MCPlusBuilder.cpp
index 44e5f88d8950..bd9bd0c45922 100644
--- a/bolt/lib/Core/MCPlusBuilder.cpp
+++ b/bolt/lib/Core/MCPlusBuilder.cpp
@@ -12,6 +12,7 @@
#include "bolt/Core/MCPlusBuilder.h"
#include "bolt/Core/MCPlus.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrDesc.h"
@@ -266,17 +267,29 @@ bool MCPlusBuilder::clearOffset(MCInst &Inst) const {
return true;
}
-MCSymbol *MCPlusBuilder::getLabel(const MCInst &Inst) const {
+MCSymbol *MCPlusBuilder::getInstLabel(const MCInst &Inst) const {
if (std::optional<int64_t> Label =
getAnnotationOpValue(Inst, MCAnnotation::kLabel))
return reinterpret_cast<MCSymbol *>(*Label);
return nullptr;
}
-bool MCPlusBuilder::setLabel(MCInst &Inst, MCSymbol *Label) const {
+MCSymbol *MCPlusBuilder::getOrCreateInstLabel(MCInst &Inst, const Twine &Name,
+ MCContext *Ctx) const {
+ MCSymbol *Label = getInstLabel(Inst);
+ if (Label)
+ return Label;
+
+ Label = Ctx->createNamedTempSymbol(Name);
+ setAnnotationOpValue(Inst, MCAnnotation::kLabel,
+ reinterpret_cast<int64_t>(Label));
+ return Label;
+}
+
+void MCPlusBuilder::setInstLabel(MCInst &Inst, MCSymbol *Label) const {
+ assert(!getInstLabel(Inst) && "Instruction already has assigned label.");
setAnnotationOpValue(Inst, MCAnnotation::kLabel,
reinterpret_cast<int64_t>(Label));
- return true;
}
std::optional<uint32_t> MCPlusBuilder::getSize(const MCInst &Inst) const {
diff --git a/bolt/lib/Passes/Instrumentation.cpp b/bolt/lib/Passes/Instrumentation.cpp
index 760ca84b4ef1..68acff7e6a86 100644
--- a/bolt/lib/Passes/Instrumentation.cpp
+++ b/bolt/lib/Passes/Instrumentation.cpp
@@ -17,7 +17,9 @@
#include "bolt/Utils/Utils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/RWMutex.h"
+#include <queue>
#include <stack>
+#include <unordered_set>
#define DEBUG_TYPE "bolt-instrumentation"
@@ -86,21 +88,89 @@ cl::opt<bool> InstrumentCalls("instrument-calls",
namespace llvm {
namespace bolt {
-static bool hasAArch64ExclusiveMemop(BinaryFunction &Function) {
+static bool hasAArch64ExclusiveMemop(
+ BinaryFunction &Function,
+ std::unordered_set<const BinaryBasicBlock *> &BBToSkip) {
// FIXME ARMv8-a architecture reference manual says that software must avoid
// having any explicit memory accesses between exclusive load and associated
- // store instruction. So for now skip instrumentation for functions that have
- // these instructions, since it might lead to runtime deadlock.
+ // store instruction. So for now skip instrumentation for basic blocks that
+ // have these instructions, since it might lead to runtime deadlock.
BinaryContext &BC = Function.getBinaryContext();
- for (const BinaryBasicBlock &BB : Function)
- for (const MCInst &Inst : BB)
- if (BC.MIB->isAArch64Exclusive(Inst)) {
- if (opts::Verbosity >= 1)
- BC.outs() << "BOLT-INSTRUMENTER: Function " << Function
- << " has exclusive instructions, skip instrumentation\n";
+ std::queue<std::pair<BinaryBasicBlock *, bool>> BBQueue; // {BB, isLoad}
+ std::unordered_set<BinaryBasicBlock *> Visited;
+
+ if (Function.getLayout().block_begin() == Function.getLayout().block_end())
+ return 0;
+
+ BinaryBasicBlock *BBfirst = *Function.getLayout().block_begin();
+ BBQueue.push({BBfirst, false});
+
+ while (!BBQueue.empty()) {
+ BinaryBasicBlock *BB = BBQueue.front().first;
+ bool IsLoad = BBQueue.front().second;
+ BBQueue.pop();
+ if (Visited.find(BB) != Visited.end())
+ continue;
+ Visited.insert(BB);
+
+ for (const MCInst &Inst : *BB) {
+ // Two loads one after another - skip whole function
+ if (BC.MIB->isAArch64ExclusiveLoad(Inst) && IsLoad) {
+ if (opts::Verbosity >= 2) {
+ outs() << "BOLT-INSTRUMENTER: function " << Function.getPrintName()
+ << " has two exclusive loads. Ignoring the function.\n";
+ }
return true;
}
+ if (BC.MIB->isAArch64ExclusiveLoad(Inst))
+ IsLoad = true;
+
+ if (IsLoad && BBToSkip.find(BB) == BBToSkip.end()) {
+ BBToSkip.insert(BB);
+ if (opts::Verbosity >= 2) {
+ outs() << "BOLT-INSTRUMENTER: skip BB " << BB->getName()
+ << " due to exclusive instruction in function "
+ << Function.getPrintName() << "\n";
+ }
+ }
+
+ if (!IsLoad && BC.MIB->isAArch64ExclusiveStore(Inst)) {
+ if (opts::Verbosity >= 2) {
+ outs() << "BOLT-INSTRUMENTER: function " << Function.getPrintName()
+ << " has exclusive store without corresponding load. Ignoring "
+ "the function.\n";
+ }
+ return true;
+ }
+
+ if (IsLoad && (BC.MIB->isAArch64ExclusiveStore(Inst) ||
+ BC.MIB->isAArch64ExclusiveClear(Inst)))
+ IsLoad = false;
+ }
+
+ if (IsLoad && BB->succ_size() == 0) {
+ if (opts::Verbosity >= 2) {
+ outs()
+ << "BOLT-INSTRUMENTER: function " << Function.getPrintName()
+ << " has exclusive load in trailing BB. Ignoring the function.\n";
+ }
+ return true;
+ }
+
+ for (BinaryBasicBlock *BBS : BB->successors())
+ BBQueue.push({BBS, IsLoad});
+ }
+
+ if (BBToSkip.size() == Visited.size()) {
+ if (opts::Verbosity >= 2) {
+ outs() << "BOLT-INSTRUMENTER: all BBs are marked with true. Ignoring the "
+ "function "
+ << Function.getPrintName() << "\n";
+ }
+ return true;
+ }
+
return false;
}
@@ -307,7 +377,8 @@ void Instrumentation::instrumentFunction(BinaryFunction &Function,
if (BC.isMachO() && Function.hasName("___GLOBAL_init_65535/1"))
return;
- if (BC.isAArch64() && hasAArch64ExclusiveMemop(Function))
+ std::unordered_set<const BinaryBasicBlock *> BBToSkip;
+ if (BC.isAArch64() && hasAArch64ExclusiveMemop(Function, BBToSkip))
return;
SplitWorklistTy SplitWorklist;
@@ -389,6 +460,11 @@ void Instrumentation::instrumentFunction(BinaryFunction &Function,
for (auto BBI = Function.begin(), BBE = Function.end(); BBI != BBE; ++BBI) {
BinaryBasicBlock &BB = *BBI;
+
+ // Skip BBs with exclusive load/stores
+ if (BBToSkip.find(&BB) != BBToSkip.end())
+ continue;
+
bool HasUnconditionalBranch = false;
bool HasJumpTable = false;
bool IsInvokeBlock = InvokeBlocks.count(&BB) > 0;
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index 849c363730eb..ca9d24245ceb 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -347,6 +347,12 @@ static cl::opt<bool>
"multiple non-relocatable dwarf object files (dwo)."),
cl::init(false), cl::cat(BoltCategory));
+static cl::opt<bool> CreateDebugNames(
+ "create-debug-names-section",
+ cl::desc("Creates .debug_names section, if the input binary doesn't have "
+ "it already, for DWARF5 CU/TUs."),
+ cl::init(false), cl::cat(BoltCategory));
+
static cl::opt<bool>
DebugSkeletonCu("debug-skeleton-cu",
cl::desc("prints out offsetrs for abbrev and debu_info of "
@@ -692,6 +698,8 @@ void DWARFRewriter::updateDebugInfo() {
return ObjectName;
};
+ DWARF5AcceleratorTable DebugNamesTable(opts::CreateDebugNames, BC,
+ *StrWriter);
DWPState State;
if (opts::WriteDWP)
initDWPState(State);
@@ -709,7 +717,8 @@ void DWARFRewriter::updateDebugInfo() {
: LegacyRangesSectionWriter.get();
// Skipping CUs that failed to load.
if (SplitCU) {
- DIEBuilder DWODIEBuilder(BC, &(*SplitCU)->getContext(), Unit);
+ DIEBuilder DWODIEBuilder(BC, &(*SplitCU)->getContext(), DebugNamesTable,
+ Unit);
DWODIEBuilder.buildDWOUnit(**SplitCU);
std::string DWOName = updateDWONameCompDir(
*Unit, *DIEBlder, *DIEBlder->getUnitDIEbyUnit(*Unit));
@@ -749,7 +758,7 @@ void DWARFRewriter::updateDebugInfo() {
AddrWriter->update(*DIEBlder, *Unit);
};
- DIEBuilder DIEBlder(BC, BC.DwCtx.get());
+ DIEBuilder DIEBlder(BC, BC.DwCtx.get(), DebugNamesTable);
DIEBlder.buildTypeUnits(StrOffstsWriter.get());
SmallVector<char, 20> OutBuffer;
std::unique_ptr<raw_svector_ostream> ObjOS =
@@ -781,10 +790,13 @@ void DWARFRewriter::updateDebugInfo() {
ThreadPool.wait();
}
+ DebugNamesTable.emitAccelTable();
+
if (opts::WriteDWP)
finalizeDWP(State);
- finalizeDebugSections(DIEBlder, *Streamer, *ObjOS, OffsetMap);
+ finalizeDebugSections(DIEBlder, DebugNamesTable, *Streamer, *ObjOS,
+ OffsetMap);
updateGdbIndexSection(OffsetMap, CUIndex);
}
@@ -1515,10 +1527,9 @@ CUOffsetMap DWARFRewriter::finalizeTypeSections(DIEBuilder &DIEBlder,
return CUMap;
}
-void DWARFRewriter::finalizeDebugSections(DIEBuilder &DIEBlder,
- DIEStreamer &Streamer,
- raw_svector_ostream &ObjOS,
- CUOffsetMap &CUMap) {
+void DWARFRewriter::finalizeDebugSections(
+ DIEBuilder &DIEBlder, DWARF5AcceleratorTable &DebugNamesTable,
+ DIEStreamer &Streamer, raw_svector_ostream &ObjOS, CUOffsetMap &CUMap) {
if (StrWriter->isInitialized()) {
RewriteInstance::addToDebugSectionsToOverwrite(".debug_str");
std::unique_ptr<DebugStrBufferVector> DebugStrSectionContents =
@@ -1616,6 +1627,15 @@ void DWARFRewriter::finalizeDebugSections(DIEBuilder &DIEBlder,
copyByteArray(ARangesContents),
ARangesContents.size());
}
+
+ if (DebugNamesTable.isCreated()) {
+ RewriteInstance::addToDebugSectionsToOverwrite(".debug_names");
+ std::unique_ptr<DebugBufferVector> DebugNamesSectionContents =
+ DebugNamesTable.releaseBuffer();
+ BC.registerOrUpdateNoteSection(".debug_names",
+ copyByteArray(*DebugNamesSectionContents),
+ DebugNamesSectionContents->size());
+ }
}
void DWARFRewriter::finalizeCompileUnits(DIEBuilder &DIEBlder,
diff --git a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
index 6377c1197253..0d7dc1070ce7 100644
--- a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
+++ b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
@@ -770,11 +770,8 @@ Error LinuxKernelRewriter::rewriteORCTables() {
continue;
// Issue label for the instruction.
- MCSymbol *Label = BC.MIB->getLabel(Inst);
- if (!Label) {
- Label = BC.Ctx->createTempSymbol("__ORC_");
- BC.MIB->setLabel(Inst, Label);
- }
+ MCSymbol *Label =
+ BC.MIB->getOrCreateInstLabel(Inst, "__ORC_", BC.Ctx.get());
if (Error E = emitORCEntry(0, *ErrorOrState, Label))
return E;
@@ -908,11 +905,8 @@ Error LinuxKernelRewriter::readStaticCalls() {
BC.MIB->addAnnotation(*Inst, "StaticCall", EntryID);
- MCSymbol *Label = BC.MIB->getLabel(*Inst);
- if (!Label) {
- Label = BC.Ctx->createTempSymbol("__SC_");
- BC.MIB->setLabel(*Inst, Label);
- }
+ MCSymbol *Label =
+ BC.MIB->getOrCreateInstLabel(*Inst, "__SC_", BC.Ctx.get());
StaticCallEntries.push_back({EntryID, BF, Label});
}
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index d90512e21225..c1c09c70ab01 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -270,32 +270,38 @@ public:
return isLDRB(Inst) || isLDRH(Inst) || isLDRW(Inst) || isLDRX(Inst);
}
- bool isAArch64Exclusive(const MCInst &Inst) const override {
+ bool isAArch64ExclusiveLoad(const MCInst &Inst) const override {
return (Inst.getOpcode() == AArch64::LDXPX ||
Inst.getOpcode() == AArch64::LDXPW ||
Inst.getOpcode() == AArch64::LDXRX ||
Inst.getOpcode() == AArch64::LDXRW ||
Inst.getOpcode() == AArch64::LDXRH ||
Inst.getOpcode() == AArch64::LDXRB ||
- Inst.getOpcode() == AArch64::STXPX ||
- Inst.getOpcode() == AArch64::STXPW ||
- Inst.getOpcode() == AArch64::STXRX ||
- Inst.getOpcode() == AArch64::STXRW ||
- Inst.getOpcode() == AArch64::STXRH ||
- Inst.getOpcode() == AArch64::STXRB ||
Inst.getOpcode() == AArch64::LDAXPX ||
Inst.getOpcode() == AArch64::LDAXPW ||
Inst.getOpcode() == AArch64::LDAXRX ||
Inst.getOpcode() == AArch64::LDAXRW ||
Inst.getOpcode() == AArch64::LDAXRH ||
- Inst.getOpcode() == AArch64::LDAXRB ||
+ Inst.getOpcode() == AArch64::LDAXRB);
+ }
+
+ bool isAArch64ExclusiveStore(const MCInst &Inst) const override {
+ return (Inst.getOpcode() == AArch64::STXPX ||
+ Inst.getOpcode() == AArch64::STXPW ||
+ Inst.getOpcode() == AArch64::STXRX ||
+ Inst.getOpcode() == AArch64::STXRW ||
+ Inst.getOpcode() == AArch64::STXRH ||
+ Inst.getOpcode() == AArch64::STXRB ||
Inst.getOpcode() == AArch64::STLXPX ||
Inst.getOpcode() == AArch64::STLXPW ||
Inst.getOpcode() == AArch64::STLXRX ||
Inst.getOpcode() == AArch64::STLXRW ||
Inst.getOpcode() == AArch64::STLXRH ||
- Inst.getOpcode() == AArch64::STLXRB ||
- Inst.getOpcode() == AArch64::CLREX);
+ Inst.getOpcode() == AArch64::STLXRB);
+ }
+
+ bool isAArch64ExclusiveClear(const MCInst &Inst) const override {
+ return (Inst.getOpcode() == AArch64::CLREX);
}
bool isLoadFromStack(const MCInst &Inst) const {
diff --git a/bolt/test/AArch64/exclusive-instrument.s b/bolt/test/AArch64/exclusive-instrument.s
index 502dd83b2f2a..69cc0707aba8 100644
--- a/bolt/test/AArch64/exclusive-instrument.s
+++ b/bolt/test/AArch64/exclusive-instrument.s
@@ -6,32 +6,108 @@
// RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \
// RUN: %s -o %t.o
// RUN: %clang %cflags -fPIC -pie %t.o -o %t.exe -nostdlib -Wl,-q -Wl,-fini=dummy
-// RUN: llvm-bolt %t.exe -o %t.bolt -instrument -v=1 | FileCheck %s
+// RUN: llvm-bolt %t.exe -o %t.bolt -instrument -v=2 | FileCheck %s
-// CHECK: Function foo has exclusive instructions, skip instrumentation
+// CHECK: BOLT-INSTRUMENTER: skip BB {{.*}} due to exclusive instruction in function foo
+// CHECK: BOLT-INSTRUMENTER: skip BB {{.*}} due to exclusive instruction in function foo
+// CHECK: BOLT-INSTRUMENTER: skip BB {{.*}} due to exclusive instruction in function foo
+// CHECK: BOLT-INSTRUMENTER: skip BB {{.*}} due to exclusive instruction in function case1
+// CHECK: BOLT-INSTRUMENTER: skip BB {{.*}} due to exclusive instruction in function case2
+// CHECK: BOLT-INSTRUMENTER: skip BB {{.*}} due to exclusive instruction in function case2
+// CHECK: BOLT-INSTRUMENTER: function case3 has exclusive store without corresponding load. Ignoring the function.
+// CHECK: BOLT-INSTRUMENTER: skip BB {{.*}} due to exclusive instruction in function case4
+// CHECK: BOLT-INSTRUMENTER: function case4 has two exclusive loads. Ignoring the function.
+// CHECK: BOLT-INSTRUMENTER: skip BB {{.*}} due to exclusive instruction in function case5
+// CHECK: BOLT-INSTRUMENTER: function case5 has exclusive load in trailing BB. Ignoring the function.
.global foo
.type foo, %function
foo:
+ # exclusive load and store in two bbs
ldaxr w9, [x10]
cbnz w9, .Lret
stlxr w12, w11, [x9]
cbz w12, foo
- clrex
.Lret:
+ clrex
ret
.size foo, .-foo
.global _start
.type _start, %function
_start:
- cmp x0, #0
- b.eq .Lexit
- bl foo
-.Lexit:
+ mov x0, #0
+ mov x1, #1
+ mov x2, #2
+ mov x3, #3
+
+ bl case1
+ bl case2
+ bl case3
+ bl case4
+ bl case5
+
ret
.size _start, .-_start
+# Case 1: exclusive load and store in one basic block
+.global case1
+.type case1, %function
+case1:
+ str x0, [x2]
+ ldxr w0, [x2]
+ add w0, w0, #1
+ stxr w1, w0, [x2]
+ ret
+.size case1, .-case1
+
+# Case 2: exclusive load and store in different blocks
+.global case2
+.type case2, %function
+case2:
+ b case2_load
+
+case2_load:
+ ldxr x0, [x2]
+ b case2_store
+
+case2_store:
+ add x0, x0, #1
+ stxr w1, x0, [x2]
+ ret
+.size case2, .-case2
+
+# Case 3: store without preceding load
+.global case3
+.type case3, %function
+case3:
+ stxr w1, x3, [x2]
+ ret
+.size case3, .-case3
+
+# Case 4: two exclusive load instructions in neighboring blocks
+.global case4
+.type case4, %function
+case4:
+ b case4_load
+
+case4_load:
+ ldxr x0, [x2]
+ b case4_load_next
+
+case4_load_next:
+ ldxr x1, [x2]
+ ret
+.size case4, .-case4
+
+# Case 5: Exclusive load without successor
+.global case5
+.type case5, %function
+case5:
+ ldxr x0, [x2]
+ ret
+.size case5, .-case5
+
.global dummy
.type dummy, %function
dummy:
diff --git a/bolt/test/X86/Inputs/dwarf5-debug-names-helper.s b/bolt/test/X86/Inputs/dwarf5-debug-names-helper.s
new file mode 100644
index 000000000000..f10417bd0ceb
--- /dev/null
+++ b/bolt/test/X86/Inputs/dwarf5-debug-names-helper.s
@@ -0,0 +1,487 @@
+# clang++ -g2 -gdwarf-5 -gpubnames -fdebug-types-section -S
+# header.h
+# struct Foo2a {
+# char *c1;
+# char *c2;
+# char *c3;
+# };
+# helper.cpp
+# #include "header.h"
+# int fooint;
+# struct Foo2Int {
+# int *c1;
+# int *c2;
+# };
+#
+# int foo() {
+# Foo2Int fint;
+# Foo2a f;
+# return 0;
+# }
+
+ .text
+ .file "helper.cpp"
+ .file 0 "/typeDedup" "helper.cpp" md5 0xc33186b2db66a78883b1546aace9855d
+ .globl _Z3foov # -- Begin function _Z3foov
+ .p2align 4, 0x90
+ .type _Z3foov,@function
+_Z3foov: # @_Z3foov
+.Lfunc_begin0:
+ .loc 0 8 0 # helper.cpp:8:0
+ .cfi_startproc
+# %bb.0: # %entry
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+.Ltmp0:
+ .loc 0 11 3 prologue_end # helper.cpp:11:3
+ xorl %eax, %eax
+ .loc 0 11 3 epilogue_begin is_stmt 0 # helper.cpp:11:3
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp1:
+.Lfunc_end0:
+ .size _Z3foov, .Lfunc_end0-_Z3foov
+ .cfi_endproc
+ # -- End function
+ .type fooint,@object # @fooint
+ .bss
+ .globl fooint
+ .p2align 2, 0x0
+fooint:
+ .long 0 # 0x0
+ .size fooint, 4
+
+ .file 1 "." "header.h" md5 0xfea7bb1f22c47f129e15695f7137a1e7
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 37 # DW_AT_producer
+ .byte 37 # DW_FORM_strx1
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 37 # DW_FORM_strx1
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 115 # DW_AT_addr_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 2 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 3 # Abbreviation Code
+ .byte 36 # DW_TAG_base_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 62 # DW_AT_encoding
+ .byte 11 # DW_FORM_data1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 4 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 110 # DW_AT_linkage_name
+ .byte 37 # DW_FORM_strx1
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 5 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 6 # Abbreviation Code
+ .byte 19 # DW_TAG_structure_type
+ .byte 1 # DW_CHILDREN_yes
+ .byte 54 # DW_AT_calling_convention
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 7 # Abbreviation Code
+ .byte 13 # DW_TAG_member
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 56 # DW_AT_data_member_location
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 8 # Abbreviation Code
+ .byte 15 # DW_TAG_pointer_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .byte 1 # Abbrev [1] 0xc:0x97 DW_TAG_compile_unit
+ .byte 0 # DW_AT_producer
+ .short 33 # DW_AT_language
+ .byte 1 # DW_AT_name
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .byte 2 # DW_AT_comp_dir
+ .byte 1 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .long .Laddr_table_base0 # DW_AT_addr_base
+ .byte 2 # Abbrev [2] 0x23:0xb DW_TAG_variable
+ .byte 3 # DW_AT_name
+ .long 46 # DW_AT_type
+ # DW_AT_external
+ .byte 0 # DW_AT_decl_file
+ .byte 2 # DW_AT_decl_line
+ .byte 2 # DW_AT_location
+ .byte 161
+ .byte 0
+ .byte 3 # Abbrev [3] 0x2e:0x4 DW_TAG_base_type
+ .byte 4 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 4 # Abbrev [4] 0x32:0x27 DW_TAG_subprogram
+ .byte 1 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .byte 5 # DW_AT_linkage_name
+ .byte 6 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 8 # DW_AT_decl_line
+ .long 46 # DW_AT_type
+ # DW_AT_external
+ .byte 5 # Abbrev [5] 0x42:0xb DW_TAG_variable
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 112
+ .byte 7 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 9 # DW_AT_decl_line
+ .long 89 # DW_AT_type
+ .byte 5 # Abbrev [5] 0x4d:0xb DW_TAG_variable
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 88
+ .byte 11 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 10 # DW_AT_decl_line
+ .long 119 # DW_AT_type
+ .byte 0 # End Of Children Mark
+ .byte 6 # Abbrev [6] 0x59:0x19 DW_TAG_structure_type
+ .byte 5 # DW_AT_calling_convention
+ .byte 10 # DW_AT_name
+ .byte 16 # DW_AT_byte_size
+ .byte 0 # DW_AT_decl_file
+ .byte 3 # DW_AT_decl_line
+ .byte 7 # Abbrev [7] 0x5f:0x9 DW_TAG_member
+ .byte 8 # DW_AT_name
+ .long 114 # DW_AT_type
+ .byte 0 # DW_AT_decl_file
+ .byte 4 # DW_AT_decl_line
+ .byte 0 # DW_AT_data_member_location
+ .byte 7 # Abbrev [7] 0x68:0x9 DW_TAG_member
+ .byte 9 # DW_AT_name
+ .long 114 # DW_AT_type
+ .byte 0 # DW_AT_decl_file
+ .byte 5 # DW_AT_decl_line
+ .byte 8 # DW_AT_data_member_location
+ .byte 0 # End Of Children Mark
+ .byte 8 # Abbrev [8] 0x72:0x5 DW_TAG_pointer_type
+ .long 46 # DW_AT_type
+ .byte 6 # Abbrev [6] 0x77:0x22 DW_TAG_structure_type
+ .byte 5 # DW_AT_calling_convention
+ .byte 14 # DW_AT_name
+ .byte 24 # DW_AT_byte_size
+ .byte 1 # DW_AT_decl_file
+ .byte 1 # DW_AT_decl_line
+ .byte 7 # Abbrev [7] 0x7d:0x9 DW_TAG_member
+ .byte 8 # DW_AT_name
+ .long 153 # DW_AT_type
+ .byte 1 # DW_AT_decl_file
+ .byte 2 # DW_AT_decl_line
+ .byte 0 # DW_AT_data_member_location
+ .byte 7 # Abbrev [7] 0x86:0x9 DW_TAG_member
+ .byte 9 # DW_AT_name
+ .long 153 # DW_AT_type
+ .byte 1 # DW_AT_decl_file
+ .byte 3 # DW_AT_decl_line
+ .byte 8 # DW_AT_data_member_location
+ .byte 7 # Abbrev [7] 0x8f:0x9 DW_TAG_member
+ .byte 13 # DW_AT_name
+ .long 153 # DW_AT_type
+ .byte 1 # DW_AT_decl_file
+ .byte 4 # DW_AT_decl_line
+ .byte 16 # DW_AT_data_member_location
+ .byte 0 # End Of Children Mark
+ .byte 8 # Abbrev [8] 0x99:0x5 DW_TAG_pointer_type
+ .long 158 # DW_AT_type
+ .byte 3 # Abbrev [3] 0x9e:0x4 DW_TAG_base_type
+ .byte 12 # DW_AT_name
+ .byte 6 # DW_AT_encoding
+ .byte 1 # DW_AT_byte_size
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end0:
+ .section .debug_str_offsets,"",@progbits
+ .long 64 # Length of String Offsets Set
+ .short 5
+ .short 0
+.Lstr_offsets_base0:
+ .section .debug_str,"MS",@progbits,1
+.Linfo_string0:
+ .asciz "clang version 18.0.0git" # string offset=0
+.Linfo_string1:
+ .asciz "helper.cpp" # string offset=24
+.Linfo_string2:
+ .asciz "/home/ayermolo/local/tasks/T138552329/typeDedup" # string offset=35
+.Linfo_string3:
+ .asciz "fooint" # string offset=83
+.Linfo_string4:
+ .asciz "int" # string offset=90
+.Linfo_string5:
+ .asciz "foo" # string offset=94
+.Linfo_string6:
+ .asciz "_Z3foov" # string offset=98
+.Linfo_string7:
+ .asciz "fint" # string offset=106
+.Linfo_string8:
+ .asciz "Foo2Int" # string offset=111
+.Linfo_string9:
+ .asciz "c1" # string offset=119
+.Linfo_string10:
+ .asciz "c2" # string offset=122
+.Linfo_string11:
+ .asciz "f" # string offset=125
+.Linfo_string12:
+ .asciz "Foo2a" # string offset=127
+.Linfo_string13:
+ .asciz "char" # string offset=133
+.Linfo_string14:
+ .asciz "c3" # string offset=138
+ .section .debug_str_offsets,"",@progbits
+ .long .Linfo_string0
+ .long .Linfo_string1
+ .long .Linfo_string2
+ .long .Linfo_string3
+ .long .Linfo_string4
+ .long .Linfo_string6
+ .long .Linfo_string5
+ .long .Linfo_string7
+ .long .Linfo_string9
+ .long .Linfo_string10
+ .long .Linfo_string8
+ .long .Linfo_string11
+ .long .Linfo_string13
+ .long .Linfo_string14
+ .long .Linfo_string12
+ .section .debug_addr,"",@progbits
+ .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+ .short 5 # DWARF version number
+ .byte 8 # Address size
+ .byte 0 # Segment selector size
+.Laddr_table_base0:
+ .quad fooint
+ .quad .Lfunc_begin0
+.Ldebug_addr_end0:
+ .section .debug_names,"",@progbits
+ .long .Lnames_end0-.Lnames_start0 # Header: unit length
+.Lnames_start0:
+ .short 5 # Header: version
+ .short 0 # Header: padding
+ .long 1 # Header: compilation unit count
+ .long 0 # Header: local type unit count
+ .long 0 # Header: foreign type unit count
+ .long 7 # Header: bucket count
+ .long 7 # Header: name count
+ .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+ .long 8 # Header: augmentation string size
+ .ascii "LLVM0700" # Header: augmentation string
+ .long .Lcu_begin0 # Compilation unit 0
+ .long 1 # Bucket 0
+ .long 0 # Bucket 1
+ .long 2 # Bucket 2
+ .long 3 # Bucket 3
+ .long 0 # Bucket 4
+ .long 5 # Bucket 5
+ .long 7 # Bucket 6
+ .long -1257882357 # Hash in Bucket 0
+ .long -1168750522 # Hash in Bucket 2
+ .long 193495088 # Hash in Bucket 3
+ .long 259227804 # Hash in Bucket 3
+ .long 193491849 # Hash in Bucket 5
+ .long 2090147939 # Hash in Bucket 5
+ .long -35356620 # Hash in Bucket 6
+ .long .Linfo_string6 # String in Bucket 0: _Z3foov
+ .long .Linfo_string8 # String in Bucket 2: Foo2Int
+ .long .Linfo_string4 # String in Bucket 3: int
+ .long .Linfo_string12 # String in Bucket 3: Foo2a
+ .long .Linfo_string5 # String in Bucket 5: foo
+ .long .Linfo_string13 # String in Bucket 5: char
+ .long .Linfo_string3 # String in Bucket 6: fooint
+ .long .Lnames3-.Lnames_entries0 # Offset in Bucket 0
+ .long .Lnames4-.Lnames_entries0 # Offset in Bucket 2
+ .long .Lnames0-.Lnames_entries0 # Offset in Bucket 3
+ .long .Lnames5-.Lnames_entries0 # Offset in Bucket 3
+ .long .Lnames2-.Lnames_entries0 # Offset in Bucket 5
+ .long .Lnames6-.Lnames_entries0 # Offset in Bucket 5
+ .long .Lnames1-.Lnames_entries0 # Offset in Bucket 6
+.Lnames_abbrev_start0:
+ .ascii "\230." # Abbrev code
+ .byte 46 # DW_TAG_subprogram
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\230\023" # Abbrev code
+ .byte 19 # DW_TAG_structure_type
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\230$" # Abbrev code
+ .byte 36 # DW_TAG_base_type
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\2304" # Abbrev code
+ .byte 52 # DW_TAG_variable
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames3:
+.L0:
+ .ascii "\230." # Abbreviation code
+ .long 50 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: _Z3foov
+.Lnames4:
+.L5:
+ .ascii "\230\023" # Abbreviation code
+ .long 89 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: Foo2Int
+.Lnames0:
+.L2:
+ .ascii "\230$" # Abbreviation code
+ .long 46 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: int
+.Lnames5:
+.L3:
+ .ascii "\230\023" # Abbreviation code
+ .long 119 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: Foo2a
+.Lnames2:
+ .ascii "\230." # Abbreviation code
+ .long 50 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: foo
+.Lnames6:
+.L1:
+ .ascii "\230$" # Abbreviation code
+ .long 158 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: char
+.Lnames1:
+.L4:
+ .ascii "\2304" # Abbreviation code
+ .long 35 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: fooint
+ .p2align 2, 0x0
+.Lnames_end0:
+ .ident "clang version 18.0.0git"
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/Inputs/dwarf5-debug-names-main.s b/bolt/test/X86/Inputs/dwarf5-debug-names-main.s
new file mode 100644
index 000000000000..2d3f4d2d208f
--- /dev/null
+++ b/bolt/test/X86/Inputs/dwarf5-debug-names-main.s
@@ -0,0 +1,712 @@
+# clang++ -g2 -gdwarf-5 -gpubnames -fdebug-types-section
+# header.h
+# struct Foo2a {
+# char *c1;
+# char *c2;
+# char *c3;
+# };
+# main.cpp
+# #include "header.h"
+# extern int fooint;
+# namespace {
+# struct t1 {
+# int i;
+# };
+# }
+# template <int *> struct t2 {
+# t1 v1;
+# };
+# struct t3 {
+# t2<&fooint> v1;
+# };
+# t3 v1;
+#
+# struct Foo {
+# char *c1;
+# char *c2;
+# char *c3;
+# };
+# struct Foo2 {
+# char *c1;
+# char *c2;
+# };
+# int main(int argc, char *argv[]) {
+# Foo f;
+# Foo2 f2;
+# Foo2a f3;
+# return 0;
+# }
+ .text
+ .file "main.cpp"
+ .file 0 "/typeDedup" "main.cpp" md5 0x04e636082b2b8a95a6ca39dde52372ae
+ .globl main # -- Begin function main
+ .p2align 4, 0x90
+ .type main,@function
+main: # @main
+.Lfunc_begin0:
+ .loc 0 25 0 # main.cpp:25:0
+ .cfi_startproc
+# %bb.0: # %entry
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ movl $0, -4(%rbp)
+ movl %edi, -8(%rbp)
+ movq %rsi, -16(%rbp)
+.Ltmp0:
+ .loc 0 29 2 prologue_end # main.cpp:29:2
+ xorl %eax, %eax
+ .loc 0 29 2 epilogue_begin is_stmt 0 # main.cpp:29:2
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp1:
+.Lfunc_end0:
+ .size main, .Lfunc_end0-main
+ .cfi_endproc
+ # -- End function
+ .type v1,@object # @v1
+ .bss
+ .globl v1
+ .p2align 2, 0x0
+v1:
+ .zero 4
+ .size v1, 4
+
+ .file 1 "." "header.h" md5 0xfea7bb1f22c47f129e15695f7137a1e7
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 37 # DW_AT_producer
+ .byte 37 # DW_FORM_strx1
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 37 # DW_FORM_strx1
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 115 # DW_AT_addr_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 2 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 3 # Abbreviation Code
+ .byte 19 # DW_TAG_structure_type
+ .byte 1 # DW_CHILDREN_yes
+ .byte 54 # DW_AT_calling_convention
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 4 # Abbreviation Code
+ .byte 13 # DW_TAG_member
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 56 # DW_AT_data_member_location
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 5 # Abbreviation Code
+ .byte 48 # DW_TAG_template_value_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 6 # Abbreviation Code
+ .byte 15 # DW_TAG_pointer_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 7 # Abbreviation Code
+ .byte 36 # DW_TAG_base_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 62 # DW_AT_encoding
+ .byte 11 # DW_FORM_data1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 8 # Abbreviation Code
+ .byte 57 # DW_TAG_namespace
+ .byte 1 # DW_CHILDREN_yes
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 9 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 10 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 11 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .byte 1 # Abbrev [1] 0xc:0x11a DW_TAG_compile_unit
+ .byte 0 # DW_AT_producer
+ .short 33 # DW_AT_language
+ .byte 1 # DW_AT_name
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .byte 2 # DW_AT_comp_dir
+ .byte 2 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .long .Laddr_table_base0 # DW_AT_addr_base
+ .byte 2 # Abbrev [2] 0x23:0xb DW_TAG_variable
+ .byte 3 # DW_AT_name
+ .long 46 # DW_AT_type
+ # DW_AT_external
+ .byte 0 # DW_AT_decl_file
+ .byte 14 # DW_AT_decl_line
+ .byte 2 # DW_AT_location
+ .byte 161
+ .byte 1
+ .byte 3 # Abbrev [3] 0x2e:0x10 DW_TAG_structure_type
+ .byte 5 # DW_AT_calling_convention
+ .byte 8 # DW_AT_name
+ .byte 4 # DW_AT_byte_size
+ .byte 0 # DW_AT_decl_file
+ .byte 11 # DW_AT_decl_line
+ .byte 4 # Abbrev [4] 0x34:0x9 DW_TAG_member
+ .byte 3 # DW_AT_name
+ .long 62 # DW_AT_type
+ .byte 0 # DW_AT_decl_file
+ .byte 12 # DW_AT_decl_line
+ .byte 0 # DW_AT_data_member_location
+ .byte 0 # End Of Children Mark
+ .byte 3 # Abbrev [3] 0x3e:0x19 DW_TAG_structure_type
+ .byte 5 # DW_AT_calling_convention
+ .byte 7 # DW_AT_name
+ .byte 4 # DW_AT_byte_size
+ .byte 0 # DW_AT_decl_file
+ .byte 8 # DW_AT_decl_line
+ .byte 5 # Abbrev [5] 0x44:0x9 DW_TAG_template_value_parameter
+ .long 87 # DW_AT_type
+ .byte 3 # DW_AT_location
+ .byte 161
+ .byte 0
+ .byte 159
+ .byte 4 # Abbrev [4] 0x4d:0x9 DW_TAG_member
+ .byte 3 # DW_AT_name
+ .long 97 # DW_AT_type
+ .byte 0 # DW_AT_decl_file
+ .byte 9 # DW_AT_decl_line
+ .byte 0 # DW_AT_data_member_location
+ .byte 0 # End Of Children Mark
+ .byte 6 # Abbrev [6] 0x57:0x5 DW_TAG_pointer_type
+ .long 92 # DW_AT_type
+ .byte 7 # Abbrev [7] 0x5c:0x4 DW_TAG_base_type
+ .byte 4 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 8 # Abbrev [8] 0x60:0x12 DW_TAG_namespace
+ .byte 3 # Abbrev [3] 0x61:0x10 DW_TAG_structure_type
+ .byte 5 # DW_AT_calling_convention
+ .byte 6 # DW_AT_name
+ .byte 4 # DW_AT_byte_size
+ .byte 0 # DW_AT_decl_file
+ .byte 4 # DW_AT_decl_line
+ .byte 4 # Abbrev [4] 0x67:0x9 DW_TAG_member
+ .byte 5 # DW_AT_name
+ .long 92 # DW_AT_type
+ .byte 0 # DW_AT_decl_file
+ .byte 5 # DW_AT_decl_line
+ .byte 0 # DW_AT_data_member_location
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 9 # Abbrev [9] 0x72:0x48 DW_TAG_subprogram
+ .byte 2 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .byte 9 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 25 # DW_AT_decl_line
+ .long 92 # DW_AT_type
+ # DW_AT_external
+ .byte 10 # Abbrev [10] 0x81:0xb DW_TAG_formal_parameter
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 120
+ .byte 10 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 25 # DW_AT_decl_line
+ .long 92 # DW_AT_type
+ .byte 10 # Abbrev [10] 0x8c:0xb DW_TAG_formal_parameter
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 112
+ .byte 11 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 25 # DW_AT_decl_line
+ .long 186 # DW_AT_type
+ .byte 11 # Abbrev [11] 0x97:0xb DW_TAG_variable
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 88
+ .byte 13 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 26 # DW_AT_decl_line
+ .long 200 # DW_AT_type
+ .byte 11 # Abbrev [11] 0xa2:0xb DW_TAG_variable
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 72
+ .byte 18 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 27 # DW_AT_decl_line
+ .long 234 # DW_AT_type
+ .byte 11 # Abbrev [11] 0xad:0xc DW_TAG_variable
+ .byte 3 # DW_AT_location
+ .byte 145
+ .ascii "\260\177"
+ .byte 20 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 28 # DW_AT_decl_line
+ .long 259 # DW_AT_type
+ .byte 0 # End Of Children Mark
+ .byte 6 # Abbrev [6] 0xba:0x5 DW_TAG_pointer_type
+ .long 191 # DW_AT_type
+ .byte 6 # Abbrev [6] 0xbf:0x5 DW_TAG_pointer_type
+ .long 196 # DW_AT_type
+ .byte 7 # Abbrev [7] 0xc4:0x4 DW_TAG_base_type
+ .byte 12 # DW_AT_name
+ .byte 6 # DW_AT_encoding
+ .byte 1 # DW_AT_byte_size
+ .byte 3 # Abbrev [3] 0xc8:0x22 DW_TAG_structure_type
+ .byte 5 # DW_AT_calling_convention
+ .byte 17 # DW_AT_name
+ .byte 24 # DW_AT_byte_size
+ .byte 0 # DW_AT_decl_file
+ .byte 16 # DW_AT_decl_line
+ .byte 4 # Abbrev [4] 0xce:0x9 DW_TAG_member
+ .byte 14 # DW_AT_name
+ .long 191 # DW_AT_type
+ .byte 0 # DW_AT_decl_file
+ .byte 17 # DW_AT_decl_line
+ .byte 0 # DW_AT_data_member_location
+ .byte 4 # Abbrev [4] 0xd7:0x9 DW_TAG_member
+ .byte 15 # DW_AT_name
+ .long 191 # DW_AT_type
+ .byte 0 # DW_AT_decl_file
+ .byte 18 # DW_AT_decl_line
+ .byte 8 # DW_AT_data_member_location
+ .byte 4 # Abbrev [4] 0xe0:0x9 DW_TAG_member
+ .byte 16 # DW_AT_name
+ .long 191 # DW_AT_type
+ .byte 0 # DW_AT_decl_file
+ .byte 19 # DW_AT_decl_line
+ .byte 16 # DW_AT_data_member_location
+ .byte 0 # End Of Children Mark
+ .byte 3 # Abbrev [3] 0xea:0x19 DW_TAG_structure_type
+ .byte 5 # DW_AT_calling_convention
+ .byte 19 # DW_AT_name
+ .byte 16 # DW_AT_byte_size
+ .byte 0 # DW_AT_decl_file
+ .byte 21 # DW_AT_decl_line
+ .byte 4 # Abbrev [4] 0xf0:0x9 DW_TAG_member
+ .byte 14 # DW_AT_name
+ .long 191 # DW_AT_type
+ .byte 0 # DW_AT_decl_file
+ .byte 22 # DW_AT_decl_line
+ .byte 0 # DW_AT_data_member_location
+ .byte 4 # Abbrev [4] 0xf9:0x9 DW_TAG_member
+ .byte 15 # DW_AT_name
+ .long 191 # DW_AT_type
+ .byte 0 # DW_AT_decl_file
+ .byte 23 # DW_AT_decl_line
+ .byte 8 # DW_AT_data_member_location
+ .byte 0 # End Of Children Mark
+ .byte 3 # Abbrev [3] 0x103:0x22 DW_TAG_structure_type
+ .byte 5 # DW_AT_calling_convention
+ .byte 21 # DW_AT_name
+ .byte 24 # DW_AT_byte_size
+ .byte 1 # DW_AT_decl_file
+ .byte 1 # DW_AT_decl_line
+ .byte 4 # Abbrev [4] 0x109:0x9 DW_TAG_member
+ .byte 14 # DW_AT_name
+ .long 191 # DW_AT_type
+ .byte 1 # DW_AT_decl_file
+ .byte 2 # DW_AT_decl_line
+ .byte 0 # DW_AT_data_member_location
+ .byte 4 # Abbrev [4] 0x112:0x9 DW_TAG_member
+ .byte 15 # DW_AT_name
+ .long 191 # DW_AT_type
+ .byte 1 # DW_AT_decl_file
+ .byte 3 # DW_AT_decl_line
+ .byte 8 # DW_AT_data_member_location
+ .byte 4 # Abbrev [4] 0x11b:0x9 DW_TAG_member
+ .byte 16 # DW_AT_name
+ .long 191 # DW_AT_type
+ .byte 1 # DW_AT_decl_file
+ .byte 4 # DW_AT_decl_line
+ .byte 16 # DW_AT_data_member_location
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end0:
+ .section .debug_str_offsets,"",@progbits
+ .long 92 # Length of String Offsets Set
+ .short 5
+ .short 0
+.Lstr_offsets_base0:
+ .section .debug_str,"MS",@progbits,1
+.Linfo_string0:
+ .asciz "clang version 18.0.0git" # string offset=0
+.Linfo_string1:
+ .asciz "main.cpp" # string offset=24
+.Linfo_string2:
+ .asciz "/home/ayermolo/local/tasks/T138552329/typeDedup" # string offset=33
+.Linfo_string3:
+ .asciz "v1" # string offset=81
+.Linfo_string4:
+ .asciz "t3" # string offset=84
+.Linfo_string5:
+ .asciz "t2<&fooint>" # string offset=87
+.Linfo_string6:
+ .asciz "int" # string offset=99
+.Linfo_string7:
+ .asciz "(anonymous namespace)" # string offset=103
+.Linfo_string8:
+ .asciz "t1" # string offset=125
+.Linfo_string9:
+ .asciz "i" # string offset=128
+.Linfo_string10:
+ .asciz "main" # string offset=130
+.Linfo_string11:
+ .asciz "argc" # string offset=135
+.Linfo_string12:
+ .asciz "argv" # string offset=140
+.Linfo_string13:
+ .asciz "char" # string offset=145
+.Linfo_string14:
+ .asciz "f" # string offset=150
+.Linfo_string15:
+ .asciz "Foo" # string offset=152
+.Linfo_string16:
+ .asciz "c1" # string offset=156
+.Linfo_string17:
+ .asciz "c2" # string offset=159
+.Linfo_string18:
+ .asciz "c3" # string offset=162
+.Linfo_string19:
+ .asciz "f2" # string offset=165
+.Linfo_string20:
+ .asciz "Foo2" # string offset=168
+.Linfo_string21:
+ .asciz "f3" # string offset=173
+.Linfo_string22:
+ .asciz "Foo2a" # string offset=176
+ .section .debug_str_offsets,"",@progbits
+ .long .Linfo_string0
+ .long .Linfo_string1
+ .long .Linfo_string2
+ .long .Linfo_string3
+ .long .Linfo_string6
+ .long .Linfo_string9
+ .long .Linfo_string8
+ .long .Linfo_string5
+ .long .Linfo_string4
+ .long .Linfo_string10
+ .long .Linfo_string11
+ .long .Linfo_string12
+ .long .Linfo_string13
+ .long .Linfo_string14
+ .long .Linfo_string16
+ .long .Linfo_string17
+ .long .Linfo_string18
+ .long .Linfo_string15
+ .long .Linfo_string19
+ .long .Linfo_string20
+ .long .Linfo_string21
+ .long .Linfo_string22
+ .section .debug_addr,"",@progbits
+ .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+ .short 5 # DWARF version number
+ .byte 8 # Address size
+ .byte 0 # Segment selector size
+.Laddr_table_base0:
+ .quad fooint
+ .quad v1
+ .quad .Lfunc_begin0
+.Ldebug_addr_end0:
+ .section .debug_names,"",@progbits
+ .long .Lnames_end0-.Lnames_start0 # Header: unit length
+.Lnames_start0:
+ .short 5 # Header: version
+ .short 0 # Header: padding
+ .long 1 # Header: compilation unit count
+ .long 0 # Header: local type unit count
+ .long 0 # Header: foreign type unit count
+ .long 11 # Header: bucket count
+ .long 11 # Header: name count
+ .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+ .long 8 # Header: augmentation string size
+ .ascii "LLVM0700" # Header: augmentation string
+ .long .Lcu_begin0 # Compilation unit 0
+ .long 1 # Bucket 0
+ .long 3 # Bucket 1
+ .long 5 # Bucket 2
+ .long 0 # Bucket 3
+ .long 0 # Bucket 4
+ .long 6 # Bucket 5
+ .long 8 # Bucket 6
+ .long 9 # Bucket 7
+ .long 11 # Bucket 8
+ .long 0 # Bucket 9
+ .long 0 # Bucket 10
+ .long 259227804 # Hash in Bucket 0
+ .long 2090147939 # Hash in Bucket 0
+ .long 193491849 # Hash in Bucket 1
+ .long 958480634 # Hash in Bucket 1
+ .long 2090263771 # Hash in Bucket 2
+ .long 5863786 # Hash in Bucket 5
+ .long 5863852 # Hash in Bucket 5
+ .long 193495088 # Hash in Bucket 6
+ .long 5863788 # Hash in Bucket 7
+ .long 2090499946 # Hash in Bucket 7
+ .long -1929613044 # Hash in Bucket 8
+ .long .Linfo_string22 # String in Bucket 0: Foo2a
+ .long .Linfo_string13 # String in Bucket 0: char
+ .long .Linfo_string15 # String in Bucket 1: Foo
+ .long .Linfo_string5 # String in Bucket 1: t2<&fooint>
+ .long .Linfo_string20 # String in Bucket 2: Foo2
+ .long .Linfo_string8 # String in Bucket 5: t1
+ .long .Linfo_string3 # String in Bucket 5: v1
+ .long .Linfo_string6 # String in Bucket 6: int
+ .long .Linfo_string4 # String in Bucket 7: t3
+ .long .Linfo_string10 # String in Bucket 7: main
+ .long .Linfo_string7 # String in Bucket 8: (anonymous namespace)
+ .long .Lnames10-.Lnames_entries0 # Offset in Bucket 0
+ .long .Lnames7-.Lnames_entries0 # Offset in Bucket 0
+ .long .Lnames8-.Lnames_entries0 # Offset in Bucket 1
+ .long .Lnames1-.Lnames_entries0 # Offset in Bucket 1
+ .long .Lnames9-.Lnames_entries0 # Offset in Bucket 2
+ .long .Lnames4-.Lnames_entries0 # Offset in Bucket 5
+ .long .Lnames5-.Lnames_entries0 # Offset in Bucket 5
+ .long .Lnames2-.Lnames_entries0 # Offset in Bucket 6
+ .long .Lnames0-.Lnames_entries0 # Offset in Bucket 7
+ .long .Lnames6-.Lnames_entries0 # Offset in Bucket 7
+ .long .Lnames3-.Lnames_entries0 # Offset in Bucket 8
+.Lnames_abbrev_start0:
+ .ascii "\2309" # Abbrev code
+ .byte 57 # DW_TAG_namespace
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\270\023" # Abbrev code
+ .byte 19 # DW_TAG_structure_type
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\230\023" # Abbrev code
+ .byte 19 # DW_TAG_structure_type
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\230$" # Abbrev code
+ .byte 36 # DW_TAG_base_type
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\2304" # Abbrev code
+ .byte 52 # DW_TAG_variable
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\230." # Abbrev code
+ .byte 46 # DW_TAG_subprogram
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames10:
+.L1:
+ .ascii "\230\023" # Abbreviation code
+ .long 259 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: Foo2a
+.Lnames7:
+.L8:
+ .ascii "\230$" # Abbreviation code
+ .long 196 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: char
+.Lnames8:
+.L0:
+ .ascii "\230\023" # Abbreviation code
+ .long 200 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: Foo
+.Lnames1:
+.L2:
+ .ascii "\230\023" # Abbreviation code
+ .long 62 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: t2<&fooint>
+.Lnames9:
+.L9:
+ .ascii "\230\023" # Abbreviation code
+ .long 234 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: Foo2
+.Lnames4:
+.L5:
+ .ascii "\270\023" # Abbreviation code
+ .long 97 # DW_IDX_die_offset
+ .long .L3-.Lnames_entries0 # DW_IDX_parent
+ .byte 0 # End of list: t1
+.Lnames5:
+.L7:
+ .ascii "\2304" # Abbreviation code
+ .long 35 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: v1
+.Lnames2:
+.L10:
+ .ascii "\230$" # Abbreviation code
+ .long 92 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: int
+.Lnames0:
+.L6:
+ .ascii "\230\023" # Abbreviation code
+ .long 46 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: t3
+.Lnames6:
+.L4:
+ .ascii "\230." # Abbreviation code
+ .long 114 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: main
+.Lnames3:
+.L3:
+ .ascii "\2309" # Abbreviation code
+ .long 96 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: (anonymous namespace)
+ .p2align 2, 0x0
+.Lnames_end0:
+ .ident "clang version 18.0.0git"
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/Inputs/dwarf5-df-debug-names-helper.s b/bolt/test/X86/Inputs/dwarf5-df-debug-names-helper.s
new file mode 100644
index 000000000000..8a53bda5f712
--- /dev/null
+++ b/bolt/test/X86/Inputs/dwarf5-df-debug-names-helper.s
@@ -0,0 +1,326 @@
+# clang++ -gsplit-dwarf -g2 -gdwarf-5 -gpubnames -fdebug-compilation-dir='.'
+# header.h
+# struct Foo2a {
+# char *c1;
+# char *c2;
+# char *c3;
+# };
+# helper.cpp
+# #include "header.h"
+# struct Foo2Int {
+# int *c1;
+# int *c2;
+# };
+# Foo2Int fint;
+# const Foo2a f{nullptr, nullptr};
+
+ .text
+ .file "helper.cpp"
+ .file 0 "." "helper.cpp" md5 0x2804efac708fd4180d403e6d5dbcc54a
+ .type fint,@object # @fint
+ .bss
+ .globl fint
+ .p2align 3, 0x0
+fint:
+ .zero 16
+ .size fint, 16
+
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 74 # DW_TAG_skeleton_unit
+ .byte 0 # DW_CHILDREN_no
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 37 # DW_FORM_strx1
+ .byte 118 # DW_AT_dwo_name
+ .byte 37 # DW_FORM_strx1
+ .byte 115 # DW_AT_addr_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 5 # DWARF version number
+ .byte 4 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .quad 3223434782003797151
+ .byte 1 # Abbrev [1] 0x14:0xf DW_TAG_skeleton_unit
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .byte 0 # DW_AT_comp_dir
+ .byte 1 # DW_AT_dwo_name
+ .long .Laddr_table_base0 # DW_AT_addr_base
+.Ldebug_info_end0:
+ .section .debug_str_offsets,"",@progbits
+ .long 12 # Length of String Offsets Set
+ .short 5
+ .short 0
+.Lstr_offsets_base0:
+ .section .debug_str,"MS",@progbits,1
+.Lskel_string0:
+ .asciz "." # string offset=0
+.Lskel_string1:
+ .asciz "Foo2Int" # string offset=2
+.Lskel_string2:
+ .asciz "int" # string offset=10
+.Lskel_string3:
+ .asciz "fint" # string offset=14
+.Lskel_string4:
+ .asciz "helper.dwo" # string offset=19
+ .section .debug_str_offsets,"",@progbits
+ .long .Lskel_string0
+ .long .Lskel_string4
+ .section .debug_str_offsets.dwo,"e",@progbits
+ .long 36 # Length of String Offsets Set
+ .short 5
+ .short 0
+ .section .debug_str.dwo,"eMS",@progbits,1
+.Linfo_string0:
+ .asciz "fint" # string offset=0
+.Linfo_string1:
+ .asciz "c1" # string offset=5
+.Linfo_string2:
+ .asciz "int" # string offset=8
+.Linfo_string3:
+ .asciz "c2" # string offset=12
+.Linfo_string4:
+ .asciz "Foo2Int" # string offset=15
+.Linfo_string5:
+ .asciz "clang version 19.0.0git (git@github.com:ayermolo/llvm-project.git da9e9277be64deca73370a90d22af33e5b37cc52)" # string offset=23
+.Linfo_string6:
+ .asciz "helper.cpp" # string offset=131
+.Linfo_string7:
+ .asciz "helper.dwo" # string offset=142
+ .section .debug_str_offsets.dwo,"e",@progbits
+ .long 0
+ .long 5
+ .long 8
+ .long 12
+ .long 15
+ .long 23
+ .long 131
+ .long 142
+ .section .debug_info.dwo,"e",@progbits
+ .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit
+.Ldebug_info_dwo_start0:
+ .short 5 # DWARF version number
+ .byte 5 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long 0 # Offset Into Abbrev. Section
+ .quad 3223434782003797151
+ .byte 1 # Abbrev [1] 0x14:0x34 DW_TAG_compile_unit
+ .byte 5 # DW_AT_producer
+ .short 33 # DW_AT_language
+ .byte 6 # DW_AT_name
+ .byte 7 # DW_AT_dwo_name
+ .byte 2 # Abbrev [2] 0x1a:0xb DW_TAG_variable
+ .byte 0 # DW_AT_name
+ .long 37 # DW_AT_type
+ # DW_AT_external
+ .byte 0 # DW_AT_decl_file
+ .byte 7 # DW_AT_decl_line
+ .byte 2 # DW_AT_location
+ .byte 161
+ .byte 0
+ .byte 3 # Abbrev [3] 0x25:0x19 DW_TAG_structure_type
+ .byte 5 # DW_AT_calling_convention
+ .byte 4 # DW_AT_name
+ .byte 16 # DW_AT_byte_size
+ .byte 0 # DW_AT_decl_file
+ .byte 2 # DW_AT_decl_line
+ .byte 4 # Abbrev [4] 0x2b:0x9 DW_TAG_member
+ .byte 1 # DW_AT_name
+ .long 62 # DW_AT_type
+ .byte 0 # DW_AT_decl_file
+ .byte 3 # DW_AT_decl_line
+ .byte 0 # DW_AT_data_member_location
+ .byte 4 # Abbrev [4] 0x34:0x9 DW_TAG_member
+ .byte 3 # DW_AT_name
+ .long 62 # DW_AT_type
+ .byte 0 # DW_AT_decl_file
+ .byte 4 # DW_AT_decl_line
+ .byte 8 # DW_AT_data_member_location
+ .byte 0 # End Of Children Mark
+ .byte 5 # Abbrev [5] 0x3e:0x5 DW_TAG_pointer_type
+ .long 67 # DW_AT_type
+ .byte 6 # Abbrev [6] 0x43:0x4 DW_TAG_base_type
+ .byte 2 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 0 # End Of Children Mark
+.Ldebug_info_dwo_end0:
+ .section .debug_abbrev.dwo,"e",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 37 # DW_AT_producer
+ .byte 37 # DW_FORM_strx1
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 118 # DW_AT_dwo_name
+ .byte 37 # DW_FORM_strx1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 2 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 3 # Abbreviation Code
+ .byte 19 # DW_TAG_structure_type
+ .byte 1 # DW_CHILDREN_yes
+ .byte 54 # DW_AT_calling_convention
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 4 # Abbreviation Code
+ .byte 13 # DW_TAG_member
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 56 # DW_AT_data_member_location
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 5 # Abbreviation Code
+ .byte 15 # DW_TAG_pointer_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 6 # Abbreviation Code
+ .byte 36 # DW_TAG_base_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 62 # DW_AT_encoding
+ .byte 11 # DW_FORM_data1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_addr,"",@progbits
+ .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+ .short 5 # DWARF version number
+ .byte 8 # Address size
+ .byte 0 # Segment selector size
+.Laddr_table_base0:
+ .quad fint
+.Ldebug_addr_end0:
+ .section .debug_names,"",@progbits
+ .long .Lnames_end0-.Lnames_start0 # Header: unit length
+.Lnames_start0:
+ .short 5 # Header: version
+ .short 0 # Header: padding
+ .long 1 # Header: compilation unit count
+ .long 0 # Header: local type unit count
+ .long 0 # Header: foreign type unit count
+ .long 3 # Header: bucket count
+ .long 3 # Header: name count
+ .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+ .long 8 # Header: augmentation string size
+ .ascii "LLVM0700" # Header: augmentation string
+ .long .Lcu_begin0 # Compilation unit 0
+ .long 1 # Bucket 0
+ .long 2 # Bucket 1
+ .long 3 # Bucket 2
+ .long -1168750522 # Hash in Bucket 0
+ .long 2090257270 # Hash in Bucket 1
+ .long 193495088 # Hash in Bucket 2
+ .long .Lskel_string1 # String in Bucket 0: Foo2Int
+ .long .Lskel_string3 # String in Bucket 1: fint
+ .long .Lskel_string2 # String in Bucket 2: int
+ .long .Lnames0-.Lnames_entries0 # Offset in Bucket 0
+ .long .Lnames2-.Lnames_entries0 # Offset in Bucket 1
+ .long .Lnames1-.Lnames_entries0 # Offset in Bucket 2
+.Lnames_abbrev_start0:
+ .ascii "\230\023" # Abbrev code
+ .byte 19 # DW_TAG_structure_type
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\2304" # Abbrev code
+ .byte 52 # DW_TAG_variable
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\230$" # Abbrev code
+ .byte 36 # DW_TAG_base_type
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames0:
+.L1:
+ .ascii "\230\023" # Abbreviation code
+ .long 37 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: Foo2Int
+.Lnames2:
+.L0:
+ .ascii "\2304" # Abbreviation code
+ .long 26 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: fint
+.Lnames1:
+.L2:
+ .ascii "\230$" # Abbreviation code
+ .long 67 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: int
+ .p2align 2, 0x0
+.Lnames_end0:
+ .ident "clang version 19.0.0git (git@github.com:ayermolo/llvm-project.git da9e9277be64deca73370a90d22af33e5b37cc52)"
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/Inputs/dwarf5-df-debug-names-main.s b/bolt/test/X86/Inputs/dwarf5-df-debug-names-main.s
new file mode 100644
index 000000000000..7ca721323917
--- /dev/null
+++ b/bolt/test/X86/Inputs/dwarf5-df-debug-names-main.s
@@ -0,0 +1,493 @@
+# clang++ -gsplit-dwarf -g2 -gdwarf-5 -gpubnames -fdebug-compilation-dir='.'
+# header.h
+# struct Foo2a {
+# char *c1;
+# char *c2;
+# char *c3;
+# };
+# main.cpp
+# #include "header.h"
+# struct Foo2 {
+# char *c1;
+# };
+# int main(int argc, char *argv[]) {
+# Foo2 f2;
+# Foo2a f3;
+# return 0;
+# }
+
+ .text
+ .file "main.cpp"
+ .globl main # -- Begin function main
+ .p2align 4, 0x90
+ .type main,@function
+main: # @main
+.Lfunc_begin0:
+ .file 0 "." "main.cpp" md5 0x9c5cea5bb78d3fc265cd175110bfe903
+ .loc 0 5 0 # main.cpp:5:0
+ .cfi_startproc
+# %bb.0: # %entry
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ movl $0, -4(%rbp)
+ movl %edi, -8(%rbp)
+ movq %rsi, -16(%rbp)
+.Ltmp0:
+ .loc 0 8 2 prologue_end # main.cpp:8:2
+ xorl %eax, %eax
+ .loc 0 8 2 epilogue_begin is_stmt 0 # main.cpp:8:2
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp1:
+.Lfunc_end0:
+ .size main, .Lfunc_end0-main
+ .cfi_endproc
+ # -- End function
+ .file 1 "." "header.h" md5 0xfea7bb1f22c47f129e15695f7137a1e7
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 74 # DW_TAG_skeleton_unit
+ .byte 0 # DW_CHILDREN_no
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 37 # DW_FORM_strx1
+ .byte 118 # DW_AT_dwo_name
+ .byte 37 # DW_FORM_strx1
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 115 # DW_AT_addr_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 5 # DWARF version number
+ .byte 4 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .quad -5618023701701543936
+ .byte 1 # Abbrev [1] 0x14:0x14 DW_TAG_skeleton_unit
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .byte 0 # DW_AT_comp_dir
+ .byte 1 # DW_AT_dwo_name
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .long .Laddr_table_base0 # DW_AT_addr_base
+.Ldebug_info_end0:
+ .section .debug_str_offsets,"",@progbits
+ .long 12 # Length of String Offsets Set
+ .short 5
+ .short 0
+.Lstr_offsets_base0:
+ .section .debug_str,"MS",@progbits,1
+.Lskel_string0:
+ .asciz "." # string offset=0
+.Lskel_string1:
+ .asciz "main" # string offset=2
+.Lskel_string2:
+ .asciz "int" # string offset=7
+.Lskel_string3:
+ .asciz "char" # string offset=11
+.Lskel_string4:
+ .asciz "Foo2" # string offset=16
+.Lskel_string5:
+ .asciz "Foo2a" # string offset=21
+.Lskel_string6:
+ .asciz "main.dwo" # string offset=27
+ .section .debug_str_offsets,"",@progbits
+ .long .Lskel_string0
+ .long .Lskel_string6
+ .section .debug_str_offsets.dwo,"e",@progbits
+ .long 64 # Length of String Offsets Set
+ .short 5
+ .short 0
+ .section .debug_str.dwo,"eMS",@progbits,1
+.Linfo_string0:
+ .asciz "main" # string offset=0
+.Linfo_string1:
+ .asciz "int" # string offset=5
+.Linfo_string2:
+ .asciz "argc" # string offset=9
+.Linfo_string3:
+ .asciz "argv" # string offset=14
+.Linfo_string4:
+ .asciz "char" # string offset=19
+.Linfo_string5:
+ .asciz "f2" # string offset=24
+.Linfo_string6:
+ .asciz "c1" # string offset=27
+.Linfo_string7:
+ .asciz "Foo2" # string offset=30
+.Linfo_string8:
+ .asciz "f3" # string offset=35
+.Linfo_string9:
+ .asciz "c2" # string offset=38
+.Linfo_string10:
+ .asciz "c3" # string offset=41
+.Linfo_string11:
+ .asciz "Foo2a" # string offset=44
+.Linfo_string12:
+ .asciz "clang version 19.0.0git (git@github.com:ayermolo/llvm-project.git da9e9277be64deca73370a90d22af33e5b37cc52)" # string offset=50
+.Linfo_string13:
+ .asciz "main.cpp" # string offset=158
+.Linfo_string14:
+ .asciz "main.dwo" # string offset=167
+ .section .debug_str_offsets.dwo,"e",@progbits
+ .long 0
+ .long 5
+ .long 9
+ .long 14
+ .long 19
+ .long 24
+ .long 27
+ .long 30
+ .long 35
+ .long 38
+ .long 41
+ .long 44
+ .long 50
+ .long 158
+ .long 167
+ .section .debug_info.dwo,"e",@progbits
+ .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit
+.Ldebug_info_dwo_start0:
+ .short 5 # DWARF version number
+ .byte 5 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long 0 # Offset Into Abbrev. Section
+ .quad -5618023701701543936
+ .byte 1 # Abbrev [1] 0x14:0x87 DW_TAG_compile_unit
+ .byte 12 # DW_AT_producer
+ .short 33 # DW_AT_language
+ .byte 13 # DW_AT_name
+ .byte 14 # DW_AT_dwo_name
+ .byte 2 # Abbrev [2] 0x1a:0x3c DW_TAG_subprogram
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .byte 0 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 5 # DW_AT_decl_line
+ .long 86 # DW_AT_type
+ # DW_AT_external
+ .byte 3 # Abbrev [3] 0x29:0xb DW_TAG_formal_parameter
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 120
+ .byte 2 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 5 # DW_AT_decl_line
+ .long 86 # DW_AT_type
+ .byte 3 # Abbrev [3] 0x34:0xb DW_TAG_formal_parameter
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 112
+ .byte 3 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 5 # DW_AT_decl_line
+ .long 90 # DW_AT_type
+ .byte 4 # Abbrev [4] 0x3f:0xb DW_TAG_variable
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 104
+ .byte 5 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 6 # DW_AT_decl_line
+ .long 104 # DW_AT_type
+ .byte 4 # Abbrev [4] 0x4a:0xb DW_TAG_variable
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 80
+ .byte 8 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 7 # DW_AT_decl_line
+ .long 120 # DW_AT_type
+ .byte 0 # End Of Children Mark
+ .byte 5 # Abbrev [5] 0x56:0x4 DW_TAG_base_type
+ .byte 1 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 6 # Abbrev [6] 0x5a:0x5 DW_TAG_pointer_type
+ .long 95 # DW_AT_type
+ .byte 6 # Abbrev [6] 0x5f:0x5 DW_TAG_pointer_type
+ .long 100 # DW_AT_type
+ .byte 5 # Abbrev [5] 0x64:0x4 DW_TAG_base_type
+ .byte 4 # DW_AT_name
+ .byte 6 # DW_AT_encoding
+ .byte 1 # DW_AT_byte_size
+ .byte 7 # Abbrev [7] 0x68:0x10 DW_TAG_structure_type
+ .byte 5 # DW_AT_calling_convention
+ .byte 7 # DW_AT_name
+ .byte 8 # DW_AT_byte_size
+ .byte 0 # DW_AT_decl_file
+ .byte 2 # DW_AT_decl_line
+ .byte 8 # Abbrev [8] 0x6e:0x9 DW_TAG_member
+ .byte 6 # DW_AT_name
+ .long 95 # DW_AT_type
+ .byte 0 # DW_AT_decl_file
+ .byte 3 # DW_AT_decl_line
+ .byte 0 # DW_AT_data_member_location
+ .byte 0 # End Of Children Mark
+ .byte 7 # Abbrev [7] 0x78:0x22 DW_TAG_structure_type
+ .byte 5 # DW_AT_calling_convention
+ .byte 11 # DW_AT_name
+ .byte 24 # DW_AT_byte_size
+ .byte 1 # DW_AT_decl_file
+ .byte 1 # DW_AT_decl_line
+ .byte 8 # Abbrev [8] 0x7e:0x9 DW_TAG_member
+ .byte 6 # DW_AT_name
+ .long 95 # DW_AT_type
+ .byte 1 # DW_AT_decl_file
+ .byte 2 # DW_AT_decl_line
+ .byte 0 # DW_AT_data_member_location
+ .byte 8 # Abbrev [8] 0x87:0x9 DW_TAG_member
+ .byte 9 # DW_AT_name
+ .long 95 # DW_AT_type
+ .byte 1 # DW_AT_decl_file
+ .byte 3 # DW_AT_decl_line
+ .byte 8 # DW_AT_data_member_location
+ .byte 8 # Abbrev [8] 0x90:0x9 DW_TAG_member
+ .byte 10 # DW_AT_name
+ .long 95 # DW_AT_type
+ .byte 1 # DW_AT_decl_file
+ .byte 4 # DW_AT_decl_line
+ .byte 16 # DW_AT_data_member_location
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+.Ldebug_info_dwo_end0:
+ .section .debug_abbrev.dwo,"e",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 37 # DW_AT_producer
+ .byte 37 # DW_FORM_strx1
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 118 # DW_AT_dwo_name
+ .byte 37 # DW_FORM_strx1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 2 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 3 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 4 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 5 # Abbreviation Code
+ .byte 36 # DW_TAG_base_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 62 # DW_AT_encoding
+ .byte 11 # DW_FORM_data1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 6 # Abbreviation Code
+ .byte 15 # DW_TAG_pointer_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 7 # Abbreviation Code
+ .byte 19 # DW_TAG_structure_type
+ .byte 1 # DW_CHILDREN_yes
+ .byte 54 # DW_AT_calling_convention
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 8 # Abbreviation Code
+ .byte 13 # DW_TAG_member
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 56 # DW_AT_data_member_location
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_addr,"",@progbits
+ .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+ .short 5 # DWARF version number
+ .byte 8 # Address size
+ .byte 0 # Segment selector size
+.Laddr_table_base0:
+ .quad .Lfunc_begin0
+.Ldebug_addr_end0:
+ .section .debug_names,"",@progbits
+ .long .Lnames_end0-.Lnames_start0 # Header: unit length
+.Lnames_start0:
+ .short 5 # Header: version
+ .short 0 # Header: padding
+ .long 1 # Header: compilation unit count
+ .long 0 # Header: local type unit count
+ .long 0 # Header: foreign type unit count
+ .long 5 # Header: bucket count
+ .long 5 # Header: name count
+ .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+ .long 8 # Header: augmentation string size
+ .ascii "LLVM0700" # Header: augmentation string
+ .long .Lcu_begin0 # Compilation unit 0
+ .long 0 # Bucket 0
+ .long 1 # Bucket 1
+ .long 0 # Bucket 2
+ .long 3 # Bucket 3
+ .long 4 # Bucket 4
+ .long 2090263771 # Hash in Bucket 1
+ .long 2090499946 # Hash in Bucket 1
+ .long 193495088 # Hash in Bucket 3
+ .long 259227804 # Hash in Bucket 4
+ .long 2090147939 # Hash in Bucket 4
+ .long .Lskel_string4 # String in Bucket 1: Foo2
+ .long .Lskel_string1 # String in Bucket 1: main
+ .long .Lskel_string2 # String in Bucket 3: int
+ .long .Lskel_string5 # String in Bucket 4: Foo2a
+ .long .Lskel_string3 # String in Bucket 4: char
+ .long .Lnames3-.Lnames_entries0 # Offset in Bucket 1
+ .long .Lnames0-.Lnames_entries0 # Offset in Bucket 1
+ .long .Lnames1-.Lnames_entries0 # Offset in Bucket 3
+ .long .Lnames4-.Lnames_entries0 # Offset in Bucket 4
+ .long .Lnames2-.Lnames_entries0 # Offset in Bucket 4
+.Lnames_abbrev_start0:
+ .ascii "\230\023" # Abbrev code
+ .byte 19 # DW_TAG_structure_type
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\230." # Abbrev code
+ .byte 46 # DW_TAG_subprogram
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\230$" # Abbrev code
+ .byte 36 # DW_TAG_base_type
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames3:
+.L4:
+ .ascii "\230\023" # Abbreviation code
+ .long 104 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: Foo2
+.Lnames0:
+.L1:
+ .ascii "\230." # Abbreviation code
+ .long 26 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: main
+.Lnames1:
+.L3:
+ .ascii "\230$" # Abbreviation code
+ .long 86 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: int
+.Lnames4:
+.L2:
+ .ascii "\230\023" # Abbreviation code
+ .long 120 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: Foo2a
+.Lnames2:
+.L0:
+ .ascii "\230$" # Abbreviation code
+ .long 100 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: char
+ .p2align 2, 0x0
+.Lnames_end0:
+ .ident "clang version 19.0.0git (git@github.com:ayermolo/llvm-project.git da9e9277be64deca73370a90d22af33e5b37cc52)"
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/Inputs/dwarf5-df-types-debug-names-helper.s b/bolt/test/X86/Inputs/dwarf5-df-types-debug-names-helper.s
new file mode 100644
index 000000000000..10842852eccd
--- /dev/null
+++ b/bolt/test/X86/Inputs/dwarf5-df-types-debug-names-helper.s
@@ -0,0 +1,650 @@
+# clang++ -gsplit-dwarf -g2 -gdwarf-5 -gpubnames -fdebug-types-section -fdebug-compilation-dir='.' -S
+# header.h
+# struct Foo2a {
+# char *c1;
+# char *c2;
+# char *c3;
+# };
+# #include "header.h"
+# struct Foo2Int {
+# int *c1;
+# int *c2;
+# };
+# Foo2Int fint;
+# const Foo2a f{nullptr, nullptr};
+
+ .text
+ .file "helper.cpp"
+ .file 0 "." "helper.cpp" md5 0xc33186b2db66a78883b1546aace9855d
+ .globl _Z3foov # -- Begin function _Z3foov
+ .p2align 4, 0x90
+ .type _Z3foov,@function
+_Z3foov: # @_Z3foov
+.Lfunc_begin0:
+ .loc 0 8 0 # helper.cpp:8:0
+ .cfi_startproc
+# %bb.0: # %entry
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+.Ltmp0:
+ .loc 0 11 3 prologue_end # helper.cpp:11:3
+ xorl %eax, %eax
+ .loc 0 11 3 epilogue_begin is_stmt 0 # helper.cpp:11:3
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp1:
+.Lfunc_end0:
+ .size _Z3foov, .Lfunc_end0-_Z3foov
+ .cfi_endproc
+ # -- End function
+ .type fooint,@object # @fooint
+ .bss
+ .globl fooint
+ .p2align 2, 0x0
+fooint:
+ .long 0 # 0x0
+ .size fooint, 4
+
+ .section .debug_info.dwo,"e",@progbits
+ .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit
+.Ldebug_info_dwo_start0:
+ .short 5 # DWARF version number
+ .byte 6 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long 0 # Offset Into Abbrev. Section
+ .quad -3882554063269480080 # Type Signature
+ .long 33 # Type DIE Offset
+ .byte 1 # Abbrev [1] 0x18:0x2c DW_TAG_type_unit
+ .short 33 # DW_AT_language
+ .byte 5 # DW_AT_comp_dir
+ .byte 6 # DW_AT_dwo_name
+ .long 0 # DW_AT_stmt_list
+ .byte 2 # Abbrev [2] 0x21:0x19 DW_TAG_structure_type
+ .byte 5 # DW_AT_calling_convention
+ .byte 9 # DW_AT_name
+ .byte 16 # DW_AT_byte_size
+ .byte 0 # DW_AT_decl_file
+ .byte 3 # DW_AT_decl_line
+ .byte 3 # Abbrev [3] 0x27:0x9 DW_TAG_member
+ .byte 7 # DW_AT_name
+ .long 58 # DW_AT_type
+ .byte 0 # DW_AT_decl_file
+ .byte 4 # DW_AT_decl_line
+ .byte 0 # DW_AT_data_member_location
+ .byte 3 # Abbrev [3] 0x30:0x9 DW_TAG_member
+ .byte 8 # DW_AT_name
+ .long 58 # DW_AT_type
+ .byte 0 # DW_AT_decl_file
+ .byte 5 # DW_AT_decl_line
+ .byte 8 # DW_AT_data_member_location
+ .byte 0 # End Of Children Mark
+ .byte 4 # Abbrev [4] 0x3a:0x5 DW_TAG_pointer_type
+ .long 63 # DW_AT_type
+ .byte 5 # Abbrev [5] 0x3f:0x4 DW_TAG_base_type
+ .byte 1 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 0 # End Of Children Mark
+.Ldebug_info_dwo_end0:
+ .long .Ldebug_info_dwo_end1-.Ldebug_info_dwo_start1 # Length of Unit
+.Ldebug_info_dwo_start1:
+ .short 5 # DWARF version number
+ .byte 6 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long 0 # Offset Into Abbrev. Section
+ .quad 1175092228111723119 # Type Signature
+ .long 33 # Type DIE Offset
+ .byte 1 # Abbrev [1] 0x18:0x35 DW_TAG_type_unit
+ .short 33 # DW_AT_language
+ .byte 5 # DW_AT_comp_dir
+ .byte 6 # DW_AT_dwo_name
+ .long 0 # DW_AT_stmt_list
+ .byte 2 # Abbrev [2] 0x21:0x22 DW_TAG_structure_type
+ .byte 5 # DW_AT_calling_convention
+ .byte 13 # DW_AT_name
+ .byte 24 # DW_AT_byte_size
+ .byte 1 # DW_AT_decl_file
+ .byte 1 # DW_AT_decl_line
+ .byte 3 # Abbrev [3] 0x27:0x9 DW_TAG_member
+ .byte 7 # DW_AT_name
+ .long 67 # DW_AT_type
+ .byte 1 # DW_AT_decl_file
+ .byte 2 # DW_AT_decl_line
+ .byte 0 # DW_AT_data_member_location
+ .byte 3 # Abbrev [3] 0x30:0x9 DW_TAG_member
+ .byte 8 # DW_AT_name
+ .long 67 # DW_AT_type
+ .byte 1 # DW_AT_decl_file
+ .byte 3 # DW_AT_decl_line
+ .byte 8 # DW_AT_data_member_location
+ .byte 3 # Abbrev [3] 0x39:0x9 DW_TAG_member
+ .byte 12 # DW_AT_name
+ .long 67 # DW_AT_type
+ .byte 1 # DW_AT_decl_file
+ .byte 4 # DW_AT_decl_line
+ .byte 16 # DW_AT_data_member_location
+ .byte 0 # End Of Children Mark
+ .byte 4 # Abbrev [4] 0x43:0x5 DW_TAG_pointer_type
+ .long 72 # DW_AT_type
+ .byte 5 # Abbrev [5] 0x48:0x4 DW_TAG_base_type
+ .byte 11 # DW_AT_name
+ .byte 6 # DW_AT_encoding
+ .byte 1 # DW_AT_byte_size
+ .byte 0 # End Of Children Mark
+.Ldebug_info_dwo_end1:
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 74 # DW_TAG_skeleton_unit
+ .byte 0 # DW_CHILDREN_no
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 37 # DW_FORM_strx1
+ .byte 118 # DW_AT_dwo_name
+ .byte 37 # DW_FORM_strx1
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 115 # DW_AT_addr_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 5 # DWARF version number
+ .byte 4 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .quad 2142419470755914572
+ .byte 1 # Abbrev [1] 0x14:0x14 DW_TAG_skeleton_unit
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .byte 0 # DW_AT_comp_dir
+ .byte 1 # DW_AT_dwo_name
+ .byte 1 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .long .Laddr_table_base0 # DW_AT_addr_base
+.Ldebug_info_end0:
+ .section .debug_str_offsets,"",@progbits
+ .long 12 # Length of String Offsets Set
+ .short 5
+ .short 0
+.Lstr_offsets_base0:
+ .section .debug_str,"MS",@progbits,1
+.Lskel_string0:
+ .asciz "." # string offset=0
+.Lskel_string1:
+ .asciz "int" # string offset=2
+.Lskel_string2:
+ .asciz "fooint" # string offset=6
+.Lskel_string3:
+ .asciz "foo" # string offset=13
+.Lskel_string4:
+ .asciz "_Z3foov" # string offset=17
+.Lskel_string5:
+ .asciz "Foo2Int" # string offset=25
+.Lskel_string6:
+ .asciz "Foo2a" # string offset=33
+.Lskel_string7:
+ .asciz "char" # string offset=39
+.Lskel_string8:
+ .asciz "helper.dwo" # string offset=44
+ .section .debug_str_offsets,"",@progbits
+ .long .Lskel_string0
+ .long .Lskel_string8
+ .section .debug_str_offsets.dwo,"e",@progbits
+ .long 68 # Length of String Offsets Set
+ .short 5
+ .short 0
+ .section .debug_str.dwo,"eMS",@progbits,1
+.Linfo_string0:
+ .asciz "fooint" # string offset=0
+.Linfo_string1:
+ .asciz "int" # string offset=7
+.Linfo_string2:
+ .asciz "_Z3foov" # string offset=11
+.Linfo_string3:
+ .asciz "foo" # string offset=19
+.Linfo_string4:
+ .asciz "fint" # string offset=23
+.Linfo_string5:
+ .asciz "." # string offset=28
+.Linfo_string6:
+ .asciz "helper.dwo" # string offset=30
+.Linfo_string7:
+ .asciz "c1" # string offset=41
+.Linfo_string8:
+ .asciz "c2" # string offset=44
+.Linfo_string9:
+ .asciz "Foo2Int" # string offset=47
+.Linfo_string10:
+ .asciz "f" # string offset=55
+.Linfo_string11:
+ .asciz "char" # string offset=57
+.Linfo_string12:
+ .asciz "c3" # string offset=62
+.Linfo_string13:
+ .asciz "Foo2a" # string offset=65
+.Linfo_string14:
+ .asciz "clang version 18.0.0git (git@github.com:ayermolo/llvm-project.git db35fa8fc524127079662802c4735dbf397f86d0)" # string offset=71
+.Linfo_string15:
+ .asciz "helper.cpp" # string offset=179
+ .section .debug_str_offsets.dwo,"e",@progbits
+ .long 0
+ .long 7
+ .long 11
+ .long 19
+ .long 23
+ .long 28
+ .long 30
+ .long 41
+ .long 44
+ .long 47
+ .long 55
+ .long 57
+ .long 62
+ .long 65
+ .long 71
+ .long 179
+ .section .debug_info.dwo,"e",@progbits
+ .long .Ldebug_info_dwo_end2-.Ldebug_info_dwo_start2 # Length of Unit
+.Ldebug_info_dwo_start2:
+ .short 5 # DWARF version number
+ .byte 5 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long 0 # Offset Into Abbrev. Section
+ .quad 2142419470755914572
+ .byte 6 # Abbrev [6] 0x14:0x4f DW_TAG_compile_unit
+ .byte 14 # DW_AT_producer
+ .short 33 # DW_AT_language
+ .byte 15 # DW_AT_name
+ .byte 6 # DW_AT_dwo_name
+ .byte 7 # Abbrev [7] 0x1a:0xb DW_TAG_variable
+ .byte 0 # DW_AT_name
+ .long 37 # DW_AT_type
+ # DW_AT_external
+ .byte 0 # DW_AT_decl_file
+ .byte 2 # DW_AT_decl_line
+ .byte 2 # DW_AT_location
+ .byte 161
+ .byte 0
+ .byte 5 # Abbrev [5] 0x25:0x4 DW_TAG_base_type
+ .byte 1 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 8 # Abbrev [8] 0x29:0x27 DW_TAG_subprogram
+ .byte 1 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .byte 2 # DW_AT_linkage_name
+ .byte 3 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 8 # DW_AT_decl_line
+ .long 37 # DW_AT_type
+ # DW_AT_external
+ .byte 9 # Abbrev [9] 0x39:0xb DW_TAG_variable
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 112
+ .byte 4 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 9 # DW_AT_decl_line
+ .long 80 # DW_AT_type
+ .byte 9 # Abbrev [9] 0x44:0xb DW_TAG_variable
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 88
+ .byte 10 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 10 # DW_AT_decl_line
+ .long 89 # DW_AT_type
+ .byte 0 # End Of Children Mark
+ .byte 10 # Abbrev [10] 0x50:0x9 DW_TAG_structure_type
+ # DW_AT_declaration
+ .quad -3882554063269480080 # DW_AT_signature
+ .byte 10 # Abbrev [10] 0x59:0x9 DW_TAG_structure_type
+ # DW_AT_declaration
+ .quad 1175092228111723119 # DW_AT_signature
+ .byte 0 # End Of Children Mark
+.Ldebug_info_dwo_end2:
+ .section .debug_abbrev.dwo,"e",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 65 # DW_TAG_type_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 27 # DW_AT_comp_dir
+ .byte 37 # DW_FORM_strx1
+ .byte 118 # DW_AT_dwo_name
+ .byte 37 # DW_FORM_strx1
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 2 # Abbreviation Code
+ .byte 19 # DW_TAG_structure_type
+ .byte 1 # DW_CHILDREN_yes
+ .byte 54 # DW_AT_calling_convention
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 3 # Abbreviation Code
+ .byte 13 # DW_TAG_member
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 56 # DW_AT_data_member_location
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 4 # Abbreviation Code
+ .byte 15 # DW_TAG_pointer_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 5 # Abbreviation Code
+ .byte 36 # DW_TAG_base_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 62 # DW_AT_encoding
+ .byte 11 # DW_FORM_data1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 6 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 37 # DW_AT_producer
+ .byte 37 # DW_FORM_strx1
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 118 # DW_AT_dwo_name
+ .byte 37 # DW_FORM_strx1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 7 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 8 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 110 # DW_AT_linkage_name
+ .byte 37 # DW_FORM_strx1
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 9 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 10 # Abbreviation Code
+ .byte 19 # DW_TAG_structure_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 60 # DW_AT_declaration
+ .byte 25 # DW_FORM_flag_present
+ .byte 105 # DW_AT_signature
+ .byte 32 # DW_FORM_ref_sig8
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_line.dwo,"e",@progbits
+.Ltmp2:
+ .long .Ldebug_line_end0-.Ldebug_line_start0 # unit length
+.Ldebug_line_start0:
+ .short 5
+ .byte 8
+ .byte 0
+ .long .Lprologue_end0-.Lprologue_start0
+.Lprologue_start0:
+ .byte 1
+ .byte 1
+ .byte 1
+ .byte -5
+ .byte 14
+ .byte 1
+ .byte 1
+ .byte 1
+ .byte 8
+ .byte 2
+ .byte 46
+ .byte 0
+ .byte 46
+ .byte 0
+ .byte 3
+ .byte 1
+ .byte 8
+ .byte 2
+ .byte 15
+ .byte 5
+ .byte 30
+ .byte 2
+ .ascii "helper.cpp"
+ .byte 0
+ .byte 0
+ .byte 0xc3, 0x31, 0x86, 0xb2
+ .byte 0xdb, 0x66, 0xa7, 0x88
+ .byte 0x83, 0xb1, 0x54, 0x6a
+ .byte 0xac, 0xe9, 0x85, 0x5d
+ .ascii "header.h"
+ .byte 0
+ .byte 1
+ .byte 0xfe, 0xa7, 0xbb, 0x1f
+ .byte 0x22, 0xc4, 0x7f, 0x12
+ .byte 0x9e, 0x15, 0x69, 0x5f
+ .byte 0x71, 0x37, 0xa1, 0xe7
+.Lprologue_end0:
+.Ldebug_line_end0:
+ .section .debug_addr,"",@progbits
+ .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+ .short 5 # DWARF version number
+ .byte 8 # Address size
+ .byte 0 # Segment selector size
+.Laddr_table_base0:
+ .quad fooint
+ .quad .Lfunc_begin0
+.Ldebug_addr_end0:
+ .section .debug_names,"",@progbits
+ .long .Lnames_end0-.Lnames_start0 # Header: unit length
+.Lnames_start0:
+ .short 5 # Header: version
+ .short 0 # Header: padding
+ .long 1 # Header: compilation unit count
+ .long 0 # Header: local type unit count
+ .long 2 # Header: foreign type unit count
+ .long 7 # Header: bucket count
+ .long 7 # Header: name count
+ .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+ .long 8 # Header: augmentation string size
+ .ascii "LLVM0700" # Header: augmentation string
+ .long .Lcu_begin0 # Compilation unit 0
+ .quad -3882554063269480080 # Type unit 0
+ .quad 1175092228111723119 # Type unit 1
+ .long 1 # Bucket 0
+ .long 0 # Bucket 1
+ .long 2 # Bucket 2
+ .long 3 # Bucket 3
+ .long 0 # Bucket 4
+ .long 5 # Bucket 5
+ .long 7 # Bucket 6
+ .long -1257882357 # Hash in Bucket 0
+ .long -1168750522 # Hash in Bucket 2
+ .long 193495088 # Hash in Bucket 3
+ .long 259227804 # Hash in Bucket 3
+ .long 193491849 # Hash in Bucket 5
+ .long 2090147939 # Hash in Bucket 5
+ .long -35356620 # Hash in Bucket 6
+ .long .Lskel_string4 # String in Bucket 0: _Z3foov
+ .long .Lskel_string5 # String in Bucket 2: Foo2Int
+ .long .Lskel_string1 # String in Bucket 3: int
+ .long .Lskel_string6 # String in Bucket 3: Foo2a
+ .long .Lskel_string3 # String in Bucket 5: foo
+ .long .Lskel_string7 # String in Bucket 5: char
+ .long .Lskel_string2 # String in Bucket 6: fooint
+ .long .Lnames3-.Lnames_entries0 # Offset in Bucket 0
+ .long .Lnames4-.Lnames_entries0 # Offset in Bucket 2
+ .long .Lnames0-.Lnames_entries0 # Offset in Bucket 3
+ .long .Lnames5-.Lnames_entries0 # Offset in Bucket 3
+ .long .Lnames2-.Lnames_entries0 # Offset in Bucket 5
+ .long .Lnames6-.Lnames_entries0 # Offset in Bucket 5
+ .long .Lnames1-.Lnames_entries0 # Offset in Bucket 6
+.Lnames_abbrev_start0:
+ .ascii "\350\004" # Abbrev code
+ .byte 19 # DW_TAG_structure_type
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\354\004" # Abbrev code
+ .byte 19 # DW_TAG_structure_type
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\310\013" # Abbrev code
+ .byte 46 # DW_TAG_subprogram
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\210\t" # Abbrev code
+ .byte 36 # DW_TAG_base_type
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\210\r" # Abbrev code
+ .byte 52 # DW_TAG_variable
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\214\t" # Abbrev code
+ .byte 36 # DW_TAG_base_type
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames3:
+ .ascii "\310\013" # Abbreviation code
+ .long 41 # DW_IDX_die_offset
+ .byte 0 # End of list: _Z3foov
+.Lnames4:
+ .ascii "\354\004" # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 33 # DW_IDX_die_offset
+ .ascii "\350\004" # Abbreviation code
+ .long 80 # DW_IDX_die_offset
+ .byte 0 # End of list: Foo2Int
+.Lnames0:
+ .ascii "\210\t" # Abbreviation code
+ .long 37 # DW_IDX_die_offset
+ .ascii "\214\t" # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 63 # DW_IDX_die_offset
+ .byte 0 # End of list: int
+.Lnames5:
+ .ascii "\354\004" # Abbreviation code
+ .byte 1 # DW_IDX_type_unit
+ .long 33 # DW_IDX_die_offset
+ .ascii "\350\004" # Abbreviation code
+ .long 89 # DW_IDX_die_offset
+ .byte 0 # End of list: Foo2a
+.Lnames2:
+ .ascii "\310\013" # Abbreviation code
+ .long 41 # DW_IDX_die_offset
+ .byte 0 # End of list: foo
+.Lnames6:
+ .ascii "\214\t" # Abbreviation code
+ .byte 1 # DW_IDX_type_unit
+ .long 72 # DW_IDX_die_offset
+ .byte 0 # End of list: char
+.Lnames1:
+ .ascii "\210\r" # Abbreviation code
+ .long 26 # DW_IDX_die_offset
+ .byte 0 # End of list: fooint
+ .p2align 2, 0x0
+.Lnames_end0:
+ .ident "clang version 18.0.0git (git@github.com:ayermolo/llvm-project.git db35fa8fc524127079662802c4735dbf397f86d0)"
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/Inputs/dwarf5-df-types-debug-names-main.s b/bolt/test/X86/Inputs/dwarf5-df-types-debug-names-main.s
new file mode 100644
index 000000000000..f89f28ec13f4
--- /dev/null
+++ b/bolt/test/X86/Inputs/dwarf5-df-types-debug-names-main.s
@@ -0,0 +1,626 @@
+# clang++ -gsplit-dwarf -g2 -gdwarf-5 -gpubnames -fdebug-types-section -fdebug-compilation-dir='.' -S
+# header.h
+# struct Foo2a {
+# char *c1;
+# char *c2;
+# char *c3;
+# };
+# include "header.h"
+# struct Foo2 {
+# char *c1;
+# };
+# int main(int argc, char *argv[]) {
+# Foo2 f2;
+# Foo2a f3;
+# return 0;
+# }
+
+ .text
+ .file "main.cpp"
+ .globl main # -- Begin function main
+ .p2align 4, 0x90
+ .type main,@function
+main: # @main
+.Lfunc_begin0:
+ .file 0 "." "main.cpp" md5 0x9c5cea5bb78d3fc265cd175110bfe903
+ .loc 0 5 0 # main.cpp:5:0
+ .cfi_startproc
+# %bb.0: # %entry
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ movl $0, -4(%rbp)
+ movl %edi, -8(%rbp)
+ movq %rsi, -16(%rbp)
+.Ltmp0:
+ .loc 0 8 2 prologue_end # main.cpp:8:2
+ xorl %eax, %eax
+ .loc 0 8 2 epilogue_begin is_stmt 0 # main.cpp:8:2
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp1:
+.Lfunc_end0:
+ .size main, .Lfunc_end0-main
+ .cfi_endproc
+ # -- End function
+ .section .debug_info.dwo,"e",@progbits
+ .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit
+.Ldebug_info_dwo_start0:
+ .short 5 # DWARF version number
+ .byte 6 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long 0 # Offset Into Abbrev. Section
+ .quad 5322170643381124694 # Type Signature
+ .long 33 # Type DIE Offset
+ .byte 1 # Abbrev [1] 0x18:0x23 DW_TAG_type_unit
+ .short 33 # DW_AT_language
+ .byte 6 # DW_AT_comp_dir
+ .byte 7 # DW_AT_dwo_name
+ .long 0 # DW_AT_stmt_list
+ .byte 2 # Abbrev [2] 0x21:0x10 DW_TAG_structure_type
+ .byte 5 # DW_AT_calling_convention
+ .byte 9 # DW_AT_name
+ .byte 8 # DW_AT_byte_size
+ .byte 0 # DW_AT_decl_file
+ .byte 2 # DW_AT_decl_line
+ .byte 3 # Abbrev [3] 0x27:0x9 DW_TAG_member
+ .byte 8 # DW_AT_name
+ .long 49 # DW_AT_type
+ .byte 0 # DW_AT_decl_file
+ .byte 3 # DW_AT_decl_line
+ .byte 0 # DW_AT_data_member_location
+ .byte 0 # End Of Children Mark
+ .byte 4 # Abbrev [4] 0x31:0x5 DW_TAG_pointer_type
+ .long 54 # DW_AT_type
+ .byte 5 # Abbrev [5] 0x36:0x4 DW_TAG_base_type
+ .byte 4 # DW_AT_name
+ .byte 6 # DW_AT_encoding
+ .byte 1 # DW_AT_byte_size
+ .byte 0 # End Of Children Mark
+.Ldebug_info_dwo_end0:
+ .long .Ldebug_info_dwo_end1-.Ldebug_info_dwo_start1 # Length of Unit
+.Ldebug_info_dwo_start1:
+ .short 5 # DWARF version number
+ .byte 6 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long 0 # Offset Into Abbrev. Section
+ .quad 1175092228111723119 # Type Signature
+ .long 33 # Type DIE Offset
+ .byte 1 # Abbrev [1] 0x18:0x35 DW_TAG_type_unit
+ .short 33 # DW_AT_language
+ .byte 6 # DW_AT_comp_dir
+ .byte 7 # DW_AT_dwo_name
+ .long 0 # DW_AT_stmt_list
+ .byte 2 # Abbrev [2] 0x21:0x22 DW_TAG_structure_type
+ .byte 5 # DW_AT_calling_convention
+ .byte 13 # DW_AT_name
+ .byte 24 # DW_AT_byte_size
+ .byte 1 # DW_AT_decl_file
+ .byte 1 # DW_AT_decl_line
+ .byte 3 # Abbrev [3] 0x27:0x9 DW_TAG_member
+ .byte 8 # DW_AT_name
+ .long 67 # DW_AT_type
+ .byte 1 # DW_AT_decl_file
+ .byte 2 # DW_AT_decl_line
+ .byte 0 # DW_AT_data_member_location
+ .byte 3 # Abbrev [3] 0x30:0x9 DW_TAG_member
+ .byte 11 # DW_AT_name
+ .long 67 # DW_AT_type
+ .byte 1 # DW_AT_decl_file
+ .byte 3 # DW_AT_decl_line
+ .byte 8 # DW_AT_data_member_location
+ .byte 3 # Abbrev [3] 0x39:0x9 DW_TAG_member
+ .byte 12 # DW_AT_name
+ .long 67 # DW_AT_type
+ .byte 1 # DW_AT_decl_file
+ .byte 4 # DW_AT_decl_line
+ .byte 16 # DW_AT_data_member_location
+ .byte 0 # End Of Children Mark
+ .byte 4 # Abbrev [4] 0x43:0x5 DW_TAG_pointer_type
+ .long 72 # DW_AT_type
+ .byte 5 # Abbrev [5] 0x48:0x4 DW_TAG_base_type
+ .byte 4 # DW_AT_name
+ .byte 6 # DW_AT_encoding
+ .byte 1 # DW_AT_byte_size
+ .byte 0 # End Of Children Mark
+.Ldebug_info_dwo_end1:
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 74 # DW_TAG_skeleton_unit
+ .byte 0 # DW_CHILDREN_no
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 37 # DW_FORM_strx1
+ .byte 118 # DW_AT_dwo_name
+ .byte 37 # DW_FORM_strx1
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 115 # DW_AT_addr_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 5 # DWARF version number
+ .byte 4 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .quad 5962099678818150071
+ .byte 1 # Abbrev [1] 0x14:0x14 DW_TAG_skeleton_unit
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .byte 0 # DW_AT_comp_dir
+ .byte 1 # DW_AT_dwo_name
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .long .Laddr_table_base0 # DW_AT_addr_base
+.Ldebug_info_end0:
+ .section .debug_str_offsets,"",@progbits
+ .long 12 # Length of String Offsets Set
+ .short 5
+ .short 0
+.Lstr_offsets_base0:
+ .section .debug_str,"MS",@progbits,1
+.Lskel_string0:
+ .asciz "." # string offset=0
+.Lskel_string1:
+ .asciz "main" # string offset=53
+.Lskel_string2:
+ .asciz "int" # string offset=58
+.Lskel_string3:
+ .asciz "char" # string offset=62
+.Lskel_string4:
+ .asciz "Foo2" # string offset=67
+.Lskel_string5:
+ .asciz "Foo2a" # string offset=72
+.Lskel_string6:
+ .asciz "main.dwo" # string offset=78
+ .section .debug_str_offsets,"",@progbits
+ .long .Lskel_string0
+ .long .Lskel_string6
+ .section .debug_str_offsets.dwo,"e",@progbits
+ .long 68 # Length of String Offsets Set
+ .short 5
+ .short 0
+ .section .debug_str.dwo,"eMS",@progbits,1
+.Linfo_string0:
+ .asciz "main" # string offset=0
+.Linfo_string1:
+ .asciz "int" # string offset=5
+.Linfo_string2:
+ .asciz "argc" # string offset=9
+.Linfo_string3:
+ .asciz "argv" # string offset=14
+.Linfo_string4:
+ .asciz "char" # string offset=19
+.Linfo_string5:
+ .asciz "f2" # string offset=24
+.Linfo_string6:
+ .asciz "/home/ayermolo/local/tasks/T138552329/typeDedupSplit" # string offset=27
+.Linfo_string7:
+ .asciz "main.dwo" # string offset=80
+.Linfo_string8:
+ .asciz "c1" # string offset=89
+.Linfo_string9:
+ .asciz "Foo2" # string offset=92
+.Linfo_string10:
+ .asciz "f3" # string offset=97
+.Linfo_string11:
+ .asciz "c2" # string offset=100
+.Linfo_string12:
+ .asciz "c3" # string offset=103
+.Linfo_string13:
+ .asciz "Foo2a" # string offset=106
+.Linfo_string14:
+ .asciz "clang version 18.0.0git (git@github.com:ayermolo/llvm-project.git db35fa8fc524127079662802c4735dbf397f86d0)" # string offset=112
+.Linfo_string15:
+ .asciz "main.cpp" # string offset=220
+ .section .debug_str_offsets.dwo,"e",@progbits
+ .long 0
+ .long 5
+ .long 9
+ .long 14
+ .long 19
+ .long 24
+ .long 27
+ .long 80
+ .long 89
+ .long 92
+ .long 97
+ .long 100
+ .long 103
+ .long 106
+ .long 112
+ .long 220
+ .section .debug_info.dwo,"e",@progbits
+ .long .Ldebug_info_dwo_end2-.Ldebug_info_dwo_start2 # Length of Unit
+.Ldebug_info_dwo_start2:
+ .short 5 # DWARF version number
+ .byte 5 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long 0 # Offset Into Abbrev. Section
+ .quad 5962099678818150071
+ .byte 6 # Abbrev [6] 0x14:0x67 DW_TAG_compile_unit
+ .byte 14 # DW_AT_producer
+ .short 33 # DW_AT_language
+ .byte 15 # DW_AT_name
+ .byte 7 # DW_AT_dwo_name
+ .byte 7 # Abbrev [7] 0x1a:0x3c DW_TAG_subprogram
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .byte 0 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 5 # DW_AT_decl_line
+ .long 86 # DW_AT_type
+ # DW_AT_external
+ .byte 8 # Abbrev [8] 0x29:0xb DW_TAG_formal_parameter
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 120
+ .byte 2 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 5 # DW_AT_decl_line
+ .long 86 # DW_AT_type
+ .byte 8 # Abbrev [8] 0x34:0xb DW_TAG_formal_parameter
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 112
+ .byte 3 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 5 # DW_AT_decl_line
+ .long 90 # DW_AT_type
+ .byte 9 # Abbrev [9] 0x3f:0xb DW_TAG_variable
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 104
+ .byte 5 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 6 # DW_AT_decl_line
+ .long 104 # DW_AT_type
+ .byte 9 # Abbrev [9] 0x4a:0xb DW_TAG_variable
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 80
+ .byte 10 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 7 # DW_AT_decl_line
+ .long 113 # DW_AT_type
+ .byte 0 # End Of Children Mark
+ .byte 5 # Abbrev [5] 0x56:0x4 DW_TAG_base_type
+ .byte 1 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 4 # Abbrev [4] 0x5a:0x5 DW_TAG_pointer_type
+ .long 95 # DW_AT_type
+ .byte 4 # Abbrev [4] 0x5f:0x5 DW_TAG_pointer_type
+ .long 100 # DW_AT_type
+ .byte 5 # Abbrev [5] 0x64:0x4 DW_TAG_base_type
+ .byte 4 # DW_AT_name
+ .byte 6 # DW_AT_encoding
+ .byte 1 # DW_AT_byte_size
+ .byte 10 # Abbrev [10] 0x68:0x9 DW_TAG_structure_type
+ # DW_AT_declaration
+ .quad 5322170643381124694 # DW_AT_signature
+ .byte 10 # Abbrev [10] 0x71:0x9 DW_TAG_structure_type
+ # DW_AT_declaration
+ .quad 1175092228111723119 # DW_AT_signature
+ .byte 0 # End Of Children Mark
+.Ldebug_info_dwo_end2:
+ .section .debug_abbrev.dwo,"e",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 65 # DW_TAG_type_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 27 # DW_AT_comp_dir
+ .byte 37 # DW_FORM_strx1
+ .byte 118 # DW_AT_dwo_name
+ .byte 37 # DW_FORM_strx1
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 2 # Abbreviation Code
+ .byte 19 # DW_TAG_structure_type
+ .byte 1 # DW_CHILDREN_yes
+ .byte 54 # DW_AT_calling_convention
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 3 # Abbreviation Code
+ .byte 13 # DW_TAG_member
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 56 # DW_AT_data_member_location
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 4 # Abbreviation Code
+ .byte 15 # DW_TAG_pointer_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 5 # Abbreviation Code
+ .byte 36 # DW_TAG_base_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 62 # DW_AT_encoding
+ .byte 11 # DW_FORM_data1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 6 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 37 # DW_AT_producer
+ .byte 37 # DW_FORM_strx1
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 118 # DW_AT_dwo_name
+ .byte 37 # DW_FORM_strx1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 7 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 8 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 9 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 10 # Abbreviation Code
+ .byte 19 # DW_TAG_structure_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 60 # DW_AT_declaration
+ .byte 25 # DW_FORM_flag_present
+ .byte 105 # DW_AT_signature
+ .byte 32 # DW_FORM_ref_sig8
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_line.dwo,"e",@progbits
+.Ltmp2:
+ .long .Ldebug_line_end0-.Ldebug_line_start0 # unit length
+.Ldebug_line_start0:
+ .short 5
+ .byte 8
+ .byte 0
+ .long .Lprologue_end0-.Lprologue_start0
+.Lprologue_start0:
+ .byte 1
+ .byte 1
+ .byte 1
+ .byte -5
+ .byte 14
+ .byte 1
+ .byte 1
+ .byte 1
+ .byte 8
+ .byte 2
+ .ascii "/home/ayermolo/local/tasks/T138552329/typeDedupSplit"
+ .byte 0
+ .byte 46
+ .byte 0
+ .byte 3
+ .byte 1
+ .byte 8
+ .byte 2
+ .byte 15
+ .byte 5
+ .byte 30
+ .byte 2
+ .ascii "main.cpp"
+ .byte 0
+ .byte 0
+ .byte 0x9c, 0x5c, 0xea, 0x5b
+ .byte 0xb7, 0x8d, 0x3f, 0xc2
+ .byte 0x65, 0xcd, 0x17, 0x51
+ .byte 0x10, 0xbf, 0xe9, 0x03
+ .ascii "header.h"
+ .byte 0
+ .byte 1
+ .byte 0xfe, 0xa7, 0xbb, 0x1f
+ .byte 0x22, 0xc4, 0x7f, 0x12
+ .byte 0x9e, 0x15, 0x69, 0x5f
+ .byte 0x71, 0x37, 0xa1, 0xe7
+.Lprologue_end0:
+.Ldebug_line_end0:
+ .section .debug_addr,"",@progbits
+ .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+ .short 5 # DWARF version number
+ .byte 8 # Address size
+ .byte 0 # Segment selector size
+.Laddr_table_base0:
+ .quad .Lfunc_begin0
+.Ldebug_addr_end0:
+ .section .debug_names,"",@progbits
+ .long .Lnames_end0-.Lnames_start0 # Header: unit length
+.Lnames_start0:
+ .short 5 # Header: version
+ .short 0 # Header: padding
+ .long 1 # Header: compilation unit count
+ .long 0 # Header: local type unit count
+ .long 2 # Header: foreign type unit count
+ .long 5 # Header: bucket count
+ .long 5 # Header: name count
+ .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+ .long 8 # Header: augmentation string size
+ .ascii "LLVM0700" # Header: augmentation string
+ .long .Lcu_begin0 # Compilation unit 0
+ .quad 5322170643381124694 # Type unit 0
+ .quad 1175092228111723119 # Type unit 1
+ .long 0 # Bucket 0
+ .long 1 # Bucket 1
+ .long 0 # Bucket 2
+ .long 3 # Bucket 3
+ .long 4 # Bucket 4
+ .long 2090263771 # Hash in Bucket 1
+ .long 2090499946 # Hash in Bucket 1
+ .long 193495088 # Hash in Bucket 3
+ .long 259227804 # Hash in Bucket 4
+ .long 2090147939 # Hash in Bucket 4
+ .long .Lskel_string4 # String in Bucket 1: Foo2
+ .long .Lskel_string1 # String in Bucket 1: main
+ .long .Lskel_string2 # String in Bucket 3: int
+ .long .Lskel_string5 # String in Bucket 4: Foo2a
+ .long .Lskel_string3 # String in Bucket 4: char
+ .long .Lnames3-.Lnames_entries0 # Offset in Bucket 1
+ .long .Lnames0-.Lnames_entries0 # Offset in Bucket 1
+ .long .Lnames1-.Lnames_entries0 # Offset in Bucket 3
+ .long .Lnames4-.Lnames_entries0 # Offset in Bucket 4
+ .long .Lnames2-.Lnames_entries0 # Offset in Bucket 4
+.Lnames_abbrev_start0:
+ .ascii "\350\004" # Abbrev code
+ .byte 19 # DW_TAG_structure_type
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\354\004" # Abbrev code
+ .byte 19 # DW_TAG_structure_type
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\310\013" # Abbrev code
+ .byte 46 # DW_TAG_subprogram
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\210\t" # Abbrev code
+ .byte 36 # DW_TAG_base_type
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\214\t" # Abbrev code
+ .byte 36 # DW_TAG_base_type
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames3:
+ .ascii "\354\004" # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 33 # DW_IDX_die_offset
+ .ascii "\350\004" # Abbreviation code
+ .long 104 # DW_IDX_die_offset
+ .byte 0 # End of list: Foo2
+.Lnames0:
+ .ascii "\310\013" # Abbreviation code
+ .long 26 # DW_IDX_die_offset
+ .byte 0 # End of list: main
+.Lnames1:
+ .ascii "\210\t" # Abbreviation code
+ .long 86 # DW_IDX_die_offset
+ .byte 0 # End of list: int
+.Lnames4:
+ .ascii "\354\004" # Abbreviation code
+ .byte 1 # DW_IDX_type_unit
+ .long 33 # DW_IDX_die_offset
+ .ascii "\350\004" # Abbreviation code
+ .long 113 # DW_IDX_die_offset
+ .byte 0 # End of list: Foo2a
+.Lnames2:
+ .ascii "\214\t" # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 54 # DW_IDX_die_offset
+ .ascii "\214\t" # Abbreviation code
+ .byte 1 # DW_IDX_type_unit
+ .long 72 # DW_IDX_die_offset
+ .ascii "\210\t" # Abbreviation code
+ .long 100 # DW_IDX_die_offset
+ .byte 0 # End of list: char
+ .p2align 2, 0x0
+.Lnames_end0:
+ .ident "clang version 18.0.0git (git@github.com:ayermolo/llvm-project.git db35fa8fc524127079662802c4735dbf397f86d0)"
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/Inputs/dwarf5-types-debug-names-helper.s b/bolt/test/X86/Inputs/dwarf5-types-debug-names-helper.s
new file mode 100644
index 000000000000..3e89382ed2d8
--- /dev/null
+++ b/bolt/test/X86/Inputs/dwarf5-types-debug-names-helper.s
@@ -0,0 +1,405 @@
+# clang++ helper.cpp -g2 -gdwarf-5 -gpubnames -fdebug-types-section
+# header.h
+# struct Foo2a {
+# char *c1;
+# };
+# helper.cpp
+# #include "header.h"
+# int foo() {
+# Foo2a f;
+# return 0;
+# }
+
+
+ .text
+ .file "helper.cpp"
+ .globl _Z3foov # -- Begin function _Z3foov
+ .p2align 4, 0x90
+ .type _Z3foov,@function
+_Z3foov: # @_Z3foov
+.Lfunc_begin0:
+ .file 0 "/typeDedupSmall" "helper.cpp" md5 0x305ec66c221c583021f8375b300e2591
+ .loc 0 2 0 # helper.cpp:2:0
+ .cfi_startproc
+# %bb.0: # %entry
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+.Ltmp0:
+ .loc 0 4 3 prologue_end # helper.cpp:4:3
+ xorl %eax, %eax
+ .loc 0 4 3 epilogue_begin is_stmt 0 # helper.cpp:4:3
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp1:
+.Lfunc_end0:
+ .size _Z3foov, .Lfunc_end0-_Z3foov
+ .cfi_endproc
+ # -- End function
+ .file 1 "." "header.h" md5 0x53699580704254cb1dd2a83230f8a7ea
+ .section .debug_info,"G",@progbits,1175092228111723119,comdat
+.Ltu_begin0:
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 5 # DWARF version number
+ .byte 2 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .quad 1175092228111723119 # Type Signature
+ .long 35 # Type DIE Offset
+ .byte 1 # Abbrev [1] 0x18:0x25 DW_TAG_type_unit
+ .short 33 # DW_AT_language
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .byte 2 # Abbrev [2] 0x23:0x10 DW_TAG_structure_type
+ .byte 5 # DW_AT_calling_convention
+ .byte 9 # DW_AT_name
+ .byte 8 # DW_AT_byte_size
+ .byte 1 # DW_AT_decl_file
+ .byte 1 # DW_AT_decl_line
+ .byte 3 # Abbrev [3] 0x29:0x9 DW_TAG_member
+ .byte 7 # DW_AT_name
+ .long 51 # DW_AT_type
+ .byte 1 # DW_AT_decl_file
+ .byte 2 # DW_AT_decl_line
+ .byte 0 # DW_AT_data_member_location
+ .byte 0 # End Of Children Mark
+ .byte 4 # Abbrev [4] 0x33:0x5 DW_TAG_pointer_type
+ .long 56 # DW_AT_type
+ .byte 5 # Abbrev [5] 0x38:0x4 DW_TAG_base_type
+ .byte 8 # DW_AT_name
+ .byte 6 # DW_AT_encoding
+ .byte 1 # DW_AT_byte_size
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end0:
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 65 # DW_TAG_type_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 2 # Abbreviation Code
+ .byte 19 # DW_TAG_structure_type
+ .byte 1 # DW_CHILDREN_yes
+ .byte 54 # DW_AT_calling_convention
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 3 # Abbreviation Code
+ .byte 13 # DW_TAG_member
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 56 # DW_AT_data_member_location
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 4 # Abbreviation Code
+ .byte 15 # DW_TAG_pointer_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 5 # Abbreviation Code
+ .byte 36 # DW_TAG_base_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 62 # DW_AT_encoding
+ .byte 11 # DW_FORM_data1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 6 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 37 # DW_AT_producer
+ .byte 37 # DW_FORM_strx1
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 37 # DW_FORM_strx1
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 115 # DW_AT_addr_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 7 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 110 # DW_AT_linkage_name
+ .byte 37 # DW_FORM_strx1
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 8 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 9 # Abbreviation Code
+ .byte 19 # DW_TAG_structure_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 60 # DW_AT_declaration
+ .byte 25 # DW_FORM_flag_present
+ .byte 105 # DW_AT_signature
+ .byte 32 # DW_FORM_ref_sig8
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long .Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit
+.Ldebug_info_start1:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .byte 6 # Abbrev [6] 0xc:0x41 DW_TAG_compile_unit
+ .byte 0 # DW_AT_producer
+ .short 33 # DW_AT_language
+ .byte 1 # DW_AT_name
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .byte 2 # DW_AT_comp_dir
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .long .Laddr_table_base0 # DW_AT_addr_base
+ .byte 7 # Abbrev [7] 0x23:0x1c DW_TAG_subprogram
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .byte 3 # DW_AT_linkage_name
+ .byte 4 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 2 # DW_AT_decl_line
+ .long 63 # DW_AT_type
+ # DW_AT_external
+ .byte 8 # Abbrev [8] 0x33:0xb DW_TAG_variable
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 120
+ .byte 6 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 3 # DW_AT_decl_line
+ .long 67 # DW_AT_type
+ .byte 0 # End Of Children Mark
+ .byte 5 # Abbrev [5] 0x3f:0x4 DW_TAG_base_type
+ .byte 5 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 9 # Abbrev [9] 0x43:0x9 DW_TAG_structure_type
+ # DW_AT_declaration
+ .quad 1175092228111723119 # DW_AT_signature
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end1:
+ .section .debug_str_offsets,"",@progbits
+ .long 44 # Length of String Offsets Set
+ .short 5
+ .short 0
+.Lstr_offsets_base0:
+ .section .debug_str,"MS",@progbits,1
+.Linfo_string0:
+ .asciz "clang version 18.0.0git (git@github.com:ayermolo/llvm-project.git db35fa8fc524127079662802c4735dbf397f86d0)" # string offset=0
+.Linfo_string1:
+ .asciz "helper.cpp" # string offset=108
+.Linfo_string2:
+ .asciz "/home/typeDedupSmall" # string offset=119
+.Linfo_string3:
+ .asciz "foo" # string offset=172
+.Linfo_string4:
+ .asciz "_Z3foov" # string offset=176
+.Linfo_string5:
+ .asciz "int" # string offset=184
+.Linfo_string6:
+ .asciz "f" # string offset=188
+.Linfo_string7:
+ .asciz "Foo2a" # string offset=190
+.Linfo_string8:
+ .asciz "c1" # string offset=196
+.Linfo_string9:
+ .asciz "char" # string offset=199
+ .section .debug_str_offsets,"",@progbits
+ .long .Linfo_string0
+ .long .Linfo_string1
+ .long .Linfo_string2
+ .long .Linfo_string4
+ .long .Linfo_string3
+ .long .Linfo_string5
+ .long .Linfo_string6
+ .long .Linfo_string8
+ .long .Linfo_string9
+ .long .Linfo_string7
+ .section .debug_addr,"",@progbits
+ .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+ .short 5 # DWARF version number
+ .byte 8 # Address size
+ .byte 0 # Segment selector size
+.Laddr_table_base0:
+ .quad .Lfunc_begin0
+.Ldebug_addr_end0:
+ .section .debug_names,"",@progbits
+ .long .Lnames_end0-.Lnames_start0 # Header: unit length
+.Lnames_start0:
+ .short 5 # Header: version
+ .short 0 # Header: padding
+ .long 1 # Header: compilation unit count
+ .long 1 # Header: local type unit count
+ .long 0 # Header: foreign type unit count
+ .long 5 # Header: bucket count
+ .long 5 # Header: name count
+ .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+ .long 8 # Header: augmentation string size
+ .ascii "LLVM0700" # Header: augmentation string
+ .long .Lcu_begin0 # Compilation unit 0
+ .long .Ltu_begin0 # Type unit 0
+ .long 0 # Bucket 0
+ .long 0 # Bucket 1
+ .long 0 # Bucket 2
+ .long 1 # Bucket 3
+ .long 2 # Bucket 4
+ .long 193495088 # Hash in Bucket 3
+ .long 193491849 # Hash in Bucket 4
+ .long 259227804 # Hash in Bucket 4
+ .long 2090147939 # Hash in Bucket 4
+ .long -1257882357 # Hash in Bucket 4
+ .long .Linfo_string5 # String in Bucket 3: int
+ .long .Linfo_string3 # String in Bucket 4: foo
+ .long .Linfo_string7 # String in Bucket 4: Foo2a
+ .long .Linfo_string9 # String in Bucket 4: char
+ .long .Linfo_string4 # String in Bucket 4: _Z3foov
+ .long .Lnames2-.Lnames_entries0 # Offset in Bucket 3
+ .long .Lnames0-.Lnames_entries0 # Offset in Bucket 4
+ .long .Lnames3-.Lnames_entries0 # Offset in Bucket 4
+ .long .Lnames4-.Lnames_entries0 # Offset in Bucket 4
+ .long .Lnames1-.Lnames_entries0 # Offset in Bucket 4
+.Lnames_abbrev_start0:
+ .ascii "\350\004" # Abbrev code
+ .byte 19 # DW_TAG_structure_type
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\354\004" # Abbrev code
+ .byte 19 # DW_TAG_structure_type
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\210\t" # Abbrev code
+ .byte 36 # DW_TAG_base_type
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\310\013" # Abbrev code
+ .byte 46 # DW_TAG_subprogram
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\214\t" # Abbrev code
+ .byte 36 # DW_TAG_base_type
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames2:
+ .ascii "\210\t" # Abbreviation code
+ .long 63 # DW_IDX_die_offset
+ .byte 0 # End of list: int
+.Lnames0:
+ .ascii "\310\013" # Abbreviation code
+ .long 35 # DW_IDX_die_offset
+ .byte 0 # End of list: foo
+.Lnames3:
+ .ascii "\354\004" # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 35 # DW_IDX_die_offset
+ .ascii "\350\004" # Abbreviation code
+ .long 67 # DW_IDX_die_offset
+ .byte 0 # End of list: Foo2a
+.Lnames4:
+ .ascii "\214\t" # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 56 # DW_IDX_die_offset
+ .byte 0 # End of list: char
+.Lnames1:
+ .ascii "\310\013" # Abbreviation code
+ .long 35 # DW_IDX_die_offset
+ .byte 0 # End of list: _Z3foov
+ .p2align 2, 0x0
+.Lnames_end0:
+ .ident "clang version 18.0.0git (git@github.com:ayermolo/llvm-project.git db35fa8fc524127079662802c4735dbf397f86d0)"
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/Inputs/dwarf5-types-debug-names-main.s b/bolt/test/X86/Inputs/dwarf5-types-debug-names-main.s
new file mode 100644
index 000000000000..cb9d287081d7
--- /dev/null
+++ b/bolt/test/X86/Inputs/dwarf5-types-debug-names-main.s
@@ -0,0 +1,391 @@
+# clang++ -g2 -gdwarf-5 -gpubnames -fdebug-types-section
+# header.h
+# struct Foo2a {
+# char *c1;
+# };
+# main.cpp
+# #include "header.h"
+# int main() {
+# Foo2a f3;
+# return 0;
+# }
+
+ .text
+ .file "main.cpp"
+ .globl main # -- Begin function main
+ .p2align 4, 0x90
+ .type main,@function
+main: # @main
+.Lfunc_begin0:
+ .file 0 "/typeDedupSmall" "main.cpp" md5 0x6e787b94dac2f817bb35cfde8006dc82
+ .loc 0 2 0 # main.cpp:2:0
+ .cfi_startproc
+# %bb.0: # %entry
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ movl $0, -4(%rbp)
+.Ltmp0:
+ .loc 0 4 2 prologue_end # main.cpp:4:2
+ xorl %eax, %eax
+ .loc 0 4 2 epilogue_begin is_stmt 0 # main.cpp:4:2
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp1:
+.Lfunc_end0:
+ .size main, .Lfunc_end0-main
+ .cfi_endproc
+ # -- End function
+ .file 1 "." "header.h" md5 0x53699580704254cb1dd2a83230f8a7ea
+ .section .debug_info,"G",@progbits,1175092228111723119,comdat
+.Ltu_begin0:
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 5 # DWARF version number
+ .byte 2 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .quad 1175092228111723119 # Type Signature
+ .long 35 # Type DIE Offset
+ .byte 1 # Abbrev [1] 0x18:0x25 DW_TAG_type_unit
+ .short 33 # DW_AT_language
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .byte 2 # Abbrev [2] 0x23:0x10 DW_TAG_structure_type
+ .byte 5 # DW_AT_calling_convention
+ .byte 8 # DW_AT_name
+ .byte 8 # DW_AT_byte_size
+ .byte 1 # DW_AT_decl_file
+ .byte 1 # DW_AT_decl_line
+ .byte 3 # Abbrev [3] 0x29:0x9 DW_TAG_member
+ .byte 6 # DW_AT_name
+ .long 51 # DW_AT_type
+ .byte 1 # DW_AT_decl_file
+ .byte 2 # DW_AT_decl_line
+ .byte 0 # DW_AT_data_member_location
+ .byte 0 # End Of Children Mark
+ .byte 4 # Abbrev [4] 0x33:0x5 DW_TAG_pointer_type
+ .long 56 # DW_AT_type
+ .byte 5 # Abbrev [5] 0x38:0x4 DW_TAG_base_type
+ .byte 7 # DW_AT_name
+ .byte 6 # DW_AT_encoding
+ .byte 1 # DW_AT_byte_size
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end0:
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 65 # DW_TAG_type_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 2 # Abbreviation Code
+ .byte 19 # DW_TAG_structure_type
+ .byte 1 # DW_CHILDREN_yes
+ .byte 54 # DW_AT_calling_convention
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 3 # Abbreviation Code
+ .byte 13 # DW_TAG_member
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 56 # DW_AT_data_member_location
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 4 # Abbreviation Code
+ .byte 15 # DW_TAG_pointer_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 5 # Abbreviation Code
+ .byte 36 # DW_TAG_base_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 62 # DW_AT_encoding
+ .byte 11 # DW_FORM_data1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 6 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 37 # DW_AT_producer
+ .byte 37 # DW_FORM_strx1
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 37 # DW_FORM_strx1
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 115 # DW_AT_addr_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 7 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 8 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 9 # Abbreviation Code
+ .byte 19 # DW_TAG_structure_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 60 # DW_AT_declaration
+ .byte 25 # DW_FORM_flag_present
+ .byte 105 # DW_AT_signature
+ .byte 32 # DW_FORM_ref_sig8
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long .Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit
+.Ldebug_info_start1:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .byte 6 # Abbrev [6] 0xc:0x40 DW_TAG_compile_unit
+ .byte 0 # DW_AT_producer
+ .short 33 # DW_AT_language
+ .byte 1 # DW_AT_name
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .byte 2 # DW_AT_comp_dir
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .long .Laddr_table_base0 # DW_AT_addr_base
+ .byte 7 # Abbrev [7] 0x23:0x1b DW_TAG_subprogram
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .byte 3 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 2 # DW_AT_decl_line
+ .long 62 # DW_AT_type
+ # DW_AT_external
+ .byte 8 # Abbrev [8] 0x32:0xb DW_TAG_variable
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 112
+ .byte 5 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 3 # DW_AT_decl_line
+ .long 66 # DW_AT_type
+ .byte 0 # End Of Children Mark
+ .byte 5 # Abbrev [5] 0x3e:0x4 DW_TAG_base_type
+ .byte 4 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 9 # Abbrev [9] 0x42:0x9 DW_TAG_structure_type
+ # DW_AT_declaration
+ .quad 1175092228111723119 # DW_AT_signature
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end1:
+ .section .debug_str_offsets,"",@progbits
+ .long 40 # Length of String Offsets Set
+ .short 5
+ .short 0
+.Lstr_offsets_base0:
+ .section .debug_str,"MS",@progbits,1
+.Linfo_string0:
+ .asciz "clang version 18.0.0git (git@github.com:ayermolo/llvm-project.git db35fa8fc524127079662802c4735dbf397f86d0)" # string offset=0
+.Linfo_string1:
+ .asciz "main.cpp" # string offset=108
+.Linfo_string2:
+ .asciz "/home/typeDedupSmall" # string offset=117
+.Linfo_string3:
+ .asciz "main" # string offset=170
+.Linfo_string4:
+ .asciz "int" # string offset=175
+.Linfo_string5:
+ .asciz "f3" # string offset=179
+.Linfo_string6:
+ .asciz "Foo2a" # string offset=182
+.Linfo_string7:
+ .asciz "c1" # string offset=188
+.Linfo_string8:
+ .asciz "char" # string offset=191
+ .section .debug_str_offsets,"",@progbits
+ .long .Linfo_string0
+ .long .Linfo_string1
+ .long .Linfo_string2
+ .long .Linfo_string3
+ .long .Linfo_string4
+ .long .Linfo_string5
+ .long .Linfo_string7
+ .long .Linfo_string8
+ .long .Linfo_string6
+ .section .debug_addr,"",@progbits
+ .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+ .short 5 # DWARF version number
+ .byte 8 # Address size
+ .byte 0 # Segment selector size
+.Laddr_table_base0:
+ .quad .Lfunc_begin0
+.Ldebug_addr_end0:
+ .section .debug_names,"",@progbits
+ .long .Lnames_end0-.Lnames_start0 # Header: unit length
+.Lnames_start0:
+ .short 5 # Header: version
+ .short 0 # Header: padding
+ .long 1 # Header: compilation unit count
+ .long 1 # Header: local type unit count
+ .long 0 # Header: foreign type unit count
+ .long 4 # Header: bucket count
+ .long 4 # Header: name count
+ .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+ .long 8 # Header: augmentation string size
+ .ascii "LLVM0700" # Header: augmentation string
+ .long .Lcu_begin0 # Compilation unit 0
+ .long .Ltu_begin0 # Type unit 0
+ .long 1 # Bucket 0
+ .long 0 # Bucket 1
+ .long 3 # Bucket 2
+ .long 4 # Bucket 3
+ .long 193495088 # Hash in Bucket 0
+ .long 259227804 # Hash in Bucket 0
+ .long 2090499946 # Hash in Bucket 2
+ .long 2090147939 # Hash in Bucket 3
+ .long .Linfo_string4 # String in Bucket 0: int
+ .long .Linfo_string6 # String in Bucket 0: Foo2a
+ .long .Linfo_string3 # String in Bucket 2: main
+ .long .Linfo_string8 # String in Bucket 3: char
+ .long .Lnames1-.Lnames_entries0 # Offset in Bucket 0
+ .long .Lnames2-.Lnames_entries0 # Offset in Bucket 0
+ .long .Lnames0-.Lnames_entries0 # Offset in Bucket 2
+ .long .Lnames3-.Lnames_entries0 # Offset in Bucket 3
+.Lnames_abbrev_start0:
+ .ascii "\350\004" # Abbrev code
+ .byte 19 # DW_TAG_structure_type
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\354\004" # Abbrev code
+ .byte 19 # DW_TAG_structure_type
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\210\t" # Abbrev code
+ .byte 36 # DW_TAG_base_type
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\310\013" # Abbrev code
+ .byte 46 # DW_TAG_subprogram
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .ascii "\214\t" # Abbrev code
+ .byte 36 # DW_TAG_base_type
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames1:
+ .ascii "\210\t" # Abbreviation code
+ .long 62 # DW_IDX_die_offset
+ .byte 0 # End of list: int
+.Lnames2:
+ .ascii "\354\004" # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 35 # DW_IDX_die_offset
+ .ascii "\350\004" # Abbreviation code
+ .long 66 # DW_IDX_die_offset
+ .byte 0 # End of list: Foo2a
+.Lnames0:
+ .ascii "\310\013" # Abbreviation code
+ .long 35 # DW_IDX_die_offset
+ .byte 0 # End of list: main
+.Lnames3:
+ .ascii "\214\t" # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 56 # DW_IDX_die_offset
+ .byte 0 # End of list: char
+ .p2align 2, 0x0
+.Lnames_end0:
+ .ident "clang version 18.0.0git (git@github.com:ayermolo/llvm-project.git db35fa8fc524127079662802c4735dbf397f86d0)"
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/dwarf5-debug-names-generate-debug-names.test b/bolt/test/X86/dwarf5-debug-names-generate-debug-names.test
new file mode 100644
index 000000000000..b8789a6ad230
--- /dev/null
+++ b/bolt/test/X86/dwarf5-debug-names-generate-debug-names.test
@@ -0,0 +1,154 @@
+; REQUIRES: system-linux
+
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5_main.s -o %tmain.o
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5_helper.s -o %thelper.o
+; RUN: %clang %cflags -gdwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q
+; RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --create-debug-names-section=true
+; RUN: llvm-dwarfdump --debug-info -r 0 --debug-names %t.bolt > %t.txt
+; RUN: cat %t.txt | FileCheck --check-prefix=BOLT %s
+
+;; Tests BOLT generates .debug_names with --create-debug-names-section.
+;; Also applicable when binary has CUs that do not contribute to .debug_names pre-bolt.
+
+; BOLT: [[OFFSET1:0x[0-9a-f]*]]: Compile Unit
+; BOLT: [[OFFSET2:0x[0-9a-f]*]]: Compile Unit
+; BOLT: Name Index @ 0x0 {
+; BOLT-NEXT: Header {
+; BOLT-NEXT: Length: 0x103
+; BOLT-NEXT: Format: DWARF32
+; BOLT-NEXT: Version: 5
+; BOLT-NEXT: CU count: 2
+; BOLT-NEXT: Local TU count: 0
+; BOLT-NEXT: Foreign TU count: 0
+; BOLT-NEXT: Bucket count: 8
+; BOLT-NEXT: Name count: 8
+; BOLT-NEXT: Abbreviations table size: 0x19
+; BOLT-NEXT: Augmentation: 'BOLT'
+; BOLT-NEXT: }
+; BOLT-NEXT: Compilation Unit offsets [
+; BOLT-NEXT: CU[0]: [[OFFSET1]]
+; BOLT-NEXT: CU[1]: [[OFFSET2]]
+; BOLT-NEXT: ]
+; BOLT-NEXT: Abbreviations [
+; BOLT-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_variable
+; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 0 [
+; BOLT-NEXT: Name 1 {
+; BOLT-NEXT: Hash: 0xB888030
+; BOLT-NEXT: String: {{.+}} "int"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000033
+; BOLT-NEXT: }
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x0000007f
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 1 [
+; BOLT-NEXT: Name 2 {
+; BOLT-NEXT: Hash: 0xB887389
+; BOLT-NEXT: String: {{.+}} "foo"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV2]]
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x0000005e
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 3 {
+; BOLT-NEXT: Hash: 0x8C06E589
+; BOLT-NEXT: String: {{.+}} "_Z3usePiS_"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV2]]
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000028
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 2 [
+; BOLT-NEXT: Name 4 {
+; BOLT-NEXT: Hash: 0xB88B3D2
+; BOLT-NEXT: String: {{.+}} "use"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV2]]
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000028
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 5 {
+; BOLT-NEXT: Hash: 0x7C9A7F6A
+; BOLT-NEXT: String: {{.+}} "main"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV2]]
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000049
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 6 {
+; BOLT-NEXT: Hash: 0xFDE4B5D2
+; BOLT-NEXT: String: {{.+}} "fooVar"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV3]]
+; BOLT-NEXT: Tag: DW_TAG_variable
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000028
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 3 [
+; BOLT-NEXT: Name 7 {
+; BOLT-NEXT: Hash: 0x7C952063
+; BOLT-NEXT: String: {{.+}} "char"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000092
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 4 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 5 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 6 [
+; BOLT-NEXT: Name 8 {
+; BOLT-NEXT: Hash: 0xB5063CFE
+; BOLT-NEXT: String: {{.+}} "_Z3fooi"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV2]]
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x0000005e
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 7 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: }
diff --git a/bolt/test/X86/dwarf5-debug-names.test b/bolt/test/X86/dwarf5-debug-names.test
new file mode 100644
index 000000000000..de0e88d7a36a
--- /dev/null
+++ b/bolt/test/X86/dwarf5-debug-names.test
@@ -0,0 +1,263 @@
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-debug-names-main.s -o %tmain.o
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-debug-names-helper.s -o %thelper.o
+; RUN: %clang %cflags -gdwarf-5 %tmain.o %thelper.o -o %tmain.exe
+; RUN: llvm-bolt %tmain.exe -o %tmain.exe.bolt --update-debug-sections
+; RUN: llvm-dwarfdump --debug-info -r 0 --debug-names %tmain.exe.bolt > %tlog.txt
+; RUN: cat %tlog.txt | FileCheck -check-prefix=BOLT %s
+
+;; Tests that BOLT correctly generates .debug_names section with two CUs
+
+; BOLT: [[OFFSET1:0x[0-9a-f]*]]: Compile Unit
+; BOLT: [[OFFSET2:0x[0-9a-f]*]]: Compile Unit
+; BOLT: Name Index @ 0x0 {
+; BOLT-NEXT: Header {
+; BOLT-NEXT: Length: 0x1C2
+; BOLT-NEXT: Format: DWARF32
+; BOLT-NEXT: Version: 5
+; BOLT-NEXT: CU count: 2
+; BOLT-NEXT: Local TU count: 0
+; BOLT-NEXT: Foreign TU count: 0
+; BOLT-NEXT: Bucket count: 14
+; BOLT-NEXT: Name count: 15
+; BOLT-NEXT: Abbreviations table size: 0x29
+; BOLT-NEXT: Augmentation: 'BOLT'
+; BOLT-NEXT: }
+; BOLT-NEXT: Compilation Unit offsets [
+; BOLT-NEXT: CU[0]: [[OFFSET1]]
+; BOLT-NEXT: CU[1]: [[OFFSET2]]
+; BOLT-NEXT: ]
+; BOLT-NEXT: Abbreviations [
+; BOLT-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_namespace
+; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV5:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_variable
+; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 0 [
+; BOLT-NEXT: Name 1 {
+; BOLT-NEXT: Hash: 0x59796C
+; BOLT-NEXT: String: {{.+}} "t3"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x0000002f
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 1 [
+; BOLT-NEXT: Name 2 {
+; BOLT-NEXT: Hash: 0x7C96E4DB
+; BOLT-NEXT: String: {{.+}} "Foo2"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x000000eb
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 2 [
+; BOLT-NEXT: Name 3 {
+; BOLT-NEXT: Hash: 0x8CFC710C
+; BOLT-NEXT: String: {{.+}} "(anonymous namespace)"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV2]]
+; BOLT-NEXT: Tag: DW_TAG_namespace
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000061
+; BOLT-NEXT: }
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV2]]
+; BOLT-NEXT: Tag: DW_TAG_namespace
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000061
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 4 {
+; BOLT-NEXT: Hash: 0xBA564846
+; BOLT-NEXT: String: {{.+}} "Foo2Int"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x0000005a
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 3 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 4 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 5 [
+; BOLT-NEXT: Name 5 {
+; BOLT-NEXT: Hash: 0xB887389
+; BOLT-NEXT: String: {{.+}} "Foo"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x000000c9
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 6 {
+; BOLT-NEXT: Hash: 0xB887389
+; BOLT-NEXT: String: {{.+}} "foo"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV3]]
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000033
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 7 {
+; BOLT-NEXT: Hash: 0x7C952063
+; BOLT-NEXT: String: {{.+}} "char"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV4]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x0000009f
+; BOLT-NEXT: }
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV4]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x000000c5
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 6 [
+; BOLT-NEXT: Name 8 {
+; BOLT-NEXT: Hash: 0x392140FA
+; BOLT-NEXT: String: {{.+}} "t2<&fooint>"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x0000003f
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 9 {
+; BOLT-NEXT: Hash: 0xFDE48034
+; BOLT-NEXT: String: {{.+}} "fooint"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV5]]
+; BOLT-NEXT: Tag: DW_TAG_variable
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000024
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 7 [
+; BOLT-NEXT: Name 10 {
+; BOLT-NEXT: Hash: 0xB5063D0B
+; BOLT-NEXT: String: {{.+}} "_Z3foov"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV3]]
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000033
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 8 [
+; BOLT-NEXT: Name 11 {
+; BOLT-NEXT: Hash: 0x5979AC
+; BOLT-NEXT: String: {{.+}} "v1"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV5]]
+; BOLT-NEXT: Tag: DW_TAG_variable
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000024
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 9 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 10 [
+; BOLT-NEXT: Name 12 {
+; BOLT-NEXT: Hash: 0xB888030
+; BOLT-NEXT: String: {{.+}} "int"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV4]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x0000002f
+; BOLT-NEXT: }
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV4]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x0000005d
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 13 {
+; BOLT-NEXT: Hash: 0xF73809C
+; BOLT-NEXT: String: {{.+}} "Foo2a"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000078
+; BOLT-NEXT: }
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000104
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 14 {
+; BOLT-NEXT: Hash: 0x7C9A7F6A
+; BOLT-NEXT: String: {{.+}} "main"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV3]]
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000073
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 11 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 12 [
+; BOLT-NEXT: Name 15 {
+; BOLT-NEXT: Hash: 0x59796A
+; BOLT-NEXT: String: {{.+}} "t1"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000062
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 13 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: }
diff --git a/bolt/test/X86/dwarf5-df-debug-names-generate-debug-names.test b/bolt/test/X86/dwarf5-df-debug-names-generate-debug-names.test
new file mode 100644
index 000000000000..3be327b780e5
--- /dev/null
+++ b/bolt/test/X86/dwarf5-df-debug-names-generate-debug-names.test
@@ -0,0 +1,192 @@
+; RUN: rm -rf %t
+; RUN: mkdir %t
+; RUN: cd %t
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-dualcu-main.s \
+; RUN: -split-dwarf-file=main.dwo -o main.o
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-dualcu-helper.s \
+; RUN: -split-dwarf-file=helper.dwo -o helper.o
+; RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=split main.o helper.o -o main.exe -fno-pic -no-pie
+; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --create-debug-names-section=true
+; RUN: llvm-dwarfdump --debug-info --debug-names main.exe.bolt > %t/foo.txt
+; RUN: cat %t/foo.txt | FileCheck -check-prefix=BOLT %s
+
+;; Tests BOLT generates .debug_names with --create-debug-names-section.
+;; Also applicable when binary has split dwarf CUs that do not contribute to .debug_names pre-bolt.
+
+; BOLT: [[OFFSET1:0x[0-9a-f]*]]: Compile Unit
+; BOLT: [[OFFSET2:0x[0-9a-f]*]]: Compile Unit
+; BOLT: Name Index @ 0x0 {
+; BOLT-NEXT: Header {
+; BOLT-NEXT: Length: 0x148
+; BOLT-NEXT: Format: DWARF32
+; BOLT-NEXT: Version: 5
+; BOLT-NEXT: CU count: 2
+; BOLT-NEXT: Local TU count: 0
+; BOLT-NEXT: Foreign TU count: 0
+; BOLT-NEXT: Bucket count: 11
+; BOLT-NEXT: Name count: 11
+; BOLT-NEXT: Abbreviations table size: 0x19
+; BOLT-NEXT: Augmentation: 'BOLT'
+; BOLT-NEXT: }
+; BOLT-NEXT: Compilation Unit offsets [
+; BOLT-NEXT: CU[0]: [[OFFSET1]]
+; BOLT-NEXT: CU[1]: [[OFFSET2]]
+; BOLT-NEXT: ]
+; BOLT-NEXT: Abbreviations [
+; BOLT-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_variable
+; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 0 [
+; BOLT-NEXT: Name 1 {
+; BOLT-NEXT: Hash: 0x2B61E
+; BOLT-NEXT: String: {{.+}} "y"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_variable
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000029
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 2 {
+; BOLT-NEXT: Hash: 0x7C952063
+; BOLT-NEXT: String: {{.+}} "char"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV2]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x0000008c
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 1 [
+; BOLT-NEXT: Name 3 {
+; BOLT-NEXT: Hash: 0x2B609
+; BOLT-NEXT: String: {{.+}} "d"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_variable
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000029
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 4 {
+; BOLT-NEXT: Hash: 0x2B61F
+; BOLT-NEXT: String: {{.+}} "z"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_variable
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 2 [
+; BOLT-NEXT: Name 5 {
+; BOLT-NEXT: Hash: 0xB88B3D2
+; BOLT-NEXT: String: {{.+}} "use"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV3]]
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000034
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 3 [
+; BOLT-NEXT: Name 6 {
+; BOLT-NEXT: Hash: 0x45A3B006
+; BOLT-NEXT: String: {{.+}} "_Z6helperii"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV3]]
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000034
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 4 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 5 [
+; BOLT-NEXT: Name 7 {
+; BOLT-NEXT: Hash: 0x8C06E589
+; BOLT-NEXT: String: {{.+}} "_Z3usePiS_"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV3]]
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000034
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 6 [
+; BOLT-NEXT: Name 8 {
+; BOLT-NEXT: Hash: 0xB888030
+; BOLT-NEXT: String: {{.+}} "int"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV2]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000025
+; BOLT-NEXT: }
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV2]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000025
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 7 [
+; BOLT-NEXT: Name 9 {
+; BOLT-NEXT: Hash: 0x1D853E5
+; BOLT-NEXT: String: {{.+}} "helper"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV3]]
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000034
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 10 {
+; BOLT-NEXT: Hash: 0x7C9A7F6A
+; BOLT-NEXT: String: {{.+}} "main"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV3]]
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000057
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 8 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 9 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 10 [
+; BOLT-NEXT: Name 11 {
+; BOLT-NEXT: Hash: 0x2B61D
+; BOLT-NEXT: String: {{.+}} "x"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_variable
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
diff --git a/bolt/test/X86/dwarf5-df-debug-names.test b/bolt/test/X86/dwarf5-df-debug-names.test
new file mode 100644
index 000000000000..0352d0ff7208
--- /dev/null
+++ b/bolt/test/X86/dwarf5-df-debug-names.test
@@ -0,0 +1,149 @@
+; RUN: rm -rf %t
+; RUN: mkdir %t
+; RUN: cd %t
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-debug-names-main.s \
+; RUN: -split-dwarf-file=main.dwo -o main.o
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-debug-names-helper.s \
+; RUN: -split-dwarf-file=helper.dwo -o helper.o
+; RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=split main.o helper.o -o main.exe
+; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections
+; RUN: llvm-dwarfdump --debug-info --debug-names main.exe.bolt > log.txt
+; RUN: cat log.txt | FileCheck -check-prefix=BOLT %s
+
+;; Tests that BOLT correctly generates .debug_names section with two CUs for split dwarf.
+
+; BOLT: [[OFFSET:0x[0-9a-f]*]]: Compile Unit
+; BOLT: [[OFFSET1:0x[0-9a-f]*]]: Compile Unit
+; BOLT: Name Index @ 0x0 {
+; BOLT-NEXT: Header {
+; BOLT-NEXT: Length: 0xF4
+; BOLT-NEXT: Format: DWARF32
+; BOLT-NEXT: Version: 5
+; BOLT-NEXT: CU count: 2
+; BOLT-NEXT: Local TU count: 0
+; BOLT-NEXT: Foreign TU count: 0
+; BOLT-NEXT: Bucket count: 7
+; BOLT-NEXT: Name count: 7
+; BOLT-NEXT: Abbreviations table size: 0x21
+; BOLT-NEXT: Augmentation: 'BOLT'
+; BOLT-NEXT: }
+; BOLT-NEXT: Compilation Unit offsets [
+; BOLT-NEXT: CU[0]: [[OFFSET]]
+; BOLT-NEXT: CU[1]: [[OFFSET1]]
+; BOLT-NEXT: ]
+; BOLT-NEXT: Abbreviations [
+; BOLT-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_variable
+; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 0 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 1 [
+; BOLT-NEXT: Name 1 {
+; BOLT-NEXT: Hash: 0x7C96E4DB
+; BOLT-NEXT: String: {{.+}} "Foo2"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000068
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 2 [
+; BOLT-NEXT: Name 2 {
+; BOLT-NEXT: Hash: 0xBA564846
+; BOLT-NEXT: String: {{.+}} "Foo2Int"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000025
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 3 [
+; BOLT-NEXT: Name 3 {
+; BOLT-NEXT: Hash: 0xB888030
+; BOLT-NEXT: String: {{.+}} "int"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV2]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000043
+; BOLT-NEXT: }
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV2]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000056
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 4 {
+; BOLT-NEXT: Hash: 0xF73809C
+; BOLT-NEXT: String: {{.+}} "Foo2a"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000078
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 5 {
+; BOLT-NEXT: Hash: 0x7C96CB76
+; BOLT-NEXT: String: {{.+}} "fint"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV3]]
+; BOLT-NEXT: Tag: DW_TAG_variable
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 6 {
+; BOLT-NEXT: Hash: 0x7C9A7F6A
+; BOLT-NEXT: String: {{.+}} "main"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV4]]
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 4 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 5 [
+; BOLT-NEXT: Name 7 {
+; BOLT-NEXT: Hash: 0x7C952063
+; BOLT-NEXT: String: {{.+}} "char"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV2]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000064
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 6 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: }
diff --git a/bolt/test/X86/dwarf5-df-one-cu-debug-names.test b/bolt/test/X86/dwarf5-df-one-cu-debug-names.test
new file mode 100644
index 000000000000..246ce7efb3ba
--- /dev/null
+++ b/bolt/test/X86/dwarf5-df-one-cu-debug-names.test
@@ -0,0 +1,101 @@
+; RUN: rm -rf %t
+; RUN: mkdir %t
+; RUN: cd %t
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-debug-names-main.s \
+; RUN: -split-dwarf-file=main.dwo -o main.o
+; RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=split main.o -o main.exe
+; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections
+; RUN: llvm-dwarfdump --debug-info --debug-names main.exe.bolt > log.txt
+; RUN: cat log.txt | FileCheck -check-prefix=BOLT %s
+
+;; Tests that BOLT correctly generates .debug_names section with one CU for split dwarf.
+
+; BOLT: [[OFFSET:0x[0-9a-f]*]]: Compile Unit
+; BOLT: Name Index @ 0x0 {
+; BOLT-NEXT: Header {
+; BOLT-NEXT: Length: 0xA9
+; BOLT-NEXT: Format: DWARF32
+; BOLT-NEXT: Version: 5
+; BOLT-NEXT: CU count: 1
+; BOLT-NEXT: Local TU count: 0
+; BOLT-NEXT: Foreign TU count: 0
+; BOLT-NEXT: Bucket count: 5
+; BOLT-NEXT: Name count: 5
+; BOLT-NEXT: Abbreviations table size: 0x13
+; BOLT-NEXT: Augmentation: 'BOLT'
+; BOLT-NEXT: }
+; BOLT-NEXT: Compilation Unit offsets [
+; BOLT-NEXT: CU[0]: 0x00000000
+; BOLT-NEXT: ]
+; BOLT-NEXT: Abbreviations [
+; BOLT-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 0 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 1 [
+; BOLT-NEXT: Name 1 {
+; BOLT-NEXT: Hash: 0x7C96E4DB
+; BOLT-NEXT: String: {{.+}} "Foo2"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000068
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 2 {
+; BOLT-NEXT: Hash: 0x7C9A7F6A
+; BOLT-NEXT: String: {{.+}} "main"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV2]]
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 2 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 3 [
+; BOLT-NEXT: Name 3 {
+; BOLT-NEXT: Hash: 0xB888030
+; BOLT-NEXT: String: {{.+}} "int"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV3]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000056
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 4 [
+; BOLT-NEXT: Name 4 {
+; BOLT-NEXT: Hash: 0xF73809C
+; BOLT-NEXT: String: {{.+}} "Foo2a"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000078
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 5 {
+; BOLT-NEXT: Hash: 0x7C952063
+; BOLT-NEXT: String: {{.+}} "char"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV3]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000064
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: }
diff --git a/bolt/test/X86/dwarf5-df-types-debug-names.test b/bolt/test/X86/dwarf5-df-types-debug-names.test
new file mode 100644
index 000000000000..fdb962b80c75
--- /dev/null
+++ b/bolt/test/X86/dwarf5-df-types-debug-names.test
@@ -0,0 +1,234 @@
+; RUN: rm -rf %t
+; RUN: mkdir %t
+; RUN: cd %t
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-types-debug-names-main.s \
+; RUN: -split-dwarf-file=main.dwo -o main.o
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-types-debug-names-helper.s \
+; RUN: -split-dwarf-file=helper.dwo -o helper.o
+; RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=split main.o helper.o -o main.exe
+; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections
+; RUN: llvm-dwarfdump --debug-info -r 0 main.dwo.dwo > log.txt
+; RUN: llvm-dwarfdump --debug-info -r 0 helper.dwo.dwo >> log.txt
+; RUN: llvm-dwarfdump --debug-info --debug-names main.exe.bolt >> log.txt
+; RUN: cat log.txt | FileCheck -check-prefix=BOLT %s
+
+;; Tests that BOLT correctly generates .debug_names section with two CUs and foreign TUs.
+
+; BOLT: type_signature = [[TYPE:0x[0-9a-f]*]]
+; BOLT: type_signature = [[TYPE1:0x[0-9a-f]*]]
+; BOLT: Compile Unit
+; BOLT: type_signature = [[TYPE2:0x[0-9a-f]*]]
+; BOLT: type_signature = [[TYPE3:0x[0-9a-f]*]]
+; BOLT: Compile Unit
+; BOLT: [[OFFSET:0x[0-9a-f]*]]: Compile Unit
+; BOLT: [[OFFSET1:0x[0-9a-f]*]]: Compile Unit
+
+; BOLT: Name Index @ 0x0 {
+; BOLT-NEXT: Header {
+; BOLT-NEXT: Length: 0x174
+; BOLT-NEXT: Format: DWARF32
+; BOLT-NEXT: Version: 5
+; BOLT-NEXT: CU count: 2
+; BOLT-NEXT: Local TU count: 0
+; BOLT-NEXT: Foreign TU count: 4
+; BOLT-NEXT: Bucket count: 9
+; BOLT-NEXT: Name count: 9
+; BOLT-NEXT: Abbreviations table size: 0x2D
+; BOLT-NEXT: Augmentation: 'BOLT'
+; BOLT-NEXT: }
+; BOLT-NEXT: Compilation Unit offsets [
+; BOLT-NEXT: CU[0]: [[OFFSET]]
+; BOLT-NEXT: CU[1]: [[OFFSET1]]
+; BOLT-NEXT: ]
+; BOLT-NEXT: Foreign Type Unit signatures [
+; BOLT-NEXT: ForeignTU[0]: [[TYPE]]
+; BOLT-NEXT: ForeignTU[1]: [[TYPE1]]
+; BOLT-NEXT: ForeignTU[2]: [[TYPE2]]
+; BOLT-NEXT: ForeignTU[3]: [[TYPE3]]
+; BOLT-NEXT: ]
+; BOLT-NEXT: Abbreviations [
+; BOLT-NEXT: Abbreviation [[ABBREV:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_type_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_variable
+; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_type_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 0 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 1 [
+; BOLT-NEXT: Name 1 {
+; BOLT-NEXT: Hash: 0x7C96E4DB
+; BOLT-NEXT: String: {{.+}} "Foo2"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_type_unit: 0x00
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000021
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 2 {
+; BOLT-NEXT: Hash: 0xB5063D0B
+; BOLT-NEXT: String: {{.+}} "_Z3foov"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000029
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 3 {
+; BOLT-NEXT: Hash: 0xFDE48034
+; BOLT-NEXT: String: {{.+}} "fooint"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV2]]
+; BOLT-NEXT: Tag: DW_TAG_variable
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 2 [
+; BOLT-NEXT: Name 4 {
+; BOLT-NEXT: Hash: 0xB888030
+; BOLT-NEXT: String: {{.+}} "int"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV3]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000025
+; BOLT-NEXT: }
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: 0x5
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_type_unit: 0x02
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x0000003f
+; BOLT-NEXT: }
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV3]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000056
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 3 [
+; BOLT-NEXT: Name 5 {
+; BOLT-NEXT: Hash: 0xB887389
+; BOLT-NEXT: String: {{.+}} "foo"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000029
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 6 {
+; BOLT-NEXT: Hash: 0xF73809C
+; BOLT-NEXT: String: {{.+}} "Foo2a"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_type_unit: 0x01
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000021
+; BOLT-NEXT: }
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_type_unit: 0x03
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000021
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 7 {
+; BOLT-NEXT: Hash: 0xBA564846
+; BOLT-NEXT: String: {{.+}} "Foo2Int"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_type_unit: 0x02
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000021
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 4 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 5 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 6 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 7 [
+; BOLT-NEXT: Name 8 {
+; BOLT-NEXT: Hash: 0x7C9A7F6A
+; BOLT-NEXT: String: {{.+}} "main"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 8 [
+; BOLT-NEXT: Name 9 {
+; BOLT-NEXT: Hash: 0x7C952063
+; BOLT-NEXT: String: {{.+}} "char"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: 0x5
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_type_unit: 0x00
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000036
+; BOLT-NEXT: }
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: 0x5
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_type_unit: 0x01
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000048
+; BOLT-NEXT: }
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: 0x5
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_type_unit: 0x03
+; BOLT-NEXT: DW_IDX_compile_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000048
+; BOLT-NEXT: }
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV3]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_compile_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000064
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: }
diff --git a/bolt/test/X86/dwarf5-df-types-one-cu-debug-names.test b/bolt/test/X86/dwarf5-df-types-one-cu-debug-names.test
new file mode 100644
index 000000000000..1c77583f5088
--- /dev/null
+++ b/bolt/test/X86/dwarf5-df-types-one-cu-debug-names.test
@@ -0,0 +1,129 @@
+; RUN: rm -rf %t
+; RUN: mkdir %t
+; RUN: cd %t
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-types-debug-names-main.s \
+; RUN: -split-dwarf-file=main.dwo -o main.o
+; RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=split main.o -o main.exe
+; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections
+; RUN: llvm-dwarfdump --debug-info -r 0 main.dwo.dwo > log.txt
+; RUN: llvm-dwarfdump --debug-info --debug-names main.exe.bolt >> log.txt
+; RUN: cat log.txt | FileCheck -check-prefix=BOLT %s
+
+;; Tests that BOLT correctly generates .debug_names section with one CU and foreign TUs.
+
+; BOLT: type_signature = [[TYPE:0x[0-9a-f]*]]
+; BOLT: type_signature = [[TYPE1:0x[0-9a-f]*]]
+; BOLT: Compile Unit
+; BOLT: [[OFFSET:0x[0-9a-f]*]]: Compile Unit
+; BOLT: Name Index @ 0x0 {
+; BOLT-NEXT: Header {
+; BOLT-NEXT: Length: 0xD1
+; BOLT-NEXT: Format: DWARF32
+; BOLT-NEXT: Version: 5
+; BOLT-NEXT: CU count: 1
+; BOLT-NEXT: Local TU count: 0
+; BOLT-NEXT: Foreign TU count: 2
+; BOLT-NEXT: Bucket count: 5
+; BOLT-NEXT: Name count: 5
+; BOLT-NEXT: Abbreviations table size: 0x1D
+; BOLT-NEXT: Augmentation: 'BOLT'
+; BOLT-NEXT: }
+; BOLT-NEXT: Compilation Unit offsets [
+; BOLT-NEXT: CU[0]: [[OFFSET]]
+; BOLT-NEXT: ]
+; BOLT-NEXT: Foreign Type Unit signatures [
+; BOLT-NEXT: ForeignTU[0]: [[TYPE]]
+; BOLT-NEXT: ForeignTU[1]: [[TYPE1]]
+; BOLT-NEXT: ]
+; BOLT-NEXT: Abbreviations [
+; BOLT-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_type_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_type_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 0 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 1 [
+; BOLT-NEXT: Name 1 {
+; BOLT-NEXT: Hash: 0x7C96E4DB
+; BOLT-NEXT: String: {{.+}} "Foo2"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_type_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000021
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 2 {
+; BOLT-NEXT: Hash: 0x7C9A7F6A
+; BOLT-NEXT: String: {{.+}} "main"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV2]]
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 2 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 3 [
+; BOLT-NEXT: Name 3 {
+; BOLT-NEXT: Hash: 0xB888030
+; BOLT-NEXT: String: {{.+}} "int"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV3]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000056
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 4 [
+; BOLT-NEXT: Name 4 {
+; BOLT-NEXT: Hash: 0xF73809C
+; BOLT-NEXT: String: {{.+}} "Foo2a"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_type_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000021
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 5 {
+; BOLT-NEXT: Hash: 0x7C952063
+; BOLT-NEXT: String: {{.+}} "char"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV4]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_type_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000036
+; BOLT-NEXT: }
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV4]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_type_unit: 0x01
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000048
+; BOLT-NEXT: }
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV3]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000064
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: }
diff --git a/bolt/test/X86/dwarf5-one-cu-debug-names.test b/bolt/test/X86/dwarf5-one-cu-debug-names.test
new file mode 100644
index 000000000000..e7754521d61c
--- /dev/null
+++ b/bolt/test/X86/dwarf5-one-cu-debug-names.test
@@ -0,0 +1,178 @@
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-debug-names-main.s -o %tmain.o
+; RUN: %clang %cflags -gdwarf-5 %tmain.o -o %tmain.exe
+; RUN: llvm-bolt %tmain.exe -o %tmain.exe.bolt --update-debug-sections
+; RUN: llvm-dwarfdump --debug-info -r 0 --debug-names %tmain.exe.bolt > %tlog.txt
+; RUN: cat %tlog.txt | FileCheck -check-prefix=BOLT %s
+
+;; Tests that BOLT correctly generates .debug_names section with one CU
+
+; BOLT: [[OFFSET1:0x[0-9a-f]*]]: Compile Unit
+; BOLT: Name Index @ 0x0 {
+; BOLT-NEXT: Header {
+; BOLT-NEXT: Length: 0x13E
+; BOLT-NEXT: Format: DWARF32
+; BOLT-NEXT: Version: 5
+; BOLT-NEXT: CU count: 1
+; BOLT-NEXT: Local TU count: 0
+; BOLT-NEXT: Foreign TU count: 0
+; BOLT-NEXT: Bucket count: 11
+; BOLT-NEXT: Name count: 11
+; BOLT-NEXT: Abbreviations table size: 0x1F
+; BOLT-NEXT: Augmentation: 'BOLT'
+; BOLT-NEXT: }
+; BOLT-NEXT: Compilation Unit offsets [
+; BOLT-NEXT: CU[0]: [[OFFSET1]]
+; BOLT-NEXT: ]
+; BOLT-NEXT: Abbreviations [
+; BOLT-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_variable
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV5:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_namespace
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 0 [
+; BOLT-NEXT: Name 1 {
+; BOLT-NEXT: Hash: 0xF73809C
+; BOLT-NEXT: String: {{.+}} "Foo2a"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000104
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 2 {
+; BOLT-NEXT: Hash: 0x7C952063
+; BOLT-NEXT: String: {{.+}} "char"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV2]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_die_offset: 0x000000c5
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 1 [
+; BOLT-NEXT: Name 3 {
+; BOLT-NEXT: Hash: 0xB887389
+; BOLT-NEXT: String: {{.+}} "Foo"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_die_offset: 0x000000c9
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 4 {
+; BOLT-NEXT: Hash: 0x392140FA
+; BOLT-NEXT: String: {{.+}} "t2<&fooint>"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_die_offset: 0x0000003f
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 2 [
+; BOLT-NEXT: Name 5 {
+; BOLT-NEXT: Hash: 0x7C96E4DB
+; BOLT-NEXT: String: {{.+}} "Foo2"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_die_offset: 0x000000eb
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 3 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 4 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 5 [
+; BOLT-NEXT: Name 6 {
+; BOLT-NEXT: Hash: 0x59796A
+; BOLT-NEXT: String: {{.+}} "t1"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000062
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 7 {
+; BOLT-NEXT: Hash: 0x5979AC
+; BOLT-NEXT: String: {{.+}} "v1"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV3]]
+; BOLT-NEXT: Tag: DW_TAG_variable
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000024
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 6 [
+; BOLT-NEXT: Name 8 {
+; BOLT-NEXT: Hash: 0xB888030
+; BOLT-NEXT: String: {{.+}} "int"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV2]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_die_offset: 0x0000005d
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 7 [
+; BOLT-NEXT: Name 9 {
+; BOLT-NEXT: Hash: 0x59796C
+; BOLT-NEXT: String: {{.+}} "t3"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_die_offset: 0x0000002f
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 10 {
+; BOLT-NEXT: Hash: 0x7C9A7F6A
+; BOLT-NEXT: String: {{.+}} "main"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV4]]
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000073
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 8 [
+; BOLT-NEXT: Name 11 {
+; BOLT-NEXT: Hash: 0x8CFC710C
+; BOLT-NEXT: String: {{.+}} "(anonymous namespace)"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV5]]
+; BOLT-NEXT: Tag: DW_TAG_namespace
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000061
+; BOLT-NEXT: }
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV5]]
+; BOLT-NEXT: Tag: DW_TAG_namespace
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000061
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 9 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 10 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: }
diff --git a/bolt/test/X86/dwarf5-types-debug-names.test b/bolt/test/X86/dwarf5-types-debug-names.test
new file mode 100644
index 000000000000..6a26477d4cdb
--- /dev/null
+++ b/bolt/test/X86/dwarf5-types-debug-names.test
@@ -0,0 +1,129 @@
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-types-debug-names-main.s -o %tmain.o
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-types-debug-names-helper.s -o %thelper.o
+; RUN: %clang %cflags -gdwarf-5 %tmain.o %thelper.o -o %tmain.exe
+; RUN: llvm-bolt %tmain.exe -o %tmain.exe.bolt --update-debug-sections
+; RUN: llvm-dwarfdump --debug-info --debug-names %tmain.exe.bolt > %tlog.txt
+; RUN: cat %tlog.txt | FileCheck -check-prefix=BOLT %s
+
+;; Tests that BOLT correctly generates .debug_names section with two CUs and a local TU.
+
+; BOLT: [[OFFSET:0x[0-9a-f]*]]: Type Unit
+; BOLT: [[OFFSET1:0x[0-9a-f]*]]: Compile Unit
+; BOLT: [[OFFSET2:0x[0-9a-f]*]]: Compile Unit
+
+
+; BOLT: Name Index @ 0x0 {
+; BOLT: Header {
+; BOLT: Length: 0xE1
+; BOLT: Format: DWARF32
+; BOLT: Version: 5
+; BOLT: CU count: 2
+; BOLT: Local TU count: 1
+; BOLT: Foreign TU count: 0
+; BOLT: Bucket count: 6
+; BOLT: Name count: 6
+; BOLT: Abbreviations table size: 0x21
+; BOLT: Augmentation: 'BOLT'
+; BOLT: }
+; BOLT: Compilation Unit offsets [
+; BOLT: CU[0]: [[OFFSET1]]
+; BOLT: CU[1]: [[OFFSET2]]
+; BOLT: ]
+; BOLT: Local Type Unit offsets [
+; BOLT: LocalTU[0]: [[OFFSET]]
+; BOLT: ]
+; BOLT: Abbreviations [
+; BOLT: Abbreviation [[ABBREV:0x[0-9a-f]*]] {
+; BOLT: Tag: DW_TAG_structure_type
+; BOLT: DW_IDX_type_unit: DW_FORM_data1
+; BOLT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT: }
+; BOLT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
+; BOLT: Tag: DW_TAG_subprogram
+; BOLT: DW_IDX_compile_unit: DW_FORM_data1
+; BOLT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT: }
+; BOLT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
+; BOLT: Tag: DW_TAG_base_type
+; BOLT: DW_IDX_compile_unit: DW_FORM_data1
+; BOLT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT: }
+; BOLT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
+; BOLT: Tag: DW_TAG_base_type
+; BOLT: DW_IDX_type_unit: DW_FORM_data1
+; BOLT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT: }
+; BOLT: ]
+; BOLT: Bucket 0 [
+; BOLT: Name 1 {
+; BOLT: Hash: 0xF73809C
+; BOLT: String: {{.+}} "Foo2a"
+; BOLT: Entry @ {{.+}} {
+; BOLT: Abbrev: [[ABBREV]]
+; BOLT: Tag: DW_TAG_structure_type
+; BOLT: DW_IDX_type_unit: 0x00
+; BOLT: DW_IDX_die_offset: 0x00000023
+; BOLT: }
+; BOLT: }
+; BOLT: ]
+; BOLT: Bucket 1 [
+; BOLT: Name 2 {
+; BOLT: Hash: 0xB5063D0B
+; BOLT: String: {{.+}} "_Z3foov"
+; BOLT: Entry @ {{.+}} {
+; BOLT: Abbrev: [[ABBREV1]]
+; BOLT: Tag: DW_TAG_subprogram
+; BOLT: DW_IDX_compile_unit: 0x01
+; BOLT: DW_IDX_die_offset: 0x00000024
+; BOLT: }
+; BOLT: }
+; BOLT: ]
+; BOLT: Bucket 2 [
+; BOLT: Name 3 {
+; BOLT: Hash: 0xB888030
+; BOLT: String: {{.+}} "int"
+; BOLT: Entry @ {{.+}} {
+; BOLT: Abbrev: [[ABBREV2]]
+; BOLT: Tag: DW_TAG_base_type
+; BOLT: DW_IDX_compile_unit: 0x01
+; BOLT: DW_IDX_die_offset: 0x00000040
+; BOLT: }
+; BOLT: }
+; BOLT: ]
+; BOLT: Bucket 3 [
+; BOLT: Name 4 {
+; BOLT: Hash: 0xB887389
+; BOLT: String: {{.+}} "foo"
+; BOLT: Entry @ {{.+}} {
+; BOLT: Abbrev: [[ABBREV1]]
+; BOLT: Tag: DW_TAG_subprogram
+; BOLT: DW_IDX_compile_unit: 0x01
+; BOLT: DW_IDX_die_offset: 0x00000024
+; BOLT: }
+; BOLT: }
+; BOLT: ]
+; BOLT: Bucket 4 [
+; BOLT: Name 5 {
+; BOLT: Hash: 0x7C9A7F6A
+; BOLT: String: {{.+}} "main"
+; BOLT: Entry @ {{.+}} {
+; BOLT: Abbrev: [[ABBREV1]]
+; BOLT: Tag: DW_TAG_subprogram
+; BOLT: DW_IDX_compile_unit: 0x00
+; BOLT: DW_IDX_die_offset: 0x00000024
+; BOLT: }
+; BOLT: }
+; BOLT: ]
+; BOLT: Bucket 5 [
+; BOLT: Name 6 {
+; BOLT: Hash: 0x7C952063
+; BOLT: String: {{.+}} "char"
+; BOLT: Entry @ {{.+}} {
+; BOLT: Abbrev: [[ABBREV3]]
+; BOLT: Tag: DW_TAG_base_type
+; BOLT: DW_IDX_type_unit: 0x00
+; BOLT: DW_IDX_die_offset: 0x00000038
+; BOLT: }
+; BOLT: }
+; BOLT: ]
+; BOLT: }
diff --git a/bolt/test/X86/dwarf5-types-one-cu-debug-names.test b/bolt/test/X86/dwarf5-types-one-cu-debug-names.test
new file mode 100644
index 000000000000..00a1319a5578
--- /dev/null
+++ b/bolt/test/X86/dwarf5-types-one-cu-debug-names.test
@@ -0,0 +1,98 @@
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-types-debug-names-main.s -o %tmain.o
+; RUN: %clang %cflags -gdwarf-5 %tmain.o -o %tmain.exe
+; RUN: llvm-bolt %tmain.exe -o %tmain.exe.bolt --update-debug-sections
+; RUN: llvm-dwarfdump --debug-info --debug-names %tmain.exe.bolt > %tlog.txt
+; RUN: cat %tlog.txt | FileCheck -check-prefix=BOLT %s
+
+;; Tests that BOLT correctly generates .debug_names section with one CU and a local TU.
+
+; BOLT: [[OFFSET:0x[0-9a-f]*]]: Type Unit
+; BOLT: [[OFFSET1:0x[0-9a-f]*]]: Compile Unit
+
+; BOLT:Name Index @ 0x0 {
+; BOLT-NEXT: Header {
+; BOLT-NEXT: Length: 0xA3
+; BOLT-NEXT: Format: DWARF32
+; BOLT-NEXT: Version: 5
+; BOLT-NEXT: CU count: 1
+; BOLT-NEXT: Local TU count: 1
+; BOLT-NEXT: Foreign TU count: 0
+; BOLT-NEXT: Bucket count: 4
+; BOLT-NEXT: Name count: 4
+; BOLT-NEXT: Abbreviations table size: 0x1D
+; BOLT-NEXT: Augmentation: 'BOLT'
+; BOLT-NEXT: }
+; BOLT-NEXT: Compilation Unit offsets [
+; BOLT-NEXT: CU[0]: [[OFFSET1]]
+; BOLT-NEXT: ]
+; BOLT-NEXT: Local Type Unit offsets [
+; BOLT-NEXT: LocalTU[0]: [[OFFSET]]
+; BOLT-NEXT: ]
+; BOLT-NEXT: Abbreviations [
+; BOLT-NEXT: Abbreviation [[ABBREV:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_type_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_type_unit: DW_FORM_data1
+; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 0 [
+; BOLT-NEXT: Name 1 {
+; BOLT-NEXT: Hash: 0xB888030
+; BOLT-NEXT: String: {{.+}} "int"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_die_offset: 0x0000003f
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: Name 2 {
+; BOLT-NEXT: Hash: 0xF73809C
+; BOLT-NEXT: String: {{.+}} "Foo2a"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV1]]
+; BOLT-NEXT: Tag: DW_TAG_structure_type
+; BOLT-NEXT: DW_IDX_type_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000023
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 1 [
+; BOLT-NEXT: EMPTY
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 2 [
+; BOLT-NEXT: Name 3 {
+; BOLT-NEXT: Hash: 0x7C9A7F6A
+; BOLT-NEXT: String: {{.+}} "main"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV2]]
+; BOLT-NEXT: Tag: DW_TAG_subprogram
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000024
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT: Bucket 3 [
+; BOLT-NEXT: Name 4 {
+; BOLT-NEXT: Hash: 0x7C952063
+; BOLT-NEXT: String: {{.+}} "char"
+; BOLT-NEXT: Entry @ {{.+}} {
+; BOLT-NEXT: Abbrev: [[ABBREV3]]
+; BOLT-NEXT: Tag: DW_TAG_base_type
+; BOLT-NEXT: DW_IDX_type_unit: 0x00
+; BOLT-NEXT: DW_IDX_die_offset: 0x00000038
+; BOLT-NEXT: }
+; BOLT-NEXT: }
+; BOLT-NEXT: ]
+; BOLT-NEXT:}
diff --git a/bolt/test/runtime/instrument-wrong-target.s b/bolt/test/runtime/X86/instrument-wrong-target.s
index 31cae734a0bf..343d93a89ed1 100644
--- a/bolt/test/runtime/instrument-wrong-target.s
+++ b/bolt/test/runtime/X86/instrument-wrong-target.s
@@ -1,7 +1,8 @@
# Test that BOLT errs when trying to instrument a binary with a different
# architecture than the one BOLT is built for.
-# REQUIRES: x86_64-linux,bolt-runtime,target=x86_64{{.*}}
+# REQUIRES: system-linux,bolt-runtime
+# REQUIRES: aarch64-registered-target
# RUN: llvm-mc -triple aarch64 -filetype=obj %s -o %t.o
# RUN: ld.lld -q -pie -o %t.exe %t.o
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp
index 370de12999ac..c633683570f7 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp
@@ -127,7 +127,8 @@ void MissingStdForwardCheck::registerMatchers(MatchFinder *Finder) {
hasAncestor(functionDecl().bind("func")),
hasAncestor(functionDecl(
isDefinition(), equalsBoundNode("func"), ToParam,
- unless(hasDescendant(std::move(ForwardCallMatcher)))))),
+ unless(anyOf(isDeleted(), hasDescendant(std::move(
+ ForwardCallMatcher))))))),
this);
}
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 69537964f9bc..3f90e7d63d6b 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -134,6 +134,10 @@ Changes in existing checks
<clang-tidy/checks/bugprone/unused-local-non-trivial-variable>` check by
ignoring local variable with ``[maybe_unused]`` attribute.
+- Improved :doc:`cppcoreguidelines-missing-std-forward
+ <clang-tidy/checks/cppcoreguidelines/missing-std-forward>` check by no longer
+ giving false positives for deleted functions.
+
- Cleaned up :doc:`cppcoreguidelines-prefer-member-initializer
<clang-tidy/checks/cppcoreguidelines/prefer-member-initializer>`
by removing enforcement of rule `C.48
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp
index 443f338ba204..20e43f04180f 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp
@@ -173,3 +173,18 @@ void lambda_value_reference_auxiliary_var(T&& t) {
}
} // namespace negative_cases
+
+namespace deleted_functions {
+
+template <typename T>
+void f(T &&) = delete;
+
+struct S {
+ template <typename T>
+ S(T &&) = delete;
+
+ template <typename T>
+ void operator&(T &&) = delete;
+};
+
+} // namespace deleted_functions
diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst
index d509bb807679..df399a229d8d 100644
--- a/clang/docs/ClangFormatStyleOptions.rst
+++ b/clang/docs/ClangFormatStyleOptions.rst
@@ -1095,6 +1095,146 @@ the configuration (without a prefix: ``Auto``).
bbb >>= 2;
+.. _AlignConsecutiveTableGenDefinitionColons:
+
+**AlignConsecutiveTableGenDefinitionColons** (``AlignConsecutiveStyle``) :versionbadge:`clang-format 19` :ref:`¶ <AlignConsecutiveTableGenDefinitionColons>`
+ Style of aligning consecutive TableGen definition colons.
+ This aligns the inheritance colons of consecutive definitions.
+
+ .. code-block:: c++
+
+ def Def : Parent {}
+ def DefDef : Parent {}
+ def DefDefDef : Parent {}
+
+ Nested configuration flags:
+
+ Alignment options.
+
+ They can also be read as a whole for compatibility. The choices are:
+ - None
+ - Consecutive
+ - AcrossEmptyLines
+ - AcrossComments
+ - AcrossEmptyLinesAndComments
+
+ For example, to align across empty lines and not across comments, either
+ of these work.
+
+ .. code-block:: c++
+
+ AlignConsecutiveMacros: AcrossEmptyLines
+
+ AlignConsecutiveMacros:
+ Enabled: true
+ AcrossEmptyLines: true
+ AcrossComments: false
+
+ * ``bool Enabled`` Whether aligning is enabled.
+
+ .. code-block:: c++
+
+ #define SHORT_NAME 42
+ #define LONGER_NAME 0x007f
+ #define EVEN_LONGER_NAME (2)
+ #define foo(x) (x * x)
+ #define bar(y, z) (y + z)
+
+ int a = 1;
+ int somelongname = 2;
+ double c = 3;
+
+ int aaaa : 1;
+ int b : 12;
+ int ccc : 8;
+
+ int aaaa = 12;
+ float b = 23;
+ std::string ccc;
+
+ * ``bool AcrossEmptyLines`` Whether to align across empty lines.
+
+ .. code-block:: c++
+
+ true:
+ int a = 1;
+ int somelongname = 2;
+ double c = 3;
+
+ int d = 3;
+
+ false:
+ int a = 1;
+ int somelongname = 2;
+ double c = 3;
+
+ int d = 3;
+
+ * ``bool AcrossComments`` Whether to align across comments.
+
+ .. code-block:: c++
+
+ true:
+ int d = 3;
+ /* A comment. */
+ double e = 4;
+
+ false:
+ int d = 3;
+ /* A comment. */
+ double e = 4;
+
+ * ``bool AlignCompound`` Only for ``AlignConsecutiveAssignments``. Whether compound assignments
+ like ``+=`` are aligned along with ``=``.
+
+ .. code-block:: c++
+
+ true:
+ a &= 2;
+ bbb = 2;
+
+ false:
+ a &= 2;
+ bbb = 2;
+
+ * ``bool AlignFunctionPointers`` Only for ``AlignConsecutiveDeclarations``. Whether function pointers are
+ aligned.
+
+ .. code-block:: c++
+
+ true:
+ unsigned i;
+ int &r;
+ int *p;
+ int (*f)();
+
+ false:
+ unsigned i;
+ int &r;
+ int *p;
+ int (*f)();
+
+ * ``bool PadOperators`` Only for ``AlignConsecutiveAssignments``. Whether short assignment
+ operators are left-padded to the same length as long ones in order to
+ put all assignment operators to the right of the left hand side.
+
+ .. code-block:: c++
+
+ true:
+ a >>= 2;
+ bbb = 2;
+
+ a = 2;
+ bbb >>= 2;
+
+ false:
+ a >>= 2;
+ bbb = 2;
+
+ a = 2;
+ bbb >>= 2;
+
+
.. _AlignEscapedNewlines:
**AlignEscapedNewlines** (``EscapedNewlineAlignmentStyle``) :versionbadge:`clang-format 5` :ref:`¶ <AlignEscapedNewlines>`
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 711baf45f449..2a177814c4df 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -3473,6 +3473,37 @@ builtin, the mangler emits their usual pattern without any special treatment.
// Computes a unique stable name for the given type.
constexpr const char * __builtin_sycl_unique_stable_name( type-id );
+``__builtin_popcountg``
+-----------------------
+
+``__builtin_popcountg`` returns the number of 1 bits in the argument. The
+argument can be of any integer type.
+
+**Syntax**:
+
+.. code-block:: c++
+
+ int __builtin_popcountg(type x)
+
+**Examples**:
+
+.. code-block:: c++
+
+ int x = 1;
+ int x_pop = __builtin_popcountg(x);
+
+ unsigned long y = 3;
+ int y_pop = __builtin_popcountg(y);
+
+ _BitInt(128) z = 7;
+ int z_pop = __builtin_popcountg(z);
+
+**Description**:
+
+``__builtin_popcountg`` is meant to be a type-generic alternative to the
+``__builtin_popcount{,l,ll}`` builtins, with support for other integer types,
+such as ``__int128`` and C23 ``_BitInt(N)``.
+
Multiprecision Arithmetic Builtins
----------------------------------
diff --git a/clang/docs/LibASTMatchersReference.html b/clang/docs/LibASTMatchersReference.html
index c40d679e383b..8a06084955aa 100644
--- a/clang/docs/LibASTMatchersReference.html
+++ b/clang/docs/LibASTMatchersReference.html
@@ -7049,7 +7049,7 @@ binary operator or fold expression matches.
</pre></td></tr>
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXFoldExpr.html">CXXFoldExpr</a>&gt;</td><td class="name" onclick="toggle('hasFoldInit0')"><a name="hasFoldInit0Anchor">hasFoldInit</a></td><td>ast_matchers::Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMacher</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXFoldExpr.html">CXXFoldExpr</a>&gt;</td><td class="name" onclick="toggle('hasFoldInit0')"><a name="hasFoldInit0Anchor">hasFoldInit</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMacher</td></tr>
<tr><td colspan="4" class="doc" id="hasFoldInit0"><pre>Matches the operand that does not contain the parameter pack.
Example matches `(0 + ... + args)` and `(args * ... * 1)`
@@ -7089,7 +7089,7 @@ Example matcher = binaryOperator(hasOperands(integerLiteral(equals(1),
</pre></td></tr>
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXFoldExpr.html">CXXFoldExpr</a>&gt;</td><td class="name" onclick="toggle('hasPattern0')"><a name="hasPattern0Anchor">hasPattern</a></td><td>ast_matchers::Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMacher</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXFoldExpr.html">CXXFoldExpr</a>&gt;</td><td class="name" onclick="toggle('hasPattern0')"><a name="hasPattern0Anchor">hasPattern</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMacher</td></tr>
<tr><td colspan="4" class="doc" id="hasPattern0"><pre>Matches the operand that contains the parameter pack.
Example matches `(0 + ... + args)`
@@ -7859,7 +7859,7 @@ int a = b ?: 1;
</pre></td></tr>
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1ClassTemplateSpecializationDecl.html">ClassTemplateSpecializationDecl</a>&gt;</td><td class="name" onclick="toggle('forEachTemplateArgument0')"><a name="forEachTemplateArgument0Anchor">forEachTemplateArgument</a></td><td>clang::ast_matchers::Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1ClassTemplateSpecializationDecl.html">ClassTemplateSpecializationDecl</a>&gt;</td><td class="name" onclick="toggle('forEachTemplateArgument0')"><a name="forEachTemplateArgument0Anchor">forEachTemplateArgument</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
<tr><td colspan="4" class="doc" id="forEachTemplateArgument0"><pre>Matches classTemplateSpecialization, templateSpecializationType and
functionDecl nodes where the template argument matches the inner matcher.
This matcher may produce multiple matches.
@@ -8454,7 +8454,7 @@ Example matches x (matcher = expr(hasType(cxxRecordDecl(hasName("X")))))
</pre></td></tr>
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;</td><td class="name" onclick="toggle('ignoringElidableConstructorCall0')"><a name="ignoringElidableConstructorCall0Anchor">ignoringElidableConstructorCall</a></td><td>ast_matchers::Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;</td><td class="name" onclick="toggle('ignoringElidableConstructorCall0')"><a name="ignoringElidableConstructorCall0Anchor">ignoringElidableConstructorCall</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
<tr><td colspan="4" class="doc" id="ignoringElidableConstructorCall0"><pre>Matches expressions that match InnerMatcher that are possibly wrapped in an
elidable constructor and other corresponding bookkeeping nodes.
@@ -8691,7 +8691,7 @@ Example matches x (matcher = expr(hasType(cxxRecordDecl(hasName("X")))))
</pre></td></tr>
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('forEachTemplateArgument2')"><a name="forEachTemplateArgument2Anchor">forEachTemplateArgument</a></td><td>clang::ast_matchers::Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('forEachTemplateArgument2')"><a name="forEachTemplateArgument2Anchor">forEachTemplateArgument</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
<tr><td colspan="4" class="doc" id="forEachTemplateArgument2"><pre>Matches classTemplateSpecialization, templateSpecializationType and
functionDecl nodes where the template argument matches the inner matcher.
This matcher may produce multiple matches.
@@ -8959,7 +8959,7 @@ matcher.
</pre></td></tr>
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1InitListExpr.html">InitListExpr</a>&gt;</td><td class="name" onclick="toggle('hasInit0')"><a name="hasInit0Anchor">hasInit</a></td><td>unsigned N, ast_matchers::Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1InitListExpr.html">InitListExpr</a>&gt;</td><td class="name" onclick="toggle('hasInit0')"><a name="hasInit0Anchor">hasInit</a></td><td>unsigned N, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
<tr><td colspan="4" class="doc" id="hasInit0"><pre>Matches the n'th item of an initializer list expression.
Example matches y.
@@ -10026,7 +10026,7 @@ varDecl(hasTypeLoc(templateSpecializationTypeLoc(hasTemplateArgumentLoc(0,
</pre></td></tr>
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;</td><td class="name" onclick="toggle('forEachTemplateArgument1')"><a name="forEachTemplateArgument1Anchor">forEachTemplateArgument</a></td><td>clang::ast_matchers::Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;</td><td class="name" onclick="toggle('forEachTemplateArgument1')"><a name="forEachTemplateArgument1Anchor">forEachTemplateArgument</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
<tr><td colspan="4" class="doc" id="forEachTemplateArgument1"><pre>Matches classTemplateSpecialization, templateSpecializationType and
functionDecl nodes where the template argument matches the inner matcher.
This matcher may produce multiple matches.
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 9e67bbb78950..7e16b9f0c67d 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -197,6 +197,9 @@ Improvements to Clang's time-trace
Bug Fixes in This Version
-------------------------
+- Fixed missing warnings when comparing mismatched enumeration constants
+  in C (`#29217 <https://github.com/llvm/llvm-project/issues/29217>`_).
+
- Clang now accepts elaborated-type-specifiers that explicitly specialize
a member class template for an implicit instantiation of a class template.
@@ -283,6 +286,10 @@ Bug Fixes to C++ Support
(`#78524 <https://github.com/llvm/llvm-project/issues/78524>`_)
- Clang no longer instantiates the exception specification of discarded candidate function
templates when determining the primary template of an explicit specialization.
+- Fixed a crash in Microsoft compatibility mode where unqualified dependent base class
+ lookup searches the bases of an incomplete class.
+- Fix a crash when an unresolved overload set is encountered on the RHS of a ``.*`` operator.
+ (`#53815 <https://github.com/llvm/llvm-project/issues/53815>`_)
Bug Fixes to AST Handling
^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/docs/tools/dump_ast_matchers.py b/clang/docs/tools/dump_ast_matchers.py
index cc7024d1627b..705ff0d4d409 100755
--- a/clang/docs/tools/dump_ast_matchers.py
+++ b/clang/docs/tools/dump_ast_matchers.py
@@ -116,6 +116,8 @@ def strip_doxygen(comment):
def unify_arguments(args):
"""Gets rid of anything the user doesn't care about in the argument list."""
+ args = re.sub(r"clang::ast_matchers::internal::", r"", args)
+ args = re.sub(r"ast_matchers::internal::", r"", args)
args = re.sub(r"internal::", r"", args)
args = re.sub(r"extern const\s+(.*)&", r"\1 ", args)
args = re.sub(r"&", r" ", args)
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 12ce9af1e53f..ff6b64c7f72d 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -2981,6 +2981,10 @@ public:
// corresponding saturated type for a given fixed point type.
QualType getCorrespondingSaturatedType(QualType Ty) const;
+ // Per ISO N1169, this method accepts fixed point types and returns the
+ // corresponding non-saturated type for a given fixed point type.
+ QualType getCorrespondingUnsaturatedType(QualType Ty) const;
+
// This method accepts fixed point types and returns the corresponding signed
// type. Unlike getCorrespondingUnsignedType(), this only accepts unsigned
// fixed point types because there are unsigned integer types like bool and
diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h
index 3fc481a62a78..bf0622bdeca3 100644
--- a/clang/include/clang/AST/Expr.h
+++ b/clang/include/clang/AST/Expr.h
@@ -153,6 +153,12 @@ public:
TR = t;
}
+ /// If this expression is an enumeration constant, return the
+ /// enumeration type under which said constant was declared.
+ /// Otherwise return the expression's type.
+ /// Note this effectively circumvents the weak typing of C's enum constants
+ QualType getEnumCoercedType(const ASTContext &Ctx) const;
+
ExprDependence getDependence() const {
return static_cast<ExprDependence>(ExprBits.Dependent);
}
@@ -471,6 +477,13 @@ public:
/// bit-fields, but it will return null for a conditional bit-field.
FieldDecl *getSourceBitField();
+  /// If this expression refers to an enum constant, retrieve its declaration.
+ EnumConstantDecl *getEnumConstantDecl();
+
+ const EnumConstantDecl *getEnumConstantDecl() const {
+ return const_cast<Expr *>(this)->getEnumConstantDecl();
+ }
+
const FieldDecl *getSourceBitField() const {
return const_cast<Expr*>(this)->getSourceBitField();
}
diff --git a/clang/include/clang/AST/FormatString.h b/clang/include/clang/AST/FormatString.h
index 5c4ad9baaef6..e2232fb4a471 100644
--- a/clang/include/clang/AST/FormatString.h
+++ b/clang/include/clang/AST/FormatString.h
@@ -171,6 +171,14 @@ public:
ZArg, // MS extension
+ // ISO/IEC TR 18037 (fixed-point) specific specifiers.
+ kArg, // %k for signed accum types
+ KArg, // %K for unsigned accum types
+ rArg, // %r for signed fract types
+ RArg, // %R for unsigned fract types
+ FixedPointArgBeg = kArg,
+ FixedPointArgEnd = RArg,
+
// Objective-C specific specifiers.
ObjCObjArg, // '@'
ObjCBeg = ObjCObjArg,
@@ -237,6 +245,9 @@ public:
bool isDoubleArg() const {
return kind >= DoubleArgBeg && kind <= DoubleArgEnd;
}
+ bool isFixedPointArg() const {
+ return kind >= FixedPointArgBeg && kind <= FixedPointArgEnd;
+ }
const char *toString() const;
diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h
index dc1f49525a00..ced89ff127ab 100644
--- a/clang/include/clang/ASTMatchers/ASTMatchers.h
+++ b/clang/include/clang/ASTMatchers/ASTMatchers.h
@@ -4580,8 +4580,7 @@ AST_POLYMORPHIC_MATCHER_P2(hasArgument,
/// return (args * ... * 1);
/// }
/// \endcode
-AST_MATCHER_P(CXXFoldExpr, hasFoldInit, ast_matchers::internal::Matcher<Expr>,
- InnerMacher) {
+AST_MATCHER_P(CXXFoldExpr, hasFoldInit, internal::Matcher<Expr>, InnerMacher) {
const auto *const Init = Node.getInit();
return Init && InnerMacher.matches(*Init, Finder, Builder);
}
@@ -4603,8 +4602,7 @@ AST_MATCHER_P(CXXFoldExpr, hasFoldInit, ast_matchers::internal::Matcher<Expr>,
/// return (args * ... * 1);
/// }
/// \endcode
-AST_MATCHER_P(CXXFoldExpr, hasPattern, ast_matchers::internal::Matcher<Expr>,
- InnerMacher) {
+AST_MATCHER_P(CXXFoldExpr, hasPattern, internal::Matcher<Expr>, InnerMacher) {
const Expr *const Pattern = Node.getPattern();
return Pattern && InnerMacher.matches(*Pattern, Finder, Builder);
}
@@ -4685,8 +4683,8 @@ AST_MATCHER(CXXFoldExpr, isBinaryFold) { return Node.getInit() != nullptr; }
/// \code
/// int x{y}.
/// \endcode
-AST_MATCHER_P2(InitListExpr, hasInit, unsigned, N,
- ast_matchers::internal::Matcher<Expr>, InnerMatcher) {
+AST_MATCHER_P2(InitListExpr, hasInit, unsigned, N, internal::Matcher<Expr>,
+ InnerMatcher) {
return N < Node.getNumInits() &&
InnerMatcher.matches(*Node.getInit(N), Finder, Builder);
}
@@ -5309,7 +5307,7 @@ AST_POLYMORPHIC_MATCHER_P(
forEachTemplateArgument,
AST_POLYMORPHIC_SUPPORTED_TYPES(ClassTemplateSpecializationDecl,
TemplateSpecializationType, FunctionDecl),
- clang::ast_matchers::internal::Matcher<TemplateArgument>, InnerMatcher) {
+ internal::Matcher<TemplateArgument>, InnerMatcher) {
ArrayRef<TemplateArgument> TemplateArgs =
clang::ast_matchers::internal::getTemplateSpecializationArgs(Node);
clang::ast_matchers::internal::BoundNodesTreeBuilder Result;
@@ -8525,8 +8523,8 @@ AST_MATCHER(FunctionDecl, hasTrailingReturn) {
///
/// ``varDecl(hasInitializer(ignoringElidableConstructorCall(callExpr())))``
/// matches ``H D = G()`` in C++11 through C++17 (and beyond).
-AST_MATCHER_P(Expr, ignoringElidableConstructorCall,
- ast_matchers::internal::Matcher<Expr>, InnerMatcher) {
+AST_MATCHER_P(Expr, ignoringElidableConstructorCall, internal::Matcher<Expr>,
+ InnerMatcher) {
// E tracks the node that we are examining.
const Expr *E = &Node;
// If present, remove an outer `ExprWithCleanups` corresponding to the
diff --git a/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h b/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h
index 13e37ac2b56b..7bdb9052e57e 100644
--- a/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h
+++ b/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h
@@ -527,8 +527,10 @@ private:
BlockInfo *CurrentBlockInfo = nullptr;
};
+#ifndef NDEBUG
// Dump an SCFG to llvm::errs().
void printSCFG(CFGWalker &Walker);
+#endif // NDEBUG
} // namespace threadSafety
} // namespace clang
diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
index b95095d2184c..3c84704d0d6c 100644
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
@@ -54,10 +54,9 @@ namespace dataflow {
/// Environment &Env)` - applies the analysis transfer
/// function for a given edge from a CFG block of a conditional statement.
///
-/// `Derived` can optionally override the following members:
-/// * `bool merge(QualType, const Value &, const Value &, Value &,
-/// Environment &)` - joins distinct values. This could be a strict
-/// lattice join or a more general widening operation.
+/// `Derived` can optionally override the virtual functions in the
+/// `Environment::ValueModel` interface (which is an indirect base class of
+/// this class).
///
/// `LatticeT` is a bounded join-semilattice that is used by `Derived` and must
/// provide the following public members:
diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
index 0aecc749bf41..7f8c70d16937 100644
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
@@ -79,32 +79,6 @@ public:
return ComparisonResult::Unknown;
}
- /// DEPRECATED. Override `join` and/or `widen`, instead.
- ///
- /// Modifies `MergedVal` to approximate both `Val1` and `Val2`. This could
- /// be a strict lattice join or a more general widening operation.
- ///
- /// If this function returns true, `MergedVal` will be assigned to a storage
- /// location of type `Type` in `MergedEnv`.
- ///
- /// `Env1` and `Env2` can be used to query child values and path condition
- /// implications of `Val1` and `Val2` respectively.
- ///
- /// Requirements:
- ///
- /// `Val1` and `Val2` must be distinct.
- ///
- /// `Val1`, `Val2`, and `MergedVal` must model values of type `Type`.
- ///
- /// `Val1` and `Val2` must be assigned to the same storage location in
- /// `Env1` and `Env2` respectively.
- virtual bool merge(QualType Type, const Value &Val1,
- const Environment &Env1, const Value &Val2,
- const Environment &Env2, Value &MergedVal,
- Environment &MergedEnv) {
- return true;
- }
-
/// Modifies `JoinedVal` to approximate both `Val1` and `Val2`. This should
/// obey the properties of a lattice join.
///
@@ -121,11 +95,7 @@ public:
/// `Env1` and `Env2` respectively.
virtual void join(QualType Type, const Value &Val1, const Environment &Env1,
const Value &Val2, const Environment &Env2,
- Value &JoinedVal, Environment &JoinedEnv) {
- [[maybe_unused]] bool ShouldKeep =
- merge(Type, Val1, Env1, Val2, Env2, JoinedVal, JoinedEnv);
- assert(ShouldKeep && "dropping merged value is unsupported");
- }
+ Value &JoinedVal, Environment &JoinedEnv) {}
/// This function may widen the current value -- replace it with an
/// approximation that can reach a fixed point more quickly than iterated
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index e3432f7925ba..3bc35c5bb38e 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -688,6 +688,12 @@ def Popcount : Builtin, BitInt_Long_LongLongTemplate {
let Prototype = "int(unsigned T)";
}
+def Popcountg : Builtin {
+ let Spellings = ["__builtin_popcountg"];
+ let Attributes = [NoThrow, Const];
+ let Prototype = "int(...)";
+}
+
def Clrsb : Builtin, BitInt_Long_LongLongTemplate {
let Spellings = ["__builtin_clrsb"];
let Attributes = [NoThrow, Const, Constexpr];
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 57784a4ba2e3..c8141fefb8ed 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -11983,7 +11983,8 @@ def err_builtin_invalid_arg_type: Error <
"pointer to a valid matrix element type|"
"signed integer or floating point type|vector type|"
"floating point type|"
- "vector of integers}1 (was %2)">;
+ "vector of integers|"
+ "type of integer}1 (was %2)">;
def err_builtin_matrix_disabled: Error<
"matrix types extension is disabled. Pass -fenable-matrix to enable it">;
diff --git a/clang/include/clang/Basic/TargetOSMacros.def b/clang/include/clang/Basic/TargetOSMacros.def
index dfc2e033f6fd..58dce330f9c8 100644
--- a/clang/include/clang/Basic/TargetOSMacros.def
+++ b/clang/include/clang/Basic/TargetOSMacros.def
@@ -34,18 +34,19 @@ TARGET_OS(TARGET_OS_UNIX, Triple.isOSNetBSD() ||
TARGET_OS(TARGET_OS_MAC, Triple.isOSDarwin())
TARGET_OS(TARGET_OS_OSX, Triple.isMacOSX())
TARGET_OS(TARGET_OS_IPHONE, Triple.isiOS() || Triple.isTvOS() ||
- Triple.isWatchOS())
+ Triple.isWatchOS() || Triple.isXROS())
// Triple::isiOS() also includes tvOS
TARGET_OS(TARGET_OS_IOS, Triple.getOS() == llvm::Triple::IOS)
TARGET_OS(TARGET_OS_TV, Triple.isTvOS())
TARGET_OS(TARGET_OS_WATCH, Triple.isWatchOS())
+TARGET_OS(TARGET_OS_VISION, Triple.isXROS())
TARGET_OS(TARGET_OS_DRIVERKIT, Triple.isDriverKit())
TARGET_OS(TARGET_OS_MACCATALYST, Triple.isMacCatalystEnvironment())
TARGET_OS(TARGET_OS_SIMULATOR, Triple.isSimulatorEnvironment())
// Deprecated Apple target conditionals.
TARGET_OS(TARGET_OS_EMBEDDED, (Triple.isiOS() || Triple.isTvOS() \
- || Triple.isWatchOS()) \
+ || Triple.isWatchOS() || Triple.isXROS()) \
&& !Triple.isMacCatalystEnvironment() \
&& !Triple.isSimulatorEnvironment())
TARGET_OS(TARGET_OS_NANO, Triple.isWatchOS())
diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h
index 449ce9e53be1..613f1fd16846 100644
--- a/clang/include/clang/Format/Format.h
+++ b/clang/include/clang/Format/Format.h
@@ -424,6 +424,16 @@ struct FormatStyle {
/// \version 19
AlignConsecutiveStyle AlignConsecutiveTableGenCondOperatorColons;
+ /// Style of aligning consecutive TableGen definition colons.
+ /// This aligns the inheritance colons of consecutive definitions.
+ /// \code
+ /// def Def : Parent {}
+ /// def DefDef : Parent {}
+ /// def DefDefDef : Parent {}
+ /// \endcode
+ /// \version 19
+ AlignConsecutiveStyle AlignConsecutiveTableGenDefinitionColons;
+
/// Different styles for aligning escaped newlines.
enum EscapedNewlineAlignmentStyle : int8_t {
/// Don't align escaped newlines.
@@ -4817,6 +4827,8 @@ struct FormatStyle {
R.AlignConsecutiveShortCaseStatements &&
AlignConsecutiveTableGenCondOperatorColons ==
R.AlignConsecutiveTableGenCondOperatorColons &&
+ AlignConsecutiveTableGenDefinitionColons ==
+ R.AlignConsecutiveTableGenDefinitionColons &&
AlignEscapedNewlines == R.AlignEscapedNewlines &&
AlignOperands == R.AlignOperands &&
AlignTrailingComments == R.AlignTrailingComments &&
diff --git a/clang/include/clang/InstallAPI/Context.h b/clang/include/clang/InstallAPI/Context.h
index 7d105920734f..3e2046642c7f 100644
--- a/clang/include/clang/InstallAPI/Context.h
+++ b/clang/include/clang/InstallAPI/Context.h
@@ -9,6 +9,9 @@
#ifndef LLVM_CLANG_INSTALLAPI_CONTEXT_H
#define LLVM_CLANG_INSTALLAPI_CONTEXT_H
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/FileManager.h"
+#include "clang/InstallAPI/HeaderFile.h"
#include "llvm/TextAPI/InterfaceFile.h"
#include "llvm/TextAPI/RecordVisitor.h"
#include "llvm/TextAPI/RecordsSlice.h"
@@ -24,8 +27,23 @@ struct InstallAPIContext {
/// Library attributes that are typically passed as linker inputs.
llvm::MachO::RecordsSlice::BinaryAttrs BA;
- /// Active target triple to parse.
- llvm::Triple TargetTriple{};
+ /// All headers that represent a library.
+ HeaderSeq InputHeaders;
+
+ /// Active language mode to parse in.
+ Language LangMode = Language::ObjC;
+
+ /// Active header access type.
+ HeaderType Type = HeaderType::Unknown;
+
+ /// Active TargetSlice for symbol record collection.
+ std::shared_ptr<llvm::MachO::RecordsSlice> Slice;
+
+ /// FileManager for all I/O operations.
+ FileManager *FM = nullptr;
+
+ /// DiagnosticsEngine for all error reporting.
+ DiagnosticsEngine *Diags = nullptr;
/// File Path of output location.
llvm::StringRef OutputLoc{};
diff --git a/clang/include/clang/InstallAPI/Frontend.h b/clang/include/clang/InstallAPI/Frontend.h
new file mode 100644
index 000000000000..7ee87ae028d0
--- /dev/null
+++ b/clang/include/clang/InstallAPI/Frontend.h
@@ -0,0 +1,48 @@
+//===- InstallAPI/Frontend.h -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Top level wrappers for InstallAPI frontend operations.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_INSTALLAPI_FRONTEND_H
+#define LLVM_CLANG_INSTALLAPI_FRONTEND_H
+
+#include "clang/AST/ASTConsumer.h"
+#include "clang/Frontend/CompilerInstance.h"
+#include "clang/Frontend/FrontendActions.h"
+#include "clang/InstallAPI/Context.h"
+#include "clang/InstallAPI/Visitor.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+namespace clang {
+namespace installapi {
+
+/// Create a buffer that contains all headers to scan
+/// for global symbols with.
+std::unique_ptr<llvm::MemoryBuffer>
+createInputBuffer(const InstallAPIContext &Ctx);
+
+class InstallAPIAction : public ASTFrontendAction {
+public:
+ explicit InstallAPIAction(llvm::MachO::RecordsSlice &Records)
+ : Records(Records) {}
+
+ std::unique_ptr<ASTConsumer> CreateASTConsumer(CompilerInstance &CI,
+ StringRef InFile) override {
+ return std::make_unique<InstallAPIVisitor>(CI.getASTContext(), Records);
+ }
+
+private:
+ llvm::MachO::RecordsSlice &Records;
+};
+} // namespace installapi
+} // namespace clang
+
+#endif // LLVM_CLANG_INSTALLAPI_FRONTEND_H
diff --git a/clang/include/clang/InstallAPI/HeaderFile.h b/clang/include/clang/InstallAPI/HeaderFile.h
index fc64a43b3def..70e83bbb3e76 100644
--- a/clang/include/clang/InstallAPI/HeaderFile.h
+++ b/clang/include/clang/InstallAPI/HeaderFile.h
@@ -15,6 +15,7 @@
#include "clang/Basic/LangStandard.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Regex.h"
#include <optional>
#include <string>
@@ -32,6 +33,20 @@ enum class HeaderType {
Project,
};
+inline StringRef getName(const HeaderType T) {
+ switch (T) {
+ case HeaderType::Public:
+ return "Public";
+ case HeaderType::Private:
+ return "Private";
+ case HeaderType::Project:
+ return "Project";
+ case HeaderType::Unknown:
+ return "Unknown";
+ }
+ llvm_unreachable("unexpected header type");
+}
+
class HeaderFile {
/// Full input path to header.
std::string FullPath;
@@ -52,6 +67,14 @@ public:
static llvm::Regex getFrameworkIncludeRule();
+ HeaderType getType() const { return Type; }
+ StringRef getIncludeName() const { return IncludeName; }
+ StringRef getPath() const { return FullPath; }
+
+ bool useIncludeName() const {
+ return Type != HeaderType::Project && !IncludeName.empty();
+ }
+
bool operator==(const HeaderFile &Other) const {
return std::tie(Type, FullPath, IncludeName, Language) ==
std::tie(Other.Type, Other.FullPath, Other.IncludeName,
diff --git a/clang/include/clang/InstallAPI/Visitor.h b/clang/include/clang/InstallAPI/Visitor.h
new file mode 100644
index 000000000000..95d669688e4f
--- /dev/null
+++ b/clang/include/clang/InstallAPI/Visitor.h
@@ -0,0 +1,51 @@
+//===- InstallAPI/Visitor.h -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// ASTVisitor Interface for InstallAPI frontend operations.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_INSTALLAPI_VISITOR_H
+#define LLVM_CLANG_INSTALLAPI_VISITOR_H
+
+#include "clang/AST/Mangle.h"
+#include "clang/AST/RecursiveASTVisitor.h"
+#include "clang/Basic/TargetInfo.h"
+#include "clang/Frontend/FrontendActions.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/TextAPI/RecordsSlice.h"
+
+namespace clang {
+namespace installapi {
+
+/// ASTVisitor for collecting declarations that represent global symbols.
+class InstallAPIVisitor final : public ASTConsumer,
+ public RecursiveASTVisitor<InstallAPIVisitor> {
+public:
+ InstallAPIVisitor(ASTContext &ASTCtx, llvm::MachO::RecordsSlice &Slice)
+ : Slice(Slice),
+ MC(ItaniumMangleContext::create(ASTCtx, ASTCtx.getDiagnostics())),
+ Layout(ASTCtx.getTargetInfo().getDataLayoutString()) {}
+ void HandleTranslationUnit(ASTContext &ASTCtx) override;
+
+ /// Collect global variables.
+ bool VisitVarDecl(const VarDecl *D);
+
+private:
+ std::string getMangledName(const NamedDecl *D) const;
+ std::string getBackendMangledName(llvm::Twine Name) const;
+
+ llvm::MachO::RecordsSlice &Slice;
+ std::unique_ptr<clang::ItaniumMangleContext> MC;
+ StringRef Layout;
+};
+
+} // namespace installapi
+} // namespace clang
+
+#endif // LLVM_CLANG_INSTALLAPI_VISITOR_H
diff --git a/clang/lib/APINotes/APINotesReader.cpp b/clang/lib/APINotes/APINotesReader.cpp
index ff9b95d9bf75..55ea4bae81e6 100644
--- a/clang/lib/APINotes/APINotesReader.cpp
+++ b/clang/lib/APINotes/APINotesReader.cpp
@@ -81,9 +81,9 @@ public:
auto version = ReadVersionTuple(Data);
const auto *DataBefore = Data;
(void)DataBefore;
+ auto UnversionedData = Derived::readUnversioned(Key, Data);
assert(Data != DataBefore &&
"Unversioned data reader didn't move pointer");
- auto UnversionedData = Derived::readUnversioned(Key, Data);
Result.push_back({version, UnversionedData});
}
return Result;
@@ -148,7 +148,7 @@ public:
external_key_type GetExternalKey(internal_key_type Key) { return Key; }
hash_value_type ComputeHash(internal_key_type Key) {
- return llvm::hash_value(Key);
+ return llvm::djbHash(Key);
}
static bool EqualKey(internal_key_type LHS, internal_key_type RHS) {
@@ -1797,8 +1797,8 @@ APINotesReader::Create(std::unique_ptr<llvm::MemoryBuffer> InputBuffer,
template <typename T>
APINotesReader::VersionedInfo<T>::VersionedInfo(
llvm::VersionTuple Version,
- llvm::SmallVector<std::pair<llvm::VersionTuple, T>, 1> Results)
- : Results(std::move(Results)) {
+ llvm::SmallVector<std::pair<llvm::VersionTuple, T>, 1> R)
+ : Results(std::move(R)) {
assert(!Results.empty());
assert(std::is_sorted(
diff --git a/clang/lib/APINotes/APINotesWriter.cpp b/clang/lib/APINotes/APINotesWriter.cpp
index 62a2ab179991..76fd24ccfae9 100644
--- a/clang/lib/APINotes/APINotesWriter.cpp
+++ b/clang/lib/APINotes/APINotesWriter.cpp
@@ -128,6 +128,7 @@ class APINotesWriter::Implementation {
SelectorID getSelector(ObjCSelectorRef SelectorRef) {
// Translate the selector reference into a stored selector.
StoredObjCSelector Selector;
+ Selector.NumArgs = SelectorRef.NumArgs;
Selector.Identifiers.reserve(SelectorRef.Identifiers.size());
for (auto piece : SelectorRef.Identifiers)
Selector.Identifiers.push_back(getIdentifier(piece));
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index c475c841233c..5a8fae76a43a 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -13314,6 +13314,42 @@ QualType ASTContext::getCommonSugaredType(QualType X, QualType Y,
return R;
}
+QualType ASTContext::getCorrespondingUnsaturatedType(QualType Ty) const {
+ assert(Ty->isFixedPointType());
+
+ if (Ty->isUnsaturatedFixedPointType())
+ return Ty;
+
+ switch (Ty->castAs<BuiltinType>()->getKind()) {
+ default:
+ llvm_unreachable("Not a saturated fixed point type!");
+ case BuiltinType::SatShortAccum:
+ return ShortAccumTy;
+ case BuiltinType::SatAccum:
+ return AccumTy;
+ case BuiltinType::SatLongAccum:
+ return LongAccumTy;
+ case BuiltinType::SatUShortAccum:
+ return UnsignedShortAccumTy;
+ case BuiltinType::SatUAccum:
+ return UnsignedAccumTy;
+ case BuiltinType::SatULongAccum:
+ return UnsignedLongAccumTy;
+ case BuiltinType::SatShortFract:
+ return ShortFractTy;
+ case BuiltinType::SatFract:
+ return FractTy;
+ case BuiltinType::SatLongFract:
+ return LongFractTy;
+ case BuiltinType::SatUShortFract:
+ return UnsignedShortFractTy;
+ case BuiltinType::SatUFract:
+ return UnsignedFractTy;
+ case BuiltinType::SatULongFract:
+ return UnsignedLongFractTy;
+ }
+}
+
QualType ASTContext::getCorrespondingSaturatedType(QualType Ty) const {
assert(Ty->isFixedPointType());
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index cc0407131d79..b4de2155adce 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -263,6 +263,14 @@ namespace {
}
}
+QualType Expr::getEnumCoercedType(const ASTContext &Ctx) const {
+ if (isa<EnumType>(this->getType()))
+ return this->getType();
+ else if (const auto *ECD = this->getEnumConstantDecl())
+ return Ctx.getTypeDeclType(cast<EnumDecl>(ECD->getDeclContext()));
+ return this->getType();
+}
+
SourceLocation Expr::getExprLoc() const {
switch (getStmtClass()) {
case Stmt::NoStmtClass: llvm_unreachable("statement without class");
@@ -4098,6 +4106,13 @@ FieldDecl *Expr::getSourceBitField() {
return nullptr;
}
+EnumConstantDecl *Expr::getEnumConstantDecl() {
+ Expr *E = this->IgnoreParenImpCasts();
+ if (auto *DRE = dyn_cast<DeclRefExpr>(E))
+ return dyn_cast<EnumConstantDecl>(DRE->getDecl());
+ return nullptr;
+}
+
bool Expr::refersToVectorElement() const {
// FIXME: Why do we not just look at the ObjectKind here?
const Expr *E = this->IgnoreParens();
diff --git a/clang/lib/AST/FormatString.cpp b/clang/lib/AST/FormatString.cpp
index c5d14b4af7ff..0c80ad109ccb 100644
--- a/clang/lib/AST/FormatString.cpp
+++ b/clang/lib/AST/FormatString.cpp
@@ -403,6 +403,10 @@ ArgType::matchesType(ASTContext &C, QualType argTy) const {
else if (ETy->isUnscopedEnumerationType())
argTy = ETy->getDecl()->getIntegerType();
}
+
+ if (argTy->isSaturatedFixedPointType())
+ argTy = C.getCorrespondingUnsaturatedType(argTy);
+
argTy = C.getCanonicalType(argTy).getUnqualifiedType();
if (T == argTy)
@@ -761,6 +765,16 @@ const char *ConversionSpecifier::toString() const {
// MS specific specifiers.
case ZArg: return "Z";
+
+ // ISO/IEC TR 18037 (fixed-point) specific specifiers.
+ case rArg:
+ return "r";
+ case RArg:
+ return "R";
+ case kArg:
+ return "k";
+ case KArg:
+ return "K";
}
return nullptr;
}
@@ -825,6 +839,9 @@ bool FormatSpecifier::hasValidLengthModifier(const TargetInfo &Target,
if (LO.OpenCL && CS.isDoubleArg())
return !VectorNumElts.isInvalid();
+ if (CS.isFixedPointArg())
+ return true;
+
if (Target.getTriple().isOSMSVCRT()) {
switch (CS.getKind()) {
case ConversionSpecifier::cArg:
@@ -877,6 +894,9 @@ bool FormatSpecifier::hasValidLengthModifier(const TargetInfo &Target,
return true;
}
+ if (CS.isFixedPointArg())
+ return true;
+
switch (CS.getKind()) {
case ConversionSpecifier::bArg:
case ConversionSpecifier::BArg:
@@ -1043,6 +1063,11 @@ bool FormatSpecifier::hasStandardConversionSpecifier(
case ConversionSpecifier::UArg:
case ConversionSpecifier::ZArg:
return false;
+ case ConversionSpecifier::rArg:
+ case ConversionSpecifier::RArg:
+ case ConversionSpecifier::kArg:
+ case ConversionSpecifier::KArg:
+ return LangOpt.FixedPoint;
}
llvm_unreachable("Invalid ConversionSpecifier Kind!");
}
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index a71b6e82817e..b151f8d0d7a7 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -897,7 +897,7 @@ bool ByteCodeExprGen<Emitter>::visitInitList(ArrayRef<const Expr *> Inits,
if (!this->visitInitializer(Init))
return false;
- if (!this->emitInitPtrPop(E))
+ if (!this->emitFinishInitPop(E))
return false;
// Base initializers don't increase InitIndex, since they don't count
// into the Record's fields.
@@ -940,7 +940,7 @@ bool ByteCodeExprGen<Emitter>::visitArrayElemInit(unsigned ElemIndex,
return false;
if (!this->visitInitializer(Init))
return false;
- return this->emitInitPtrPop(Init);
+ return this->emitFinishInitPop(Init);
}
template <class Emitter>
@@ -1700,19 +1700,35 @@ bool ByteCodeExprGen<Emitter>::VisitCompoundLiteralExpr(
}
// Otherwise, use a local variable.
- if (T) {
+ if (T && !E->isLValue()) {
// For primitive types, we just visit the initializer.
return this->delegate(Init);
} else {
- if (std::optional<unsigned> LocalIndex = allocateLocal(Init)) {
- if (!this->emitGetPtrLocal(*LocalIndex, E))
+ unsigned LocalIndex;
+
+ if (T)
+ LocalIndex = this->allocateLocalPrimitive(Init, *T, false, false);
+ else if (std::optional<unsigned> MaybeIndex = this->allocateLocal(Init))
+ LocalIndex = *MaybeIndex;
+ else
+ return false;
+
+ if (!this->emitGetPtrLocal(LocalIndex, E))
+ return false;
+
+ if (T) {
+ if (!this->visit(Init)) {
return false;
+ }
+ return this->emitInit(*T, E);
+ } else {
if (!this->visitInitializer(Init))
return false;
- if (DiscardResult)
- return this->emitPopPtr(E);
- return true;
}
+
+ if (DiscardResult)
+ return this->emitPopPtr(E);
+ return true;
}
return false;
@@ -2151,7 +2167,7 @@ bool ByteCodeExprGen<Emitter>::VisitCXXUuidofExpr(const CXXUuidofExpr *E) {
}
}
- return this->emitInitPtr(E);
+ return this->emitFinishInit(E);
}
template <class Emitter>
@@ -2173,6 +2189,31 @@ bool ByteCodeExprGen<Emitter>::VisitCXXRewrittenBinaryOperator(
return this->delegate(E->getSemanticForm());
}
+template <class Emitter>
+bool ByteCodeExprGen<Emitter>::VisitPseudoObjectExpr(
+ const PseudoObjectExpr *E) {
+
+ for (const Expr *SemE : E->semantics()) {
+ if (auto *OVE = dyn_cast<OpaqueValueExpr>(SemE)) {
+ if (SemE == E->getResultExpr())
+ return false;
+
+ if (OVE->isUnique())
+ continue;
+
+ if (!this->discard(OVE))
+ return false;
+ } else if (SemE == E->getResultExpr()) {
+ if (!this->delegate(SemE))
+ return false;
+ } else {
+ if (!this->discard(SemE))
+ return false;
+ }
+ }
+ return true;
+}
+
template <class Emitter> bool ByteCodeExprGen<Emitter>::discard(const Expr *E) {
if (E->containsErrors())
return false;
@@ -2364,7 +2405,7 @@ bool ByteCodeExprGen<Emitter>::visitZeroRecordInitializer(const Record *R,
return false;
if (!this->visitZeroRecordInitializer(B.R, E))
return false;
- if (!this->emitInitPtrPop(E))
+ if (!this->emitFinishInitPop(E))
return false;
}
@@ -2544,9 +2585,12 @@ bool ByteCodeExprGen<Emitter>::visitExpr(const Expr *E) {
if (!visitInitializer(E))
return false;
- if (!this->emitInitPtr(E))
+ if (!this->emitFinishInit(E))
return false;
- return this->emitRetValue(E);
+ // We are destroying the locals AFTER the Ret op.
+ // The Ret op needs to copy the (alive) values, but the
+ // destructors may still turn the entire expression invalid.
+ return this->emitRetValue(E) && RootScope.destroyLocals();
}
return false;
@@ -3373,14 +3417,15 @@ bool ByteCodeExprGen<Emitter>::emitRecordDestruction(const Record *R) {
// Now emit the destructor and recurse into base classes.
if (const CXXDestructorDecl *Dtor = R->getDestructor();
Dtor && !Dtor->isTrivial()) {
- if (const Function *DtorFunc = getFunction(Dtor)) {
- assert(DtorFunc->hasThisPointer());
- assert(DtorFunc->getNumParams() == 1);
- if (!this->emitDupPtr(SourceInfo{}))
- return false;
- if (!this->emitCall(DtorFunc, 0, SourceInfo{}))
- return false;
- }
+ const Function *DtorFunc = getFunction(Dtor);
+ if (!DtorFunc)
+ return false;
+ assert(DtorFunc->hasThisPointer());
+ assert(DtorFunc->getNumParams() == 1);
+ if (!this->emitDupPtr(SourceInfo{}))
+ return false;
+ if (!this->emitCall(DtorFunc, 0, SourceInfo{}))
+ return false;
}
for (const Record::Base &Base : llvm::reverse(R->bases())) {
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h
index 8f7a0c2fc3c1..acbbcc3dc961 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.h
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.h
@@ -117,6 +117,7 @@ public:
bool VisitRequiresExpr(const RequiresExpr *E);
bool VisitConceptSpecializationExpr(const ConceptSpecializationExpr *E);
bool VisitCXXRewrittenBinaryOperator(const CXXRewrittenBinaryOperator *E);
+ bool VisitPseudoObjectExpr(const PseudoObjectExpr *E);
protected:
bool visitExpr(const Expr *E) override;
@@ -182,7 +183,7 @@ protected:
if (!visitInitializer(Init))
return false;
- if (!this->emitInitPtr(Init))
+ if (!this->emitFinishInit(Init))
return false;
return this->emitPopPtr(Init);
@@ -196,7 +197,7 @@ protected:
if (!visitInitializer(Init))
return false;
- if (!this->emitInitPtr(Init))
+ if (!this->emitFinishInit(Init))
return false;
return this->emitPopPtr(Init);
@@ -210,7 +211,7 @@ protected:
if (!visitInitializer(I))
return false;
- return this->emitInitPtrPop(I);
+ return this->emitFinishInitPop(I);
}
bool visitInitList(ArrayRef<const Expr *> Inits, const Expr *E);
@@ -331,7 +332,7 @@ public:
}
virtual void emitDestruction() {}
- virtual void emitDestructors() {}
+ virtual bool emitDestructors() { return true; }
VariableScope *getParent() const { return Parent; }
protected:
@@ -355,13 +356,18 @@ public:
}
/// Overriden to support explicit destruction.
- void emitDestruction() override {
+ void emitDestruction() override { destroyLocals(); }
+
+ /// Explicit destruction of local variables.
+ bool destroyLocals() {
if (!Idx)
- return;
- this->emitDestructors();
+ return true;
+
+ bool Success = this->emitDestructors();
this->Ctx->emitDestroy(*Idx, SourceInfo{});
removeStoredOpaqueValues();
this->Idx = std::nullopt;
+ return Success;
}
void addLocal(const Scope::Local &Local) override {
@@ -373,19 +379,25 @@ public:
this->Ctx->Descriptors[*Idx].emplace_back(Local);
}
- void emitDestructors() override {
+ bool emitDestructors() override {
if (!Idx)
- return;
+ return true;
// Emit destructor calls for local variables of record
// type with a destructor.
for (Scope::Local &Local : this->Ctx->Descriptors[*Idx]) {
if (!Local.Desc->isPrimitive() && !Local.Desc->isPrimitiveArray()) {
- this->Ctx->emitGetPtrLocal(Local.Offset, SourceInfo{});
- this->Ctx->emitDestruction(Local.Desc);
- this->Ctx->emitPopPtr(SourceInfo{});
+ if (!this->Ctx->emitGetPtrLocal(Local.Offset, SourceInfo{}))
+ return false;
+
+ if (!this->Ctx->emitDestruction(Local.Desc))
+ return false;
+
+ if (!this->Ctx->emitPopPtr(SourceInfo{}))
+ return false;
removeIfStoredOpaqueValue(Local);
}
}
+ return true;
}
void removeStoredOpaqueValues() {
diff --git a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
index 7e2043f8de90..d9213b12cbd0 100644
--- a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
@@ -200,7 +200,7 @@ bool ByteCodeStmtGen<Emitter>::visitFunc(const FunctionDecl *F) {
return false;
if (!this->visitInitializer(InitExpr))
return false;
- if (!this->emitInitPtrPop(InitExpr))
+ if (!this->emitFinishInitPop(InitExpr))
return false;
} else if (const IndirectFieldDecl *IFD = Init->getIndirectMember()) {
assert(IFD->getChainingSize() >= 2);
diff --git a/clang/lib/AST/Interp/EvalEmitter.cpp b/clang/lib/AST/Interp/EvalEmitter.cpp
index 9cae25f5c4d6..c9c2bf9b145b 100644
--- a/clang/lib/AST/Interp/EvalEmitter.cpp
+++ b/clang/lib/AST/Interp/EvalEmitter.cpp
@@ -38,8 +38,11 @@ EvaluationResult EvalEmitter::interpretExpr(const Expr *E,
this->ConvertResultToRValue = ConvertResultToRValue;
EvalResult.setSource(E);
- if (!this->visitExpr(E) && EvalResult.empty())
+ if (!this->visitExpr(E)) {
+ // EvalResult may already have a result set, but something failed
+ // after that (e.g. evaluating destructors).
EvalResult.setInvalid();
+ }
return std::move(this->EvalResult);
}
diff --git a/clang/lib/AST/Interp/EvaluationResult.cpp b/clang/lib/AST/Interp/EvaluationResult.cpp
index 693ae9ce4a94..07b28d07326f 100644
--- a/clang/lib/AST/Interp/EvaluationResult.cpp
+++ b/clang/lib/AST/Interp/EvaluationResult.cpp
@@ -105,6 +105,8 @@ static bool CheckFieldsInitialized(InterpState &S, SourceLocation Loc,
Result &= CheckFieldsInitialized(S, Loc, FieldPtr, FieldPtr.getRecord());
} else if (FieldType->isIncompleteArrayType()) {
// Nothing to do here.
+ } else if (F.Decl->isUnnamedBitfield()) {
+ // Nothing do do here.
} else if (FieldType->isArrayType()) {
const auto *CAT =
cast<ConstantArrayType>(FieldType->getAsArrayTypeUnsafe());
diff --git a/clang/lib/AST/Interp/EvaluationResult.h b/clang/lib/AST/Interp/EvaluationResult.h
index 28e1ae6ba3e7..ecf2250074cc 100644
--- a/clang/lib/AST/Interp/EvaluationResult.h
+++ b/clang/lib/AST/Interp/EvaluationResult.h
@@ -72,7 +72,8 @@ private:
Kind = LValue;
}
void setInvalid() {
- assert(empty());
+ // We are NOT asserting empty() here, since setting it to invalid
+ // is allowed even if there is already a result.
Kind = Invalid;
}
void setValid() {
diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index 2b36a05e1af9..13e004371f91 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -1280,14 +1280,14 @@ inline bool GetPtrThisBase(InterpState &S, CodePtr OpPC, uint32_t Off) {
return true;
}
-inline bool InitPtrPop(InterpState &S, CodePtr OpPC) {
+inline bool FinishInitPop(InterpState &S, CodePtr OpPC) {
const Pointer &Ptr = S.Stk.pop<Pointer>();
if (Ptr.canBeInitialized())
Ptr.initialize();
return true;
}
-inline bool InitPtr(InterpState &S, CodePtr OpPC) {
+inline bool FinishInit(InterpState &S, CodePtr OpPC) {
const Pointer &Ptr = S.Stk.peek<Pointer>();
if (Ptr.canBeInitialized())
@@ -1400,6 +1400,19 @@ bool StoreBitFieldPop(InterpState &S, CodePtr OpPC) {
}
template <PrimType Name, class T = typename PrimConv<Name>::T>
+bool Init(InterpState &S, CodePtr OpPC) {
+ const T &Value = S.Stk.pop<T>();
+ const Pointer &Ptr = S.Stk.peek<Pointer>();
+ if (!CheckInit(S, OpPC, Ptr)) {
+ assert(false);
+ return false;
+ }
+ Ptr.initialize();
+ new (&Ptr.deref<T>()) T(Value);
+ return true;
+}
+
+template <PrimType Name, class T = typename PrimConv<Name>::T>
bool InitPop(InterpState &S, CodePtr OpPC) {
const T &Value = S.Stk.pop<T>();
const Pointer &Ptr = S.Stk.pop<Pointer>();
@@ -1667,7 +1680,7 @@ bool CastFloatingIntegral(InterpState &S, CodePtr OpPC) {
auto Status = F.convertToInteger(Result);
// Float-to-Integral overflow check.
- if ((Status & APFloat::opStatus::opInvalidOp) && F.isFinite()) {
+ if ((Status & APFloat::opStatus::opInvalidOp)) {
const Expr *E = S.Current->getExpr(OpPC);
QualType Type = E->getType();
@@ -1920,7 +1933,7 @@ inline bool ArrayElemPop(InterpState &S, CodePtr OpPC, uint32_t Index) {
inline bool ArrayDecay(InterpState &S, CodePtr OpPC) {
const Pointer &Ptr = S.Stk.pop<Pointer>();
- if (Ptr.isDummy()) {
+ if (Ptr.isZero() || Ptr.isDummy()) {
S.Stk.push<Pointer>(Ptr);
return true;
}
diff --git a/clang/lib/AST/Interp/InterpBuiltin.cpp b/clang/lib/AST/Interp/InterpBuiltin.cpp
index 8f45c789296b..4ba518e5f061 100644
--- a/clang/lib/AST/Interp/InterpBuiltin.cpp
+++ b/clang/lib/AST/Interp/InterpBuiltin.cpp
@@ -85,7 +85,8 @@ static void pushInteger(InterpState &S, T Val, QualType QT) {
pushInteger(S, APSInt(Val, !std::is_signed_v<T>), QT);
else
pushInteger(S,
- APSInt(APInt(sizeof(T) * 8, Val, std::is_signed_v<T>),
+ APSInt(APInt(sizeof(T) * 8, static_cast<uint64_t>(Val),
+ std::is_signed_v<T>),
!std::is_signed_v<T>),
QT);
}
@@ -464,7 +465,7 @@ static bool interp__builtin_popcount(InterpState &S, CodePtr OpPC,
PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType());
APSInt Val = peekToAPSInt(S.Stk, ArgT);
- pushInteger(S, APSInt(APInt(32, Val.popcount())), Call->getType());
+ pushInteger(S, Val.popcount(), Call->getType());
return true;
}
@@ -492,7 +493,6 @@ static bool interp__builtin_bitreverse(InterpState &S, CodePtr OpPC,
const CallExpr *Call) {
PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType());
APSInt Val = peekToAPSInt(S.Stk, ArgT);
- // pushAPSInt(S, APSInt(Val.reverseBits(), /*IsUnsigned=*/true));
pushInteger(S, Val.reverseBits(), Call->getType());
return true;
}
@@ -551,7 +551,6 @@ static bool interp__builtin_rotate(InterpState &S, CodePtr OpPC,
Result = APSInt(Value.rotl(Amount.urem(Value.getBitWidth())),
/*IsUnsigned=*/true);
- // pushAPSInt(S, Result);
pushInteger(S, Result, Call->getType());
return true;
}
@@ -784,7 +783,6 @@ static bool interp__builtin_carryop(InterpState &S, CodePtr OpPC,
CarryOutPtr.initialize();
assert(Call->getType() == Call->getArg(0)->getType());
- // pushAPSInt(S, Result);
pushInteger(S, Result, Call->getType());
return true;
}
@@ -805,7 +803,7 @@ static bool interp__builtin_clz(InterpState &S, CodePtr OpPC,
if (ZeroIsUndefined && Val == 0)
return false;
- pushInteger(S, APSInt(APInt(32, Val.countl_zero())), Call->getType());
+ pushInteger(S, Val.countl_zero(), Call->getType());
return true;
}
diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td
index e36c42d450fc..3e3ba1b163e3 100644
--- a/clang/lib/AST/Interp/Opcodes.td
+++ b/clang/lib/AST/Interp/Opcodes.td
@@ -326,8 +326,8 @@ def GetPtrBasePop : Opcode {
let Args = [ArgUint32];
}
-def InitPtrPop : Opcode;
-def InitPtr : Opcode;
+def FinishInitPop : Opcode;
+def FinishInit : Opcode;
def GetPtrDerivedPop : Opcode {
let Args = [ArgUint32];
@@ -476,6 +476,7 @@ def StoreBitField : StoreBitFieldOpcode {}
def StoreBitFieldPop : StoreBitFieldOpcode {}
// [Pointer, Value] -> []
+def Init : StoreOpcode {}
def InitPop : StoreOpcode {}
// [Pointer, Value] -> [Pointer]
def InitElem : Opcode {
diff --git a/clang/lib/AST/Interp/Pointer.h b/clang/lib/AST/Interp/Pointer.h
index fa2e03d71190..34ecdb967960 100644
--- a/clang/lib/AST/Interp/Pointer.h
+++ b/clang/lib/AST/Interp/Pointer.h
@@ -215,7 +215,6 @@ public:
assert(Offset == PastEndMark && "cannot get base of a block");
return Pointer(Pointee, Base, 0);
}
- assert(Offset == Base && "not an inner field");
unsigned NewBase = Base - getInlineDesc()->Offset;
return Pointer(Pointee, NewBase, NewBase);
}
diff --git a/clang/lib/AST/PrintfFormatString.cpp b/clang/lib/AST/PrintfFormatString.cpp
index 3b09ca40bd2a..fec8ce13e8c4 100644
--- a/clang/lib/AST/PrintfFormatString.cpp
+++ b/clang/lib/AST/PrintfFormatString.cpp
@@ -348,6 +348,8 @@ static PrintfSpecifierResult ParsePrintfSpecifier(FormatStringHandler &H,
case 'r':
if (isFreeBSDKPrintf)
k = ConversionSpecifier::FreeBSDrArg; // int
+ else if (LO.FixedPoint)
+ k = ConversionSpecifier::rArg;
break;
case 'y':
if (isFreeBSDKPrintf)
@@ -373,6 +375,20 @@ static PrintfSpecifierResult ParsePrintfSpecifier(FormatStringHandler &H,
if (Target.getTriple().isOSMSVCRT())
k = ConversionSpecifier::ZArg;
break;
+ // ISO/IEC TR 18037 (fixed-point) specific.
+ // NOTE: 'r' is handled up above since FreeBSD also supports %r.
+ case 'k':
+ if (LO.FixedPoint)
+ k = ConversionSpecifier::kArg;
+ break;
+ case 'K':
+ if (LO.FixedPoint)
+ k = ConversionSpecifier::KArg;
+ break;
+ case 'R':
+ if (LO.FixedPoint)
+ k = ConversionSpecifier::RArg;
+ break;
}
// Check to see if we used the Objective-C modifier flags with
@@ -627,6 +643,9 @@ ArgType PrintfSpecifier::getScalarArgType(ASTContext &Ctx,
}
}
+ if (CS.isFixedPointArg() && !Ctx.getLangOpts().FixedPoint)
+ return ArgType::Invalid();
+
switch (CS.getKind()) {
case ConversionSpecifier::sArg:
if (LM.getKind() == LengthModifier::AsWideChar) {
@@ -658,6 +677,50 @@ ArgType PrintfSpecifier::getScalarArgType(ASTContext &Ctx,
return ArgType::CPointerTy;
case ConversionSpecifier::ObjCObjArg:
return ArgType::ObjCPointerTy;
+ case ConversionSpecifier::kArg:
+ switch (LM.getKind()) {
+ case LengthModifier::None:
+ return Ctx.AccumTy;
+ case LengthModifier::AsShort:
+ return Ctx.ShortAccumTy;
+ case LengthModifier::AsLong:
+ return Ctx.LongAccumTy;
+ default:
+ return ArgType::Invalid();
+ }
+ case ConversionSpecifier::KArg:
+ switch (LM.getKind()) {
+ case LengthModifier::None:
+ return Ctx.UnsignedAccumTy;
+ case LengthModifier::AsShort:
+ return Ctx.UnsignedShortAccumTy;
+ case LengthModifier::AsLong:
+ return Ctx.UnsignedLongAccumTy;
+ default:
+ return ArgType::Invalid();
+ }
+ case ConversionSpecifier::rArg:
+ switch (LM.getKind()) {
+ case LengthModifier::None:
+ return Ctx.FractTy;
+ case LengthModifier::AsShort:
+ return Ctx.ShortFractTy;
+ case LengthModifier::AsLong:
+ return Ctx.LongFractTy;
+ default:
+ return ArgType::Invalid();
+ }
+ case ConversionSpecifier::RArg:
+ switch (LM.getKind()) {
+ case LengthModifier::None:
+ return Ctx.UnsignedFractTy;
+ case LengthModifier::AsShort:
+ return Ctx.UnsignedShortFractTy;
+ case LengthModifier::AsLong:
+ return Ctx.UnsignedLongFractTy;
+ default:
+ return ArgType::Invalid();
+ }
default:
break;
}
@@ -955,6 +1018,8 @@ bool PrintfSpecifier::hasValidPlusPrefix() const {
case ConversionSpecifier::AArg:
case ConversionSpecifier::FreeBSDrArg:
case ConversionSpecifier::FreeBSDyArg:
+ case ConversionSpecifier::rArg:
+ case ConversionSpecifier::kArg:
return true;
default:
@@ -966,7 +1031,7 @@ bool PrintfSpecifier::hasValidAlternativeForm() const {
if (!HasAlternativeForm)
return true;
- // Alternate form flag only valid with the bBoxXaAeEfFgG conversions
+ // Alternate form flag only valid with the bBoxXaAeEfFgGrRkK conversions
switch (CS.getKind()) {
case ConversionSpecifier::bArg:
case ConversionSpecifier::BArg:
@@ -984,6 +1049,10 @@ bool PrintfSpecifier::hasValidAlternativeForm() const {
case ConversionSpecifier::GArg:
case ConversionSpecifier::FreeBSDrArg:
case ConversionSpecifier::FreeBSDyArg:
+ case ConversionSpecifier::rArg:
+ case ConversionSpecifier::RArg:
+ case ConversionSpecifier::kArg:
+ case ConversionSpecifier::KArg:
return true;
default:
@@ -995,7 +1064,7 @@ bool PrintfSpecifier::hasValidLeadingZeros() const {
if (!HasLeadingZeroes)
return true;
- // Leading zeroes flag only valid with the bBdiouxXaAeEfFgG conversions
+ // Leading zeroes flag only valid with the bBdiouxXaAeEfFgGrRkK conversions
switch (CS.getKind()) {
case ConversionSpecifier::bArg:
case ConversionSpecifier::BArg:
@@ -1018,6 +1087,10 @@ bool PrintfSpecifier::hasValidLeadingZeros() const {
case ConversionSpecifier::GArg:
case ConversionSpecifier::FreeBSDrArg:
case ConversionSpecifier::FreeBSDyArg:
+ case ConversionSpecifier::rArg:
+ case ConversionSpecifier::RArg:
+ case ConversionSpecifier::kArg:
+ case ConversionSpecifier::KArg:
return true;
default:
@@ -1044,6 +1117,8 @@ bool PrintfSpecifier::hasValidSpacePrefix() const {
case ConversionSpecifier::AArg:
case ConversionSpecifier::FreeBSDrArg:
case ConversionSpecifier::FreeBSDyArg:
+ case ConversionSpecifier::rArg:
+ case ConversionSpecifier::kArg:
return true;
default:
@@ -1089,7 +1164,7 @@ bool PrintfSpecifier::hasValidPrecision() const {
if (Precision.getHowSpecified() == OptionalAmount::NotSpecified)
return true;
- // Precision is only valid with the bBdiouxXaAeEfFgGsP conversions
+ // Precision is only valid with the bBdiouxXaAeEfFgGsPrRkK conversions
switch (CS.getKind()) {
case ConversionSpecifier::bArg:
case ConversionSpecifier::BArg:
@@ -1114,6 +1189,10 @@ bool PrintfSpecifier::hasValidPrecision() const {
case ConversionSpecifier::FreeBSDrArg:
case ConversionSpecifier::FreeBSDyArg:
case ConversionSpecifier::PArg:
+ case ConversionSpecifier::rArg:
+ case ConversionSpecifier::RArg:
+ case ConversionSpecifier::kArg:
+ case ConversionSpecifier::KArg:
return true;
default:
diff --git a/clang/lib/Analysis/ThreadSafetyCommon.cpp b/clang/lib/Analysis/ThreadSafetyCommon.cpp
index 2fe0f85897c3..33f1f466df24 100644
--- a/clang/lib/Analysis/ThreadSafetyCommon.cpp
+++ b/clang/lib/Analysis/ThreadSafetyCommon.cpp
@@ -995,7 +995,7 @@ void SExprBuilder::exitCFG(const CFGBlock *Last) {
IncompleteArgs.clear();
}
-/*
+#ifndef NDEBUG
namespace {
class TILPrinter :
@@ -1016,4 +1016,4 @@ void printSCFG(CFGWalker &Walker) {
} // namespace threadSafety
} // namespace clang
-*/
+#endif // NDEBUG
diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp
index 701f1ac852c2..e1ff0d92f6b2 100644
--- a/clang/lib/Analysis/UnsafeBufferUsage.cpp
+++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp
@@ -130,42 +130,42 @@ public:
bool TraverseGenericSelectionExpr(GenericSelectionExpr *Node) {
// These are unevaluated, except the result expression.
- if(ignoreUnevaluatedContext)
+ if (ignoreUnevaluatedContext)
return TraverseStmt(Node->getResultExpr());
return VisitorBase::TraverseGenericSelectionExpr(Node);
}
bool TraverseUnaryExprOrTypeTraitExpr(UnaryExprOrTypeTraitExpr *Node) {
// Unevaluated context.
- if(ignoreUnevaluatedContext)
+ if (ignoreUnevaluatedContext)
return true;
return VisitorBase::TraverseUnaryExprOrTypeTraitExpr(Node);
}
bool TraverseTypeOfExprTypeLoc(TypeOfExprTypeLoc Node) {
// Unevaluated context.
- if(ignoreUnevaluatedContext)
+ if (ignoreUnevaluatedContext)
return true;
return VisitorBase::TraverseTypeOfExprTypeLoc(Node);
}
bool TraverseDecltypeTypeLoc(DecltypeTypeLoc Node) {
// Unevaluated context.
- if(ignoreUnevaluatedContext)
+ if (ignoreUnevaluatedContext)
return true;
return VisitorBase::TraverseDecltypeTypeLoc(Node);
}
bool TraverseCXXNoexceptExpr(CXXNoexceptExpr *Node) {
// Unevaluated context.
- if(ignoreUnevaluatedContext)
+ if (ignoreUnevaluatedContext)
return true;
return VisitorBase::TraverseCXXNoexceptExpr(Node);
}
bool TraverseCXXTypeidExpr(CXXTypeidExpr *Node) {
// Unevaluated context.
- if(ignoreUnevaluatedContext)
+ if (ignoreUnevaluatedContext)
return true;
return VisitorBase::TraverseCXXTypeidExpr(Node);
}
@@ -213,24 +213,26 @@ private:
// Because we're dealing with raw pointers, let's define what we mean by that.
static auto hasPointerType() {
- return hasType(hasCanonicalType(pointerType()));
+ return hasType(hasCanonicalType(pointerType()));
}
-static auto hasArrayType() {
- return hasType(hasCanonicalType(arrayType()));
-}
+static auto hasArrayType() { return hasType(hasCanonicalType(arrayType())); }
-AST_MATCHER_P(Stmt, forEachDescendantEvaluatedStmt, internal::Matcher<Stmt>, innerMatcher) {
+AST_MATCHER_P(Stmt, forEachDescendantEvaluatedStmt, internal::Matcher<Stmt>,
+ innerMatcher) {
const DynTypedMatcher &DTM = static_cast<DynTypedMatcher>(innerMatcher);
- MatchDescendantVisitor Visitor(&DTM, Finder, Builder, ASTMatchFinder::BK_All, true);
+ MatchDescendantVisitor Visitor(&DTM, Finder, Builder, ASTMatchFinder::BK_All,
+ true);
return Visitor.findMatch(DynTypedNode::create(Node));
}
-AST_MATCHER_P(Stmt, forEachDescendantStmt, internal::Matcher<Stmt>, innerMatcher) {
+AST_MATCHER_P(Stmt, forEachDescendantStmt, internal::Matcher<Stmt>,
+ innerMatcher) {
const DynTypedMatcher &DTM = static_cast<DynTypedMatcher>(innerMatcher);
- MatchDescendantVisitor Visitor(&DTM, Finder, Builder, ASTMatchFinder::BK_All, false);
+ MatchDescendantVisitor Visitor(&DTM, Finder, Builder, ASTMatchFinder::BK_All,
+ false);
return Visitor.findMatch(DynTypedNode::create(Node));
}
@@ -268,10 +270,9 @@ static auto isInUnspecifiedLvalueContext(internal::Matcher<Expr> innerMatcher) {
hasLHS(innerMatcher)
)
));
-// clang-format on
+ // clang-format on
}
-
// Returns a matcher that matches any expression `e` such that `InnerMatcher`
// matches `e` and `e` is in an Unspecified Pointer Context (UPC).
static internal::Matcher<Stmt>
@@ -315,7 +316,7 @@ isInUnspecifiedPointerContext(internal::Matcher<Stmt> InnerMatcher) {
// clang-format on
return stmt(anyOf(CallArgMatcher, CastOperandMatcher, CompOperandMatcher,
- PtrSubtractionMatcher));
+ PtrSubtractionMatcher));
// FIXME: any more cases? (UPC excludes the RHS of an assignment. For now we
// don't have to check that.)
}
@@ -481,7 +482,9 @@ public:
#ifndef NDEBUG
StringRef getDebugName() const {
switch (K) {
-#define GADGET(x) case Kind::x: return #x;
+#define GADGET(x) \
+ case Kind::x: \
+ return #x;
#include "clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def"
}
llvm_unreachable("Unhandled Gadget::Kind enum");
@@ -502,7 +505,6 @@ private:
Kind K;
};
-
/// Warning gadgets correspond to unsafe code patterns that warrants
/// an immediate warning.
class WarningGadget : public Gadget {
@@ -513,10 +515,10 @@ public:
bool isWarningGadget() const final { return true; }
};
-/// Fixable gadgets correspond to code patterns that aren't always unsafe but need to be
-/// properly recognized in order to emit fixes. For example, if a raw pointer-type
-/// variable is replaced by a safe C++ container, every use of such variable must be
-/// carefully considered and possibly updated.
+/// Fixable gadgets correspond to code patterns that aren't always unsafe but
+/// need to be properly recognized in order to emit fixes. For example, if a raw
+/// pointer-type variable is replaced by a safe C++ container, every use of such
+/// variable must be carefully considered and possibly updated.
class FixableGadget : public Gadget {
public:
FixableGadget(Kind K) : Gadget(K) {}
@@ -531,20 +533,19 @@ public:
return std::nullopt;
}
- /// Returns a list of two elements where the first element is the LHS of a pointer assignment
- /// statement and the second element is the RHS. This two-element list represents the fact that
- /// the LHS buffer gets its bounds information from the RHS buffer. This information will be used
- /// later to group all those variables whose types must be modified together to prevent type
- /// mismatches.
+ /// Returns a list of two elements where the first element is the LHS of a
+ /// pointer assignment statement and the second element is the RHS. This
+ /// two-element list represents the fact that the LHS buffer gets its bounds
+ /// information from the RHS buffer. This information will be used later to
+ /// group all those variables whose types must be modified together to prevent
+ /// type mismatches.
virtual std::optional<std::pair<const VarDecl *, const VarDecl *>>
getStrategyImplications() const {
return std::nullopt;
}
};
-static auto toSupportedVariable() {
- return to(varDecl());
-}
+static auto toSupportedVariable() { return to(varDecl()); }
using FixableGadgetList = std::vector<std::unique_ptr<FixableGadget>>;
using WarningGadgetList = std::vector<std::unique_ptr<WarningGadget>>;
@@ -565,10 +566,10 @@ public:
}
static Matcher matcher() {
- return stmt(unaryOperator(
- hasOperatorName("++"),
- hasUnaryOperand(ignoringParenImpCasts(hasPointerType()))
- ).bind(OpTag));
+ return stmt(
+ unaryOperator(hasOperatorName("++"),
+ hasUnaryOperand(ignoringParenImpCasts(hasPointerType())))
+ .bind(OpTag));
}
const UnaryOperator *getBaseStmt() const override { return Op; }
@@ -600,10 +601,10 @@ public:
}
static Matcher matcher() {
- return stmt(unaryOperator(
- hasOperatorName("--"),
- hasUnaryOperand(ignoringParenImpCasts(hasPointerType()))
- ).bind(OpTag));
+ return stmt(
+ unaryOperator(hasOperatorName("--"),
+ hasUnaryOperand(ignoringParenImpCasts(hasPointerType())))
+ .bind(OpTag));
}
const UnaryOperator *getBaseStmt() const override { return Op; }
@@ -754,26 +755,25 @@ class PointerInitGadget : public FixableGadget {
private:
static constexpr const char *const PointerInitLHSTag = "ptrInitLHS";
static constexpr const char *const PointerInitRHSTag = "ptrInitRHS";
- const VarDecl * PtrInitLHS; // the LHS pointer expression in `PI`
- const DeclRefExpr * PtrInitRHS; // the RHS pointer expression in `PI`
+ const VarDecl *PtrInitLHS; // the LHS pointer expression in `PI`
+ const DeclRefExpr *PtrInitRHS; // the RHS pointer expression in `PI`
public:
PointerInitGadget(const MatchFinder::MatchResult &Result)
: FixableGadget(Kind::PointerInit),
- PtrInitLHS(Result.Nodes.getNodeAs<VarDecl>(PointerInitLHSTag)),
- PtrInitRHS(Result.Nodes.getNodeAs<DeclRefExpr>(PointerInitRHSTag)) {}
+ PtrInitLHS(Result.Nodes.getNodeAs<VarDecl>(PointerInitLHSTag)),
+ PtrInitRHS(Result.Nodes.getNodeAs<DeclRefExpr>(PointerInitRHSTag)) {}
static bool classof(const Gadget *G) {
return G->getKind() == Kind::PointerInit;
}
static Matcher matcher() {
- auto PtrInitStmt = declStmt(hasSingleDecl(varDecl(
- hasInitializer(ignoringImpCasts(declRefExpr(
- hasPointerType(),
- toSupportedVariable()).
- bind(PointerInitRHSTag)))).
- bind(PointerInitLHSTag)));
+ auto PtrInitStmt = declStmt(hasSingleDecl(
+ varDecl(hasInitializer(ignoringImpCasts(
+ declRefExpr(hasPointerType(), toSupportedVariable())
+ .bind(PointerInitRHSTag))))
+ .bind(PointerInitLHSTag)));
return stmt(PtrInitStmt);
}
@@ -793,8 +793,7 @@ public:
virtual std::optional<std::pair<const VarDecl *, const VarDecl *>>
getStrategyImplications() const override {
- return std::make_pair(PtrInitLHS,
- cast<VarDecl>(PtrInitRHS->getDecl()));
+ return std::make_pair(PtrInitLHS, cast<VarDecl>(PtrInitRHS->getDecl()));
}
};
@@ -807,8 +806,8 @@ class PtrToPtrAssignmentGadget : public FixableGadget {
private:
static constexpr const char *const PointerAssignLHSTag = "ptrLHS";
static constexpr const char *const PointerAssignRHSTag = "ptrRHS";
- const DeclRefExpr * PtrLHS; // the LHS pointer expression in `PA`
- const DeclRefExpr * PtrRHS; // the RHS pointer expression in `PA`
+ const DeclRefExpr *PtrLHS; // the LHS pointer expression in `PA`
+ const DeclRefExpr *PtrRHS; // the RHS pointer expression in `PA`
public:
PtrToPtrAssignmentGadget(const MatchFinder::MatchResult &Result)
@@ -821,13 +820,13 @@ public:
}
static Matcher matcher() {
- auto PtrAssignExpr = binaryOperator(allOf(hasOperatorName("="),
- hasRHS(ignoringParenImpCasts(declRefExpr(hasPointerType(),
- toSupportedVariable()).
- bind(PointerAssignRHSTag))),
- hasLHS(declRefExpr(hasPointerType(),
- toSupportedVariable()).
- bind(PointerAssignLHSTag))));
+ auto PtrAssignExpr = binaryOperator(
+ allOf(hasOperatorName("="),
+ hasRHS(ignoringParenImpCasts(
+ declRefExpr(hasPointerType(), toSupportedVariable())
+ .bind(PointerAssignRHSTag))),
+ hasLHS(declRefExpr(hasPointerType(), toSupportedVariable())
+ .bind(PointerAssignLHSTag))));
return stmt(isInUnspecifiedUntypedContext(PtrAssignExpr));
}
@@ -981,9 +980,8 @@ public:
static Matcher matcher() {
auto ArrayOrPtr = anyOf(hasPointerType(), hasArrayType());
- auto BaseIsArrayOrPtrDRE =
- hasBase(ignoringParenImpCasts(declRefExpr(ArrayOrPtr,
- toSupportedVariable())));
+ auto BaseIsArrayOrPtrDRE = hasBase(
+ ignoringParenImpCasts(declRefExpr(ArrayOrPtr, toSupportedVariable())));
auto Target =
arraySubscriptExpr(BaseIsArrayOrPtrDRE).bind(ULCArraySubscriptTag);
@@ -1025,9 +1023,9 @@ public:
static Matcher matcher() {
auto ArrayOrPtr = anyOf(hasPointerType(), hasArrayType());
- auto target = expr(
- ignoringParenImpCasts(declRefExpr(allOf(ArrayOrPtr,
- toSupportedVariable())).bind(DeclRefExprTag)));
+ auto target = expr(ignoringParenImpCasts(
+ declRefExpr(allOf(ArrayOrPtr, toSupportedVariable()))
+ .bind(DeclRefExprTag)));
return stmt(isInUnspecifiedPointerContext(target));
}
@@ -1036,9 +1034,7 @@ public:
virtual const Stmt *getBaseStmt() const override { return Node; }
- virtual DeclUseList getClaimedVarUseSites() const override {
- return {Node};
- }
+ virtual DeclUseList getClaimedVarUseSites() const override { return {Node}; }
};
class PointerDereferenceGadget : public FixableGadget {
@@ -1103,10 +1099,10 @@ public:
static Matcher matcher() {
return expr(isInUnspecifiedPointerContext(expr(ignoringImpCasts(
- unaryOperator(hasOperatorName("&"),
- hasUnaryOperand(arraySubscriptExpr(
- hasBase(ignoringParenImpCasts(declRefExpr(
- toSupportedVariable()))))))
+ unaryOperator(
+ hasOperatorName("&"),
+ hasUnaryOperand(arraySubscriptExpr(hasBase(
+ ignoringParenImpCasts(declRefExpr(toSupportedVariable()))))))
.bind(UPCAddressofArraySubscriptTag)))));
}
@@ -1195,13 +1191,13 @@ public:
class UPCPreIncrementGadget : public FixableGadget {
private:
static constexpr const char *const UPCPreIncrementTag =
- "PointerPreIncrementUnderUPC";
+ "PointerPreIncrementUnderUPC";
const UnaryOperator *Node; // the `++Ptr` node
public:
UPCPreIncrementGadget(const MatchFinder::MatchResult &Result)
- : FixableGadget(Kind::UPCPreIncrement),
- Node(Result.Nodes.getNodeAs<UnaryOperator>(UPCPreIncrementTag)) {
+ : FixableGadget(Kind::UPCPreIncrement),
+ Node(Result.Nodes.getNodeAs<UnaryOperator>(UPCPreIncrementTag)) {
assert(Node != nullptr && "Expecting a non-null matching result");
}
@@ -1215,10 +1211,9 @@ public:
// can have the matcher be general, so long as `getClaimedVarUseSites` does
// things right.
return stmt(isInUnspecifiedPointerContext(expr(ignoringImpCasts(
- unaryOperator(isPreInc(),
- hasUnaryOperand(declRefExpr(
- toSupportedVariable()))
- ).bind(UPCPreIncrementTag)))));
+ unaryOperator(isPreInc(),
+ hasUnaryOperand(declRefExpr(toSupportedVariable())))
+ .bind(UPCPreIncrementTag)))));
}
virtual std::optional<FixItList>
@@ -1782,9 +1777,9 @@ static SourceRange getSourceRangeToTokenEnd(const Decl *D,
const LangOptions &LangOpts) {
SourceLocation Begin = D->getBeginLoc();
SourceLocation
- End = // `D->getEndLoc` should always return the starting location of the
- // last token, so we should get the end of the token
- Lexer::getLocForEndOfToken(D->getEndLoc(), 0, SM, LangOpts);
+ End = // `D->getEndLoc` should always return the starting location of the
+ // last token, so we should get the end of the token
+ Lexer::getLocForEndOfToken(D->getEndLoc(), 0, SM, LangOpts);
return SourceRange(Begin, End);
}
@@ -1976,7 +1971,7 @@ PointerDereferenceGadget::getFixits(const FixitStrategy &S) const {
if (auto LocPastOperand =
getPastLoc(BaseDeclRefExpr, SM, Ctx.getLangOpts())) {
return FixItList{{FixItHint::CreateRemoval(derefRange),
- FixItHint::CreateInsertion(*LocPastOperand, "[0]")}};
+ FixItHint::CreateInsertion(*LocPastOperand, "[0]")}};
}
break;
}
@@ -2162,7 +2157,8 @@ FixVarInitializerWithSpan(const Expr *Init, ASTContext &Ctx,
// NULL pointer, we use the default constructor to initialize the span
// object, i.e., a `std:span` variable declaration with no initializer.
// So the fix-it is just to remove the initializer.
- if (Init->isNullPointerConstant(Ctx,
+ if (Init->isNullPointerConstant(
+ Ctx,
// FIXME: Why does this function not ask for `const ASTContext
// &`? It should. Maybe worth an NFC patch later.
Expr::NullPointerConstantValueDependence::
@@ -2230,8 +2226,10 @@ FixVarInitializerWithSpan(const Expr *Init, ASTContext &Ctx,
}
#ifndef NDEBUG
-#define DEBUG_NOTE_DECL_FAIL(D, Msg) \
-Handler.addDebugNoteForVar((D), (D)->getBeginLoc(), "failed to produce fixit for declaration '" + (D)->getNameAsString() + "'" + (Msg))
+#define DEBUG_NOTE_DECL_FAIL(D, Msg) \
+ Handler.addDebugNoteForVar((D), (D)->getBeginLoc(), \
+ "failed to produce fixit for declaration '" + \
+ (D)->getNameAsString() + "'" + (Msg))
#else
#define DEBUG_NOTE_DECL_FAIL(D, Msg)
#endif
@@ -2239,8 +2237,8 @@ Handler.addDebugNoteForVar((D), (D)->getBeginLoc(), "failed to produce fixit for
// For the given variable declaration with a pointer-to-T type, returns the text
// `std::span<T>`. If it is unable to generate the text, returns
// `std::nullopt`.
-static std::optional<std::string> createSpanTypeForVarDecl(const VarDecl *VD,
- const ASTContext &Ctx) {
+static std::optional<std::string>
+createSpanTypeForVarDecl(const VarDecl *VD, const ASTContext &Ctx) {
assert(VD->getType()->isPointerType());
std::optional<Qualifiers> PteTyQualifiers = std::nullopt;
@@ -2277,8 +2275,8 @@ static std::optional<std::string> createSpanTypeForVarDecl(const VarDecl *VD,
// the non-empty fix-it list, if fix-its are successfuly generated; empty
// list otherwise.
static FixItList fixLocalVarDeclWithSpan(const VarDecl *D, ASTContext &Ctx,
- const StringRef UserFillPlaceHolder,
- UnsafeBufferUsageHandler &Handler) {
+ const StringRef UserFillPlaceHolder,
+ UnsafeBufferUsageHandler &Handler) {
if (hasUnsupportedSpecifiers(D, Ctx.getSourceManager()))
return {};
@@ -2431,9 +2429,9 @@ createOverloadsForFixedParams(const FixitStrategy &S, const FunctionDecl *FD,
// print parameter name if provided:
if (IdentifierInfo *II = Parm->getIdentifier())
SS << ' ' << II->getName().str();
- } else if (auto ParmTypeText = getRangeText(
- getSourceRangeToTokenEnd(Parm, SM, LangOpts),
- SM, LangOpts)) {
+ } else if (auto ParmTypeText =
+ getRangeText(getSourceRangeToTokenEnd(Parm, SM, LangOpts),
+ SM, LangOpts)) {
// print the whole `Parm` without modification:
SS << ParmTypeText->str();
} else
@@ -2577,7 +2575,8 @@ static FixItList fixVariableWithSpan(const VarDecl *VD,
UnsafeBufferUsageHandler &Handler) {
const DeclStmt *DS = Tracker.lookupDecl(VD);
if (!DS) {
- DEBUG_NOTE_DECL_FAIL(VD, " : variables declared this way not implemented yet");
+ DEBUG_NOTE_DECL_FAIL(VD,
+ " : variables declared this way not implemented yet");
return {};
}
if (!DS->isSingleDecl()) {
@@ -2979,8 +2978,8 @@ void clang::checkUnsafeBufferUsage(const Decl *D,
#endif
assert(D && D->getBody());
- // We do not want to visit a Lambda expression defined inside a method independently.
- // Instead, it should be visited along with the outer method.
+ // We do not want to visit a Lambda expression defined inside a method
+ // independently. Instead, it should be visited along with the outer method.
// FIXME: do we want to do the same thing for `BlockDecl`s?
if (const auto *fd = dyn_cast<CXXMethodDecl>(D)) {
if (fd->getParent()->isLambda() && fd->getParent()->isLocalClass())
@@ -2990,7 +2989,7 @@ void clang::checkUnsafeBufferUsage(const Decl *D,
// Do not emit fixit suggestions for functions declared in an
// extern "C" block.
if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
- for (FunctionDecl *FReDecl : FD->redecls()) {
+ for (FunctionDecl *FReDecl : FD->redecls()) {
if (FReDecl->isExternC()) {
EmitSuggestions = false;
break;
@@ -3002,7 +3001,7 @@ void clang::checkUnsafeBufferUsage(const Decl *D,
FixableGadgetSets FixablesForAllVars;
auto [FixableGadgets, WarningGadgets, Tracker] =
- findGadgets(D, Handler, EmitSuggestions);
+ findGadgets(D, Handler, EmitSuggestions);
if (!EmitSuggestions) {
// Our job is very easy without suggestions. Just warn about
@@ -3055,36 +3054,36 @@ void clang::checkUnsafeBufferUsage(const Decl *D,
// Filter out non-local vars and vars with unclaimed DeclRefExpr-s.
for (auto it = FixablesForAllVars.byVar.cbegin();
it != FixablesForAllVars.byVar.cend();) {
- // FIXME: need to deal with global variables later
- if ((!it->first->isLocalVarDecl() && !isa<ParmVarDecl>(it->first))) {
+ // FIXME: need to deal with global variables later
+ if ((!it->first->isLocalVarDecl() && !isa<ParmVarDecl>(it->first))) {
#ifndef NDEBUG
- Handler.addDebugNoteForVar(
- it->first, it->first->getBeginLoc(),
- ("failed to produce fixit for '" + it->first->getNameAsString() +
- "' : neither local nor a parameter"));
+ Handler.addDebugNoteForVar(it->first, it->first->getBeginLoc(),
+ ("failed to produce fixit for '" +
+ it->first->getNameAsString() +
+ "' : neither local nor a parameter"));
#endif
- it = FixablesForAllVars.byVar.erase(it);
- } else if (it->first->getType().getCanonicalType()->isReferenceType()) {
+ it = FixablesForAllVars.byVar.erase(it);
+ } else if (it->first->getType().getCanonicalType()->isReferenceType()) {
#ifndef NDEBUG
- Handler.addDebugNoteForVar(it->first, it->first->getBeginLoc(),
- ("failed to produce fixit for '" +
- it->first->getNameAsString() +
- "' : has a reference type"));
+ Handler.addDebugNoteForVar(it->first, it->first->getBeginLoc(),
+ ("failed to produce fixit for '" +
+ it->first->getNameAsString() +
+ "' : has a reference type"));
#endif
- it = FixablesForAllVars.byVar.erase(it);
- } else if (Tracker.hasUnclaimedUses(it->first)) {
- it = FixablesForAllVars.byVar.erase(it);
- } else if (it->first->isInitCapture()) {
+ it = FixablesForAllVars.byVar.erase(it);
+ } else if (Tracker.hasUnclaimedUses(it->first)) {
+ it = FixablesForAllVars.byVar.erase(it);
+ } else if (it->first->isInitCapture()) {
#ifndef NDEBUG
- Handler.addDebugNoteForVar(
- it->first, it->first->getBeginLoc(),
- ("failed to produce fixit for '" + it->first->getNameAsString() +
- "' : init capture"));
+ Handler.addDebugNoteForVar(it->first, it->first->getBeginLoc(),
+ ("failed to produce fixit for '" +
+ it->first->getNameAsString() +
+ "' : init capture"));
#endif
- it = FixablesForAllVars.byVar.erase(it);
- } else {
- ++it;
- }
+ it = FixablesForAllVars.byVar.erase(it);
+ } else {
+ ++it;
+ }
}
#ifndef NDEBUG
@@ -3115,7 +3114,7 @@ void clang::checkUnsafeBufferUsage(const Decl *D,
for (auto it : FixablesForAllVars.byVar) {
for (const FixableGadget *fixable : it.second) {
std::optional<std::pair<const VarDecl *, const VarDecl *>> ImplPair =
- fixable->getStrategyImplications();
+ fixable->getStrategyImplications();
if (ImplPair) {
std::pair<const VarDecl *, const VarDecl *> Impl = std::move(*ImplPair);
PtrAssignmentGraph[Impl.first].insert(Impl.second);
@@ -3144,10 +3143,10 @@ void clang::checkUnsafeBufferUsage(const Decl *D,
for (const auto &[Var, ignore] : UnsafeOps.byVar) {
if (VisitedVarsDirected.find(Var) == VisitedVarsDirected.end()) {
- std::queue<const VarDecl*> QueueDirected{};
+ std::queue<const VarDecl *> QueueDirected{};
QueueDirected.push(Var);
- while(!QueueDirected.empty()) {
- const VarDecl* CurrentVar = QueueDirected.front();
+ while (!QueueDirected.empty()) {
+ const VarDecl *CurrentVar = QueueDirected.front();
QueueDirected.pop();
VisitedVarsDirected.insert(CurrentVar);
auto AdjacentNodes = PtrAssignmentGraph[CurrentVar];
@@ -3178,11 +3177,11 @@ void clang::checkUnsafeBufferUsage(const Decl *D,
for (const auto &[Var, ignore] : UnsafeOps.byVar) {
if (VisitedVars.find(Var) == VisitedVars.end()) {
VarGrpTy &VarGroup = Groups.emplace_back();
- std::queue<const VarDecl*> Queue{};
+ std::queue<const VarDecl *> Queue{};
Queue.push(Var);
- while(!Queue.empty()) {
- const VarDecl* CurrentVar = Queue.front();
+ while (!Queue.empty()) {
+ const VarDecl *CurrentVar = Queue.front();
Queue.pop();
VisitedVars.insert(CurrentVar);
VarGroup.push_back(CurrentVar);
diff --git a/clang/lib/Basic/Targets/Mips.h b/clang/lib/Basic/Targets/Mips.h
index f46b95abfd75..23d4e1b598fa 100644
--- a/clang/lib/Basic/Targets/Mips.h
+++ b/clang/lib/Basic/Targets/Mips.h
@@ -237,12 +237,14 @@ public:
case 'r': // CPU registers.
case 'd': // Equivalent to "r" unless generating MIPS16 code.
case 'y': // Equivalent to "r", backward compatibility only.
- case 'f': // floating-point registers.
case 'c': // $25 for indirect jumps
case 'l': // lo register
case 'x': // hilo register pair
Info.setAllowsRegister();
return true;
+ case 'f': // floating-point registers.
+ Info.setAllowsRegister();
+ return FloatABI != SoftFloat;
case 'I': // Signed 16-bit constant
case 'J': // Integer 0
case 'K': // Unsigned 16-bit constant
diff --git a/clang/lib/CodeGen/ABIInfo.cpp b/clang/lib/CodeGen/ABIInfo.cpp
index 1b56cf7c596d..efcff958ce54 100644
--- a/clang/lib/CodeGen/ABIInfo.cpp
+++ b/clang/lib/CodeGen/ABIInfo.cpp
@@ -184,6 +184,58 @@ ABIArgInfo ABIInfo::getNaturalAlignIndirectInReg(QualType Ty,
/*ByVal*/ false, Realign);
}
+void ABIInfo::appendAttributeMangling(TargetAttr *Attr,
+ raw_ostream &Out) const {
+ if (Attr->isDefaultVersion())
+ return;
+ appendAttributeMangling(Attr->getFeaturesStr(), Out);
+}
+
+void ABIInfo::appendAttributeMangling(TargetVersionAttr *Attr,
+ raw_ostream &Out) const {
+ appendAttributeMangling(Attr->getNamesStr(), Out);
+}
+
+void ABIInfo::appendAttributeMangling(TargetClonesAttr *Attr, unsigned Index,
+ raw_ostream &Out) const {
+ appendAttributeMangling(Attr->getFeatureStr(Index), Out);
+ Out << '.' << Attr->getMangledIndex(Index);
+}
+
+void ABIInfo::appendAttributeMangling(StringRef AttrStr,
+ raw_ostream &Out) const {
+ if (AttrStr == "default") {
+ Out << ".default";
+ return;
+ }
+
+ Out << '.';
+ const TargetInfo &TI = CGT.getTarget();
+ ParsedTargetAttr Info = TI.parseTargetAttr(AttrStr);
+
+ llvm::sort(Info.Features, [&TI](StringRef LHS, StringRef RHS) {
+ // Multiversioning doesn't allow "no-${feature}", so we can
+ // only have "+" prefixes here.
+ assert(LHS.starts_with("+") && RHS.starts_with("+") &&
+ "Features should always have a prefix.");
+ return TI.multiVersionSortPriority(LHS.substr(1)) >
+ TI.multiVersionSortPriority(RHS.substr(1));
+ });
+
+ bool IsFirst = true;
+ if (!Info.CPU.empty()) {
+ IsFirst = false;
+ Out << "arch_" << Info.CPU;
+ }
+
+ for (StringRef Feat : Info.Features) {
+ if (!IsFirst)
+ Out << '_';
+ IsFirst = false;
+ Out << Feat.substr(1);
+ }
+}
+
// Pin the vtable to this file.
SwiftABIInfo::~SwiftABIInfo() = default;
diff --git a/clang/lib/CodeGen/ABIInfo.h b/clang/lib/CodeGen/ABIInfo.h
index b9a5ef6e4366..ff4ae44a42c3 100644
--- a/clang/lib/CodeGen/ABIInfo.h
+++ b/clang/lib/CodeGen/ABIInfo.h
@@ -9,6 +9,7 @@
#ifndef LLVM_CLANG_LIB_CODEGEN_ABIINFO_H
#define LLVM_CLANG_LIB_CODEGEN_ABIINFO_H
+#include "clang/AST/Attr.h"
#include "clang/AST/CharUnits.h"
#include "clang/AST/Type.h"
#include "llvm/IR/CallingConv.h"
@@ -111,6 +112,15 @@ public:
CodeGen::ABIArgInfo getNaturalAlignIndirectInReg(QualType Ty,
bool Realign = false) const;
+
+ virtual void appendAttributeMangling(TargetAttr *Attr,
+ raw_ostream &Out) const;
+ virtual void appendAttributeMangling(TargetVersionAttr *Attr,
+ raw_ostream &Out) const;
+ virtual void appendAttributeMangling(TargetClonesAttr *Attr, unsigned Index,
+ raw_ostream &Out) const;
+ virtual void appendAttributeMangling(StringRef AttrStr,
+ raw_ostream &Out) const;
};
/// Target specific hooks for defining how a type should be passed or returned
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 54d7451a9d62..2d16e7cdc060 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -3217,7 +3217,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
case Builtin::BI__popcnt64:
case Builtin::BI__builtin_popcount:
case Builtin::BI__builtin_popcountl:
- case Builtin::BI__builtin_popcountll: {
+ case Builtin::BI__builtin_popcountll:
+ case Builtin::BI__builtin_popcountg: {
Value *ArgValue = EmitScalarExpr(E->getArg(0));
llvm::Type *ArgType = ArgValue->getType();
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index d05cf1c6e181..13f68237b464 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -3221,12 +3221,11 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
llvm::StructType *STy =
dyn_cast<llvm::StructType>(ArgI.getCoerceToType());
- llvm::TypeSize StructSize;
- llvm::TypeSize PtrElementSize;
if (ArgI.isDirect() && !ArgI.getCanBeFlattened() && STy &&
STy->getNumElements() > 1) {
- StructSize = CGM.getDataLayout().getTypeAllocSize(STy);
- PtrElementSize =
+ [[maybe_unused]] llvm::TypeSize StructSize =
+ CGM.getDataLayout().getTypeAllocSize(STy);
+ [[maybe_unused]] llvm::TypeSize PtrElementSize =
CGM.getDataLayout().getTypeAllocSize(ConvertTypeForMem(Ty));
if (STy->containsHomogeneousScalableVectorTypes()) {
assert(StructSize == PtrElementSize &&
@@ -5310,12 +5309,12 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
llvm::StructType *STy =
dyn_cast<llvm::StructType>(ArgInfo.getCoerceToType());
- llvm::Type *SrcTy = ConvertTypeForMem(I->Ty);
- llvm::TypeSize SrcTypeSize;
- llvm::TypeSize DstTypeSize;
if (STy && ArgInfo.isDirect() && !ArgInfo.getCanBeFlattened()) {
- SrcTypeSize = CGM.getDataLayout().getTypeAllocSize(SrcTy);
- DstTypeSize = CGM.getDataLayout().getTypeAllocSize(STy);
+ llvm::Type *SrcTy = ConvertTypeForMem(I->Ty);
+ [[maybe_unused]] llvm::TypeSize SrcTypeSize =
+ CGM.getDataLayout().getTypeAllocSize(SrcTy);
+ [[maybe_unused]] llvm::TypeSize DstTypeSize =
+ CGM.getDataLayout().getTypeAllocSize(STy);
if (STy->containsHomogeneousScalableVectorTypes()) {
assert(SrcTypeSize == DstTypeSize &&
"Only allow non-fractional movement of structure with "
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index d0d6202974fe..5190b22bcc16 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -33,6 +33,10 @@ using namespace CodeGen;
// Aggregate Expression Emitter
//===----------------------------------------------------------------------===//
+namespace llvm {
+extern cl::opt<bool> EnableSingleByteCoverage;
+} // namespace llvm
+
namespace {
class AggExprEmitter : public StmtVisitor<AggExprEmitter> {
CodeGenFunction &CGF;
@@ -1279,7 +1283,10 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) {
eval.begin(CGF);
CGF.EmitBlock(LHSBlock);
- CGF.incrementProfileCounter(E);
+ if (llvm::EnableSingleByteCoverage)
+ CGF.incrementProfileCounter(E->getTrueExpr());
+ else
+ CGF.incrementProfileCounter(E);
Visit(E->getTrueExpr());
eval.end(CGF);
@@ -1294,6 +1301,8 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) {
eval.begin(CGF);
CGF.EmitBlock(RHSBlock);
+ if (llvm::EnableSingleByteCoverage)
+ CGF.incrementProfileCounter(E->getFalseExpr());
Visit(E->getFalseExpr());
eval.end(CGF);
@@ -1302,6 +1311,8 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) {
E->getType());
CGF.EmitBlock(ContBlock);
+ if (llvm::EnableSingleByteCoverage)
+ CGF.incrementProfileCounter(E);
}
void AggExprEmitter::VisitChooseExpr(const ChooseExpr *CE) {
diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp
index 176a7e00141f..0266ba934da6 100644
--- a/clang/lib/CodeGen/CGExprComplex.cpp
+++ b/clang/lib/CodeGen/CGExprComplex.cpp
@@ -28,6 +28,10 @@ using namespace CodeGen;
// Complex Expression Emitter
//===----------------------------------------------------------------------===//
+namespace llvm {
+extern cl::opt<bool> EnableSingleByteCoverage;
+} // namespace llvm
+
typedef CodeGenFunction::ComplexPairTy ComplexPairTy;
/// Return the complex type that we are meant to emit.
@@ -1330,7 +1334,11 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) {
eval.begin(CGF);
CGF.EmitBlock(LHSBlock);
- CGF.incrementProfileCounter(E);
+ if (llvm::EnableSingleByteCoverage)
+ CGF.incrementProfileCounter(E->getTrueExpr());
+ else
+ CGF.incrementProfileCounter(E);
+
ComplexPairTy LHS = Visit(E->getTrueExpr());
LHSBlock = Builder.GetInsertBlock();
CGF.EmitBranch(ContBlock);
@@ -1338,9 +1346,13 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) {
eval.begin(CGF);
CGF.EmitBlock(RHSBlock);
+ if (llvm::EnableSingleByteCoverage)
+ CGF.incrementProfileCounter(E->getFalseExpr());
ComplexPairTy RHS = Visit(E->getFalseExpr());
RHSBlock = Builder.GetInsertBlock();
CGF.EmitBlock(ContBlock);
+ if (llvm::EnableSingleByteCoverage)
+ CGF.incrementProfileCounter(E);
eval.end(CGF);
// Create a PHI node for the real part.
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 10b745752204..8536570087ad 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -52,6 +52,10 @@ using llvm::Value;
// Scalar Expression Emitter
//===----------------------------------------------------------------------===//
+namespace llvm {
+extern cl::opt<bool> EnableSingleByteCoverage;
+} // namespace llvm
+
namespace {
/// Determine whether the given binary operation may overflow.
@@ -4925,8 +4929,13 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) {
// If the dead side doesn't have labels we need, just emit the Live part.
if (!CGF.ContainsLabel(dead)) {
- if (CondExprBool)
+ if (CondExprBool) {
+ if (llvm::EnableSingleByteCoverage) {
+ CGF.incrementProfileCounter(lhsExpr);
+ CGF.incrementProfileCounter(rhsExpr);
+ }
CGF.incrementProfileCounter(E);
+ }
Value *Result = Visit(live);
// If the live part is a throw expression, it acts like it has a void
@@ -5005,7 +5014,12 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) {
llvm::Value *CondV = CGF.EvaluateExprAsBool(condExpr);
llvm::Value *StepV = Builder.CreateZExtOrBitCast(CondV, CGF.Int64Ty);
- CGF.incrementProfileCounter(E, StepV);
+ if (llvm::EnableSingleByteCoverage) {
+ CGF.incrementProfileCounter(lhsExpr);
+ CGF.incrementProfileCounter(rhsExpr);
+ CGF.incrementProfileCounter(E);
+ } else
+ CGF.incrementProfileCounter(E, StepV);
llvm::Value *LHS = Visit(lhsExpr);
llvm::Value *RHS = Visit(rhsExpr);
@@ -5037,7 +5051,11 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) {
if (CGF.MCDCLogOpStack.empty())
CGF.maybeUpdateMCDCTestVectorBitmap(condExpr);
- CGF.incrementProfileCounter(E);
+ if (llvm::EnableSingleByteCoverage)
+ CGF.incrementProfileCounter(lhsExpr);
+ else
+ CGF.incrementProfileCounter(E);
+
eval.begin(CGF);
Value *LHS = Visit(lhsExpr);
eval.end(CGF);
@@ -5053,6 +5071,9 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) {
if (CGF.MCDCLogOpStack.empty())
CGF.maybeUpdateMCDCTestVectorBitmap(condExpr);
+ if (llvm::EnableSingleByteCoverage)
+ CGF.incrementProfileCounter(rhsExpr);
+
eval.begin(CGF);
Value *RHS = Visit(rhsExpr);
eval.end(CGF);
@@ -5071,6 +5092,11 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) {
PN->addIncoming(LHS, LHSBlock);
PN->addIncoming(RHS, RHSBlock);
+ // When single byte coverage mode is enabled, add a counter to continuation
+ // block.
+ if (llvm::EnableSingleByteCoverage)
+ CGF.incrementProfileCounter(E);
+
return PN;
}
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index af51875782c9..d0a3a716ad75 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -43,6 +43,10 @@ using namespace CodeGen;
// Statement Emission
//===----------------------------------------------------------------------===//
+namespace llvm {
+extern cl::opt<bool> EnableSingleByteCoverage;
+} // namespace llvm
+
void CodeGenFunction::EmitStopPoint(const Stmt *S) {
if (CGDebugInfo *DI = getDebugInfo()) {
SourceLocation Loc;
@@ -856,7 +860,10 @@ void CodeGenFunction::EmitIfStmt(const IfStmt &S) {
// Emit the 'then' code.
EmitBlock(ThenBlock);
- incrementProfileCounter(&S);
+ if (llvm::EnableSingleByteCoverage)
+ incrementProfileCounter(S.getThen());
+ else
+ incrementProfileCounter(&S);
{
RunCleanupsScope ThenScope(*this);
EmitStmt(S.getThen());
@@ -870,6 +877,9 @@ void CodeGenFunction::EmitIfStmt(const IfStmt &S) {
auto NL = ApplyDebugLocation::CreateEmpty(*this);
EmitBlock(ElseBlock);
}
+ // When single byte coverage mode is enabled, add a counter to else block.
+ if (llvm::EnableSingleByteCoverage)
+ incrementProfileCounter(Else);
{
RunCleanupsScope ElseScope(*this);
EmitStmt(Else);
@@ -883,6 +893,11 @@ void CodeGenFunction::EmitIfStmt(const IfStmt &S) {
// Emit the continuation block for code after the if.
EmitBlock(ContBlock, true);
+
+ // When single byte coverage mode is enabled, add a counter to continuation
+ // block.
+ if (llvm::EnableSingleByteCoverage)
+ incrementProfileCounter(&S);
}
void CodeGenFunction::EmitWhileStmt(const WhileStmt &S,
@@ -927,6 +942,10 @@ void CodeGenFunction::EmitWhileStmt(const WhileStmt &S,
SourceLocToDebugLoc(R.getEnd()),
checkIfLoopMustProgress(CondIsConstInt));
+ // When single byte coverage mode is enabled, add a counter to loop condition.
+ if (llvm::EnableSingleByteCoverage)
+ incrementProfileCounter(S.getCond());
+
// As long as the condition is true, go to the loop body.
llvm::BasicBlock *LoopBody = createBasicBlock("while.body");
if (EmitBoolCondBranch) {
@@ -959,7 +978,11 @@ void CodeGenFunction::EmitWhileStmt(const WhileStmt &S,
{
RunCleanupsScope BodyScope(*this);
EmitBlock(LoopBody);
- incrementProfileCounter(&S);
+ // When single byte coverage mode is enabled, add a counter to the body.
+ if (llvm::EnableSingleByteCoverage)
+ incrementProfileCounter(S.getBody());
+ else
+ incrementProfileCounter(&S);
EmitStmt(S.getBody());
}
@@ -981,6 +1004,11 @@ void CodeGenFunction::EmitWhileStmt(const WhileStmt &S,
// a branch, try to erase it.
if (!EmitBoolCondBranch)
SimplifyForwardingBlocks(LoopHeader.getBlock());
+
+ // When single byte coverage mode is enabled, add a counter to continuation
+ // block.
+ if (llvm::EnableSingleByteCoverage)
+ incrementProfileCounter(&S);
}
void CodeGenFunction::EmitDoStmt(const DoStmt &S,
@@ -996,13 +1024,19 @@ void CodeGenFunction::EmitDoStmt(const DoStmt &S,
// Emit the body of the loop.
llvm::BasicBlock *LoopBody = createBasicBlock("do.body");
- EmitBlockWithFallThrough(LoopBody, &S);
+ if (llvm::EnableSingleByteCoverage)
+ EmitBlockWithFallThrough(LoopBody, S.getBody());
+ else
+ EmitBlockWithFallThrough(LoopBody, &S);
{
RunCleanupsScope BodyScope(*this);
EmitStmt(S.getBody());
}
EmitBlock(LoopCond.getBlock());
+ // When single byte coverage mode is enabled, add a counter to loop condition.
+ if (llvm::EnableSingleByteCoverage)
+ incrementProfileCounter(S.getCond());
// C99 6.8.5.2: "The evaluation of the controlling expression takes place
// after each execution of the loop body."
@@ -1043,6 +1077,11 @@ void CodeGenFunction::EmitDoStmt(const DoStmt &S,
// emitting a branch, try to erase it.
if (!EmitBoolCondBranch)
SimplifyForwardingBlocks(LoopCond.getBlock());
+
+ // When single byte coverage mode is enabled, add a counter to continuation
+ // block.
+ if (llvm::EnableSingleByteCoverage)
+ incrementProfileCounter(&S);
}
void CodeGenFunction::EmitForStmt(const ForStmt &S,
@@ -1101,6 +1140,11 @@ void CodeGenFunction::EmitForStmt(const ForStmt &S,
BreakContinueStack.back().ContinueBlock = Continue;
}
+ // When single byte coverage mode is enabled, add a counter to loop
+ // condition.
+ if (llvm::EnableSingleByteCoverage)
+ incrementProfileCounter(S.getCond());
+
llvm::BasicBlock *ExitBlock = LoopExit.getBlock();
// If there are any cleanups between here and the loop-exit scope,
// create a block to stage a loop exit along.
@@ -1131,8 +1175,12 @@ void CodeGenFunction::EmitForStmt(const ForStmt &S,
// Treat it as a non-zero constant. Don't even create a new block for the
// body, just fall into it.
}
- incrementProfileCounter(&S);
+ // When single byte coverage mode is enabled, add a counter to the body.
+ if (llvm::EnableSingleByteCoverage)
+ incrementProfileCounter(S.getBody());
+ else
+ incrementProfileCounter(&S);
{
// Create a separate cleanup scope for the body, in case it is not
// a compound statement.
@@ -1144,6 +1192,8 @@ void CodeGenFunction::EmitForStmt(const ForStmt &S,
if (S.getInc()) {
EmitBlock(Continue.getBlock());
EmitStmt(S.getInc());
+ if (llvm::EnableSingleByteCoverage)
+ incrementProfileCounter(S.getInc());
}
BreakContinueStack.pop_back();
@@ -1159,6 +1209,11 @@ void CodeGenFunction::EmitForStmt(const ForStmt &S,
// Emit the fall-through block.
EmitBlock(LoopExit.getBlock(), true);
+
+ // When single byte coverage mode is enabled, add a counter to continuation
+ // block.
+ if (llvm::EnableSingleByteCoverage)
+ incrementProfileCounter(&S);
}
void
@@ -1211,7 +1266,10 @@ CodeGenFunction::EmitCXXForRangeStmt(const CXXForRangeStmt &S,
}
EmitBlock(ForBody);
- incrementProfileCounter(&S);
+ if (llvm::EnableSingleByteCoverage)
+ incrementProfileCounter(S.getBody());
+ else
+ incrementProfileCounter(&S);
// Create a block for the increment. In case of a 'continue', we jump there.
JumpDest Continue = getJumpDestInCurrentScope("for.inc");
@@ -1241,6 +1299,11 @@ CodeGenFunction::EmitCXXForRangeStmt(const CXXForRangeStmt &S,
// Emit the fall-through block.
EmitBlock(LoopExit.getBlock(), true);
+
+ // When single byte coverage mode is enabled, add a counter to continuation
+ // block.
+ if (llvm::EnableSingleByteCoverage)
+ incrementProfileCounter(&S);
}
void CodeGenFunction::EmitReturnOfRValue(RValue RV, QualType Ty) {
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 1ad905078d34..b87fc86f4e63 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -52,6 +52,10 @@
using namespace clang;
using namespace CodeGen;
+namespace llvm {
+extern cl::opt<bool> EnableSingleByteCoverage;
+} // namespace llvm
+
/// shouldEmitLifetimeMarkers - Decide whether we need emit the life-time
/// markers.
static bool shouldEmitLifetimeMarkers(const CodeGenOptions &CGOpts,
@@ -1270,7 +1274,10 @@ void CodeGenFunction::EmitFunctionBody(const Stmt *Body) {
void CodeGenFunction::EmitBlockWithFallThrough(llvm::BasicBlock *BB,
const Stmt *S) {
llvm::BasicBlock *SkipCountBB = nullptr;
- if (HaveInsertPoint() && CGM.getCodeGenOpts().hasProfileClangInstr()) {
+ // Do not skip over the instrumentation when single byte coverage mode is
+ // enabled.
+ if (HaveInsertPoint() && CGM.getCodeGenOpts().hasProfileClangInstr() &&
+ !llvm::EnableSingleByteCoverage) {
// When instrumenting for profiling, the fallthrough to certain
// statements needs to skip over the instrumentation code so that we
// get an accurate count.
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index b2800f699ff4..06327a184717 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -1545,7 +1545,7 @@ public:
if (CGM.getCodeGenOpts().hasProfileClangInstr() &&
!CurFn->hasFnAttribute(llvm::Attribute::NoProfile) &&
!CurFn->hasFnAttribute(llvm::Attribute::SkipProfile))
- PGO.emitCounterIncrement(Builder, S, StepV);
+ PGO.emitCounterSetOrIncrement(Builder, S, StepV);
PGO.setCurrentStmt(S);
}
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 95e457bef28e..82a97ecfaa00 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -397,8 +397,8 @@ CodeGenModule::CodeGenModule(ASTContext &C,
// Enable TBAA unless it's suppressed. ThreadSanitizer needs TBAA even at O0.
if (LangOpts.Sanitize.has(SanitizerKind::Thread) ||
(!CodeGenOpts.RelaxedAliasing && CodeGenOpts.OptimizationLevel > 0))
- TBAA.reset(new CodeGenTBAA(Context, TheModule, CodeGenOpts, getLangOpts(),
- getCXXABI().getMangleContext()));
+ TBAA.reset(new CodeGenTBAA(Context, getTypes(), TheModule, CodeGenOpts,
+ getLangOpts(), getCXXABI().getMangleContext()));
// If debug info or coverage generation is enabled, create the CGDebugInfo
// object.
@@ -858,6 +858,7 @@ void CodeGenModule::Release() {
checkAliases();
EmitDeferredUnusedCoverageMappings();
CodeGenPGO(*this).setValueProfilingFlag(getModule());
+ CodeGenPGO(*this).setProfileVersion(getModule());
if (CoverageMapping)
CoverageMapping->emit();
if (CodeGenOpts.SanitizeCfiCrossDso) {
@@ -1726,59 +1727,6 @@ static void AppendCPUSpecificCPUDispatchMangling(const CodeGenModule &CGM,
Out << ".resolver";
}
-static void AppendTargetVersionMangling(const CodeGenModule &CGM,
- const TargetVersionAttr *Attr,
- raw_ostream &Out) {
- if (Attr->isDefaultVersion()) {
- Out << ".default";
- return;
- }
- Out << "._";
- const TargetInfo &TI = CGM.getTarget();
- llvm::SmallVector<StringRef, 8> Feats;
- Attr->getFeatures(Feats);
- llvm::stable_sort(Feats, [&TI](const StringRef FeatL, const StringRef FeatR) {
- return TI.multiVersionSortPriority(FeatL) <
- TI.multiVersionSortPriority(FeatR);
- });
- for (const auto &Feat : Feats) {
- Out << 'M';
- Out << Feat;
- }
-}
-
-static void AppendTargetMangling(const CodeGenModule &CGM,
- const TargetAttr *Attr, raw_ostream &Out) {
- if (Attr->isDefaultVersion())
- return;
-
- Out << '.';
- const TargetInfo &Target = CGM.getTarget();
- ParsedTargetAttr Info = Target.parseTargetAttr(Attr->getFeaturesStr());
- llvm::sort(Info.Features, [&Target](StringRef LHS, StringRef RHS) {
- // Multiversioning doesn't allow "no-${feature}", so we can
- // only have "+" prefixes here.
- assert(LHS.starts_with("+") && RHS.starts_with("+") &&
- "Features should always have a prefix.");
- return Target.multiVersionSortPriority(LHS.substr(1)) >
- Target.multiVersionSortPriority(RHS.substr(1));
- });
-
- bool IsFirst = true;
-
- if (!Info.CPU.empty()) {
- IsFirst = false;
- Out << "arch_" << Info.CPU;
- }
-
- for (StringRef Feat : Info.Features) {
- if (!IsFirst)
- Out << '_';
- IsFirst = false;
- Out << Feat.substr(1);
- }
-}
-
// Returns true if GD is a function decl with internal linkage and
// needs a unique suffix after the mangled name.
static bool isUniqueInternalLinkageDecl(GlobalDecl GD,
@@ -1788,41 +1736,6 @@ static bool isUniqueInternalLinkageDecl(GlobalDecl GD,
(CGM.getFunctionLinkage(GD) == llvm::GlobalValue::InternalLinkage);
}
-static void AppendTargetClonesMangling(const CodeGenModule &CGM,
- const TargetClonesAttr *Attr,
- unsigned VersionIndex,
- raw_ostream &Out) {
- const TargetInfo &TI = CGM.getTarget();
- if (TI.getTriple().isAArch64()) {
- StringRef FeatureStr = Attr->getFeatureStr(VersionIndex);
- if (FeatureStr == "default") {
- Out << ".default";
- return;
- }
- Out << "._";
- SmallVector<StringRef, 8> Features;
- FeatureStr.split(Features, "+");
- llvm::stable_sort(Features,
- [&TI](const StringRef FeatL, const StringRef FeatR) {
- return TI.multiVersionSortPriority(FeatL) <
- TI.multiVersionSortPriority(FeatR);
- });
- for (auto &Feat : Features) {
- Out << 'M';
- Out << Feat;
- }
- } else {
- Out << '.';
- StringRef FeatureStr = Attr->getFeatureStr(VersionIndex);
- if (FeatureStr.starts_with("arch="))
- Out << "arch_" << FeatureStr.substr(sizeof("arch=") - 1);
- else
- Out << FeatureStr;
-
- Out << '.' << Attr->getMangledIndex(VersionIndex);
- }
-}
-
static std::string getMangledNameImpl(CodeGenModule &CGM, GlobalDecl GD,
const NamedDecl *ND,
bool OmitMultiVersionMangling = false) {
@@ -1876,16 +1789,25 @@ static std::string getMangledNameImpl(CodeGenModule &CGM, GlobalDecl GD,
FD->getAttr<CPUSpecificAttr>(),
GD.getMultiVersionIndex(), Out);
break;
- case MultiVersionKind::Target:
- AppendTargetMangling(CGM, FD->getAttr<TargetAttr>(), Out);
+ case MultiVersionKind::Target: {
+ auto *Attr = FD->getAttr<TargetAttr>();
+ const ABIInfo &Info = CGM.getTargetCodeGenInfo().getABIInfo();
+ Info.appendAttributeMangling(Attr, Out);
break;
- case MultiVersionKind::TargetVersion:
- AppendTargetVersionMangling(CGM, FD->getAttr<TargetVersionAttr>(), Out);
+ }
+ case MultiVersionKind::TargetVersion: {
+ auto *Attr = FD->getAttr<TargetVersionAttr>();
+ const ABIInfo &Info = CGM.getTargetCodeGenInfo().getABIInfo();
+ Info.appendAttributeMangling(Attr, Out);
break;
- case MultiVersionKind::TargetClones:
- AppendTargetClonesMangling(CGM, FD->getAttr<TargetClonesAttr>(),
- GD.getMultiVersionIndex(), Out);
+ }
+ case MultiVersionKind::TargetClones: {
+ auto *Attr = FD->getAttr<TargetClonesAttr>();
+ unsigned Index = GD.getMultiVersionIndex();
+ const ABIInfo &Info = CGM.getTargetCodeGenInfo().getABIInfo();
+ Info.appendAttributeMangling(Attr, Index, Out);
break;
+ }
case MultiVersionKind::None:
llvm_unreachable("None multiversion type isn't valid here");
}
diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
index 8aebd3557690..2619edfeb7dc 100644
--- a/clang/lib/CodeGen/CodeGenPGO.cpp
+++ b/clang/lib/CodeGen/CodeGenPGO.cpp
@@ -23,6 +23,10 @@
#include "llvm/Support/MD5.h"
#include <optional>
+namespace llvm {
+extern cl::opt<bool> EnableSingleByteCoverage;
+} // namespace llvm
+
static llvm::cl::opt<bool>
EnableValueProfiling("enable-value-profiling",
llvm::cl::desc("Enable value profiling"),
@@ -346,6 +350,14 @@ struct MapRegionCounters : public RecursiveASTVisitor<MapRegionCounters> {
return Base::VisitBinaryOperator(S);
}
+ bool VisitConditionalOperator(ConditionalOperator *S) {
+ if (llvm::EnableSingleByteCoverage && S->getTrueExpr())
+ CounterMap[S->getTrueExpr()] = NextCounter++;
+ if (llvm::EnableSingleByteCoverage && S->getFalseExpr())
+ CounterMap[S->getFalseExpr()] = NextCounter++;
+ return Base::VisitConditionalOperator(S);
+ }
+
/// Include \p S in the function hash.
bool VisitStmt(Stmt *S) {
auto Type = updateCounterMappings(S);
@@ -361,8 +373,21 @@ struct MapRegionCounters : public RecursiveASTVisitor<MapRegionCounters> {
if (Hash.getHashVersion() == PGO_HASH_V1)
return Base::TraverseIfStmt(If);
+ // When single byte coverage mode is enabled, add a counter to then and
+ // else.
+ bool NoSingleByteCoverage = !llvm::EnableSingleByteCoverage;
+ for (Stmt *CS : If->children()) {
+ if (!CS || NoSingleByteCoverage)
+ continue;
+ if (CS == If->getThen())
+ CounterMap[If->getThen()] = NextCounter++;
+ else if (CS == If->getElse())
+ CounterMap[If->getElse()] = NextCounter++;
+ }
+
// Otherwise, keep track of which branch we're in while traversing.
VisitStmt(If);
+
for (Stmt *CS : If->children()) {
if (!CS)
continue;
@@ -376,6 +401,81 @@ struct MapRegionCounters : public RecursiveASTVisitor<MapRegionCounters> {
return true;
}
+ bool TraverseWhileStmt(WhileStmt *While) {
+ // When single byte coverage mode is enabled, add a counter to condition and
+ // body.
+ bool NoSingleByteCoverage = !llvm::EnableSingleByteCoverage;
+ for (Stmt *CS : While->children()) {
+ if (!CS || NoSingleByteCoverage)
+ continue;
+ if (CS == While->getCond())
+ CounterMap[While->getCond()] = NextCounter++;
+ else if (CS == While->getBody())
+ CounterMap[While->getBody()] = NextCounter++;
+ }
+
+ Base::TraverseWhileStmt(While);
+ if (Hash.getHashVersion() != PGO_HASH_V1)
+ Hash.combine(PGOHash::EndOfScope);
+ return true;
+ }
+
+ bool TraverseDoStmt(DoStmt *Do) {
+ // When single byte coverage mode is enabled, add a counter to condition and
+ // body.
+ bool NoSingleByteCoverage = !llvm::EnableSingleByteCoverage;
+ for (Stmt *CS : Do->children()) {
+ if (!CS || NoSingleByteCoverage)
+ continue;
+ if (CS == Do->getCond())
+ CounterMap[Do->getCond()] = NextCounter++;
+ else if (CS == Do->getBody())
+ CounterMap[Do->getBody()] = NextCounter++;
+ }
+
+ Base::TraverseDoStmt(Do);
+ if (Hash.getHashVersion() != PGO_HASH_V1)
+ Hash.combine(PGOHash::EndOfScope);
+ return true;
+ }
+
+ bool TraverseForStmt(ForStmt *For) {
+ // When single byte coverage mode is enabled, add a counter to condition,
+ // increment and body.
+ bool NoSingleByteCoverage = !llvm::EnableSingleByteCoverage;
+ for (Stmt *CS : For->children()) {
+ if (!CS || NoSingleByteCoverage)
+ continue;
+ if (CS == For->getCond())
+ CounterMap[For->getCond()] = NextCounter++;
+ else if (CS == For->getInc())
+ CounterMap[For->getInc()] = NextCounter++;
+ else if (CS == For->getBody())
+ CounterMap[For->getBody()] = NextCounter++;
+ }
+
+ Base::TraverseForStmt(For);
+ if (Hash.getHashVersion() != PGO_HASH_V1)
+ Hash.combine(PGOHash::EndOfScope);
+ return true;
+ }
+
+ bool TraverseCXXForRangeStmt(CXXForRangeStmt *ForRange) {
+ // When single byte coverage mode is enabled, add a counter to body.
+ bool NoSingleByteCoverage = !llvm::EnableSingleByteCoverage;
+ for (Stmt *CS : ForRange->children()) {
+ if (!CS || NoSingleByteCoverage)
+ continue;
+ if (CS == ForRange->getBody())
+ CounterMap[ForRange->getBody()] = NextCounter++;
+ }
+
+ Base::TraverseCXXForRangeStmt(ForRange);
+ if (Hash.getHashVersion() != PGO_HASH_V1)
+ Hash.combine(PGOHash::EndOfScope);
+ return true;
+ }
+
// If the statement type \p N is nestable, and its nesting impacts profile
// stability, define a custom traversal which tracks the end of the statement
// in the hash (provided we're not using the V1 hash).
@@ -387,10 +487,6 @@ struct MapRegionCounters : public RecursiveASTVisitor<MapRegionCounters> {
return true; \
}
- DEFINE_NESTABLE_TRAVERSAL(WhileStmt)
- DEFINE_NESTABLE_TRAVERSAL(DoStmt)
- DEFINE_NESTABLE_TRAVERSAL(ForStmt)
- DEFINE_NESTABLE_TRAVERSAL(CXXForRangeStmt)
DEFINE_NESTABLE_TRAVERSAL(ObjCForCollectionStmt)
DEFINE_NESTABLE_TRAVERSAL(CXXTryStmt)
DEFINE_NESTABLE_TRAVERSAL(CXXCatchStmt)
@@ -1094,8 +1190,8 @@ CodeGenPGO::applyFunctionAttributes(llvm::IndexedInstrProfReader *PGOReader,
Fn->setEntryCount(FunctionCount);
}
-void CodeGenPGO::emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S,
- llvm::Value *StepV) {
+void CodeGenPGO::emitCounterSetOrIncrement(CGBuilderTy &Builder, const Stmt *S,
+ llvm::Value *StepV) {
if (!RegionCounterMap || !Builder.GetInsertBlock())
return;
@@ -1105,13 +1201,19 @@ void CodeGenPGO::emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S,
Builder.getInt64(FunctionHash),
Builder.getInt32(NumRegionCounters),
Builder.getInt32(Counter), StepV};
- if (!StepV)
- Builder.CreateCall(CGM.getIntrinsic(llvm::Intrinsic::instrprof_increment),
+
+ if (llvm::EnableSingleByteCoverage)
+ Builder.CreateCall(CGM.getIntrinsic(llvm::Intrinsic::instrprof_cover),
ArrayRef(Args, 4));
- else
- Builder.CreateCall(
- CGM.getIntrinsic(llvm::Intrinsic::instrprof_increment_step),
- ArrayRef(Args));
+ else {
+ if (!StepV)
+ Builder.CreateCall(CGM.getIntrinsic(llvm::Intrinsic::instrprof_increment),
+ ArrayRef(Args, 4));
+ else
+ Builder.CreateCall(
+ CGM.getIntrinsic(llvm::Intrinsic::instrprof_increment_step),
+ ArrayRef(Args));
+ }
}
bool CodeGenPGO::canEmitMCDCCoverage(const CGBuilderTy &Builder) {
@@ -1222,6 +1324,30 @@ void CodeGenPGO::setValueProfilingFlag(llvm::Module &M) {
uint32_t(EnableValueProfiling));
}
+void CodeGenPGO::setProfileVersion(llvm::Module &M) {
+ if (CGM.getCodeGenOpts().hasProfileClangInstr() &&
+ llvm::EnableSingleByteCoverage) {
+ const StringRef VarName(INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR));
+ llvm::Type *IntTy64 = llvm::Type::getInt64Ty(M.getContext());
+ uint64_t ProfileVersion =
+ (INSTR_PROF_RAW_VERSION | VARIANT_MASK_BYTE_COVERAGE);
+
+ auto IRLevelVersionVariable = new llvm::GlobalVariable(
+ M, IntTy64, true, llvm::GlobalValue::WeakAnyLinkage,
+ llvm::Constant::getIntegerValue(IntTy64,
+ llvm::APInt(64, ProfileVersion)),
+ VarName);
+
+ IRLevelVersionVariable->setVisibility(llvm::GlobalValue::DefaultVisibility);
+ llvm::Triple TT(M.getTargetTriple());
+ if (TT.supportsCOMDAT()) {
+ IRLevelVersionVariable->setLinkage(llvm::GlobalValue::ExternalLinkage);
+ IRLevelVersionVariable->setComdat(M.getOrInsertComdat(VarName));
+ }
+ IRLevelVersionVariable->setDSOLocal(true);
+ }
+}
+
// This method either inserts a call to the profile run-time during
// instrumentation or puts profile data into metadata for PGO use.
void CodeGenPGO::valueProfile(CGBuilderTy &Builder, uint32_t ValueKind,
diff --git a/clang/lib/CodeGen/CodeGenPGO.h b/clang/lib/CodeGen/CodeGenPGO.h
index d3c2b277238f..036fbf6815a4 100644
--- a/clang/lib/CodeGen/CodeGenPGO.h
+++ b/clang/lib/CodeGen/CodeGenPGO.h
@@ -94,6 +94,8 @@ public:
// Set a module flag indicating if value profiling is enabled.
void setValueProfilingFlag(llvm::Module &M);
+ void setProfileVersion(llvm::Module &M);
+
private:
void setFuncName(llvm::Function *Fn);
void setFuncName(StringRef Name, llvm::GlobalValue::LinkageTypes Linkage);
@@ -108,8 +110,8 @@ private:
bool canEmitMCDCCoverage(const CGBuilderTy &Builder);
public:
- void emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S,
- llvm::Value *StepV);
+ void emitCounterSetOrIncrement(CGBuilderTy &Builder, const Stmt *S,
+ llvm::Value *StepV);
void emitMCDCTestVectorBitmapUpdate(CGBuilderTy &Builder, const Expr *S,
Address MCDCCondBitmapAddr);
void emitMCDCParameters(CGBuilderTy &Builder);
diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp b/clang/lib/CodeGen/CodeGenTBAA.cpp
index dc288bc3f615..8a0816121939 100644
--- a/clang/lib/CodeGen/CodeGenTBAA.cpp
+++ b/clang/lib/CodeGen/CodeGenTBAA.cpp
@@ -15,6 +15,8 @@
//===----------------------------------------------------------------------===//
#include "CodeGenTBAA.h"
+#include "CGRecordLayout.h"
+#include "CodeGenTypes.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/Attr.h"
#include "clang/AST/Mangle.h"
@@ -26,16 +28,16 @@
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
using namespace clang;
using namespace CodeGen;
-CodeGenTBAA::CodeGenTBAA(ASTContext &Ctx, llvm::Module &M,
- const CodeGenOptions &CGO,
+CodeGenTBAA::CodeGenTBAA(ASTContext &Ctx, CodeGenTypes &CGTypes,
+ llvm::Module &M, const CodeGenOptions &CGO,
const LangOptions &Features, MangleContext &MContext)
- : Context(Ctx), Module(M), CodeGenOpts(CGO),
- Features(Features), MContext(MContext), MDHelper(M.getContext()),
- Root(nullptr), Char(nullptr)
-{}
+ : Context(Ctx), CGTypes(CGTypes), Module(M), CodeGenOpts(CGO),
+ Features(Features), MContext(MContext), MDHelper(M.getContext()),
+ Root(nullptr), Char(nullptr) {}
CodeGenTBAA::~CodeGenTBAA() {
}
@@ -294,14 +296,34 @@ CodeGenTBAA::CollectFields(uint64_t BaseOffset,
return false;
const ASTRecordLayout &Layout = Context.getASTRecordLayout(RD);
+ const CGRecordLayout &CGRL = CGTypes.getCGRecordLayout(RD);
unsigned idx = 0;
- for (RecordDecl::field_iterator i = RD->field_begin(),
- e = RD->field_end(); i != e; ++i, ++idx) {
- if ((*i)->isZeroSize(Context) || (*i)->isUnnamedBitfield())
+ for (RecordDecl::field_iterator i = RD->field_begin(), e = RD->field_end();
+ i != e; ++i, ++idx) {
+ if ((*i)->isZeroSize(Context))
continue;
- uint64_t Offset = BaseOffset +
- Layout.getFieldOffset(idx) / Context.getCharWidth();
+
+ uint64_t Offset =
+ BaseOffset + Layout.getFieldOffset(idx) / Context.getCharWidth();
+
+ // Create a single field for consecutive named bitfields using char as
+ // base type.
+ if ((*i)->isBitField()) {
+ const CGBitFieldInfo &Info = CGRL.getBitFieldInfo(*i);
+ if (Info.Offset != 0)
+ continue;
+ unsigned CurrentBitFieldSize = Info.StorageSize;
+ uint64_t Size =
+ llvm::divideCeil(CurrentBitFieldSize, Context.getCharWidth());
+ llvm::MDNode *TBAAType = getChar();
+ llvm::MDNode *TBAATag =
+ getAccessTagInfo(TBAAAccessInfo(TBAAType, Size));
+ Fields.push_back(
+ llvm::MDBuilder::TBAAStructField(Offset, Size, TBAATag));
+ continue;
+ }
+
QualType FieldQTy = i->getType();
if (!CollectFields(Offset, FieldQTy, Fields,
MayAlias || TypeHasMayAlias(FieldQTy)))
diff --git a/clang/lib/CodeGen/CodeGenTBAA.h b/clang/lib/CodeGen/CodeGenTBAA.h
index a65963596fe9..aa6da2731a41 100644
--- a/clang/lib/CodeGen/CodeGenTBAA.h
+++ b/clang/lib/CodeGen/CodeGenTBAA.h
@@ -29,6 +29,7 @@ namespace clang {
class Type;
namespace CodeGen {
+class CodeGenTypes;
// TBAAAccessKind - A kind of TBAA memory access descriptor.
enum class TBAAAccessKind : unsigned {
@@ -115,6 +116,7 @@ struct TBAAAccessInfo {
/// while lowering AST types to LLVM types.
class CodeGenTBAA {
ASTContext &Context;
+ CodeGenTypes &CGTypes;
llvm::Module &Module;
const CodeGenOptions &CodeGenOpts;
const LangOptions &Features;
@@ -167,8 +169,9 @@ class CodeGenTBAA {
llvm::MDNode *getBaseTypeInfoHelper(const Type *Ty);
public:
- CodeGenTBAA(ASTContext &Ctx, llvm::Module &M, const CodeGenOptions &CGO,
- const LangOptions &Features, MangleContext &MContext);
+ CodeGenTBAA(ASTContext &Ctx, CodeGenTypes &CGTypes, llvm::Module &M,
+ const CodeGenOptions &CGO, const LangOptions &Features,
+ MangleContext &MContext);
~CodeGenTBAA();
/// getTypeInfo - Get metadata used to describe accesses to objects of the
diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp
index e25a92758f32..71215da362d3 100644
--- a/clang/lib/CodeGen/CoverageMappingGen.cpp
+++ b/clang/lib/CodeGen/CoverageMappingGen.cpp
@@ -31,6 +31,14 @@
// is textually included.
#define COVMAP_V3
+namespace llvm {
+cl::opt<bool>
+ EnableSingleByteCoverage("enable-single-byte-coverage",
+ llvm::cl::ZeroOrMore,
+ llvm::cl::desc("Enable single byte coverage"),
+ llvm::cl::Hidden, llvm::cl::init(false));
+} // namespace llvm
+
static llvm::cl::opt<bool> EmptyLineCommentCoverage(
"emptyline-comment-coverage",
llvm::cl::desc("Emit emptylines and comment lines as skipped regions (only "
@@ -832,16 +840,22 @@ struct CounterCoverageMappingBuilder
/// Return a counter for the subtraction of \c RHS from \c LHS
Counter subtractCounters(Counter LHS, Counter RHS, bool Simplify = true) {
+ assert(!llvm::EnableSingleByteCoverage &&
+ "cannot add counters when single byte coverage mode is enabled");
return Builder.subtract(LHS, RHS, Simplify);
}
/// Return a counter for the sum of \c LHS and \c RHS.
Counter addCounters(Counter LHS, Counter RHS, bool Simplify = true) {
+ assert(!llvm::EnableSingleByteCoverage &&
+ "cannot add counters when single byte coverage mode is enabled");
return Builder.add(LHS, RHS, Simplify);
}
Counter addCounters(Counter C1, Counter C2, Counter C3,
bool Simplify = true) {
+ assert(!llvm::EnableSingleByteCoverage &&
+ "cannot add counters when single byte coverage mode is enabled");
return addCounters(addCounters(C1, C2, Simplify), C3, Simplify);
}
@@ -1443,8 +1457,9 @@ struct CounterCoverageMappingBuilder
void VisitBreakStmt(const BreakStmt *S) {
assert(!BreakContinueStack.empty() && "break not in a loop or switch!");
- BreakContinueStack.back().BreakCount = addCounters(
- BreakContinueStack.back().BreakCount, getRegion().getCounter());
+ if (!llvm::EnableSingleByteCoverage)
+ BreakContinueStack.back().BreakCount = addCounters(
+ BreakContinueStack.back().BreakCount, getRegion().getCounter());
// FIXME: a break in a switch should terminate regions for all preceding
// case statements, not just the most recent one.
terminateRegion(S);
@@ -1452,8 +1467,9 @@ struct CounterCoverageMappingBuilder
void VisitContinueStmt(const ContinueStmt *S) {
assert(!BreakContinueStack.empty() && "continue stmt not in a loop!");
- BreakContinueStack.back().ContinueCount = addCounters(
- BreakContinueStack.back().ContinueCount, getRegion().getCounter());
+ if (!llvm::EnableSingleByteCoverage)
+ BreakContinueStack.back().ContinueCount = addCounters(
+ BreakContinueStack.back().ContinueCount, getRegion().getCounter());
terminateRegion(S);
}
@@ -1471,7 +1487,9 @@ struct CounterCoverageMappingBuilder
extendRegion(S);
Counter ParentCount = getRegion().getCounter();
- Counter BodyCount = getRegionCounter(S);
+ Counter BodyCount = llvm::EnableSingleByteCoverage
+ ? getRegionCounter(S->getBody())
+ : getRegionCounter(S);
// Handle the body first so that we can get the backedge count.
BreakContinueStack.push_back(BreakContinue());
@@ -1484,7 +1502,9 @@ struct CounterCoverageMappingBuilder
// Go back to handle the condition.
Counter CondCount =
- addCounters(ParentCount, BackedgeCount, BC.ContinueCount);
+ llvm::EnableSingleByteCoverage
+ ? getRegionCounter(S->getCond())
+ : addCounters(ParentCount, BackedgeCount, BC.ContinueCount);
propagateCounts(CondCount, S->getCond());
adjustForOutOfOrderTraversal(getEnd(S));
@@ -1494,7 +1514,11 @@ struct CounterCoverageMappingBuilder
fillGapAreaWithCount(Gap->getBegin(), Gap->getEnd(), BodyCount);
Counter OutCount =
- addCounters(BC.BreakCount, subtractCounters(CondCount, BodyCount));
+ llvm::EnableSingleByteCoverage
+ ? getRegionCounter(S)
+ : addCounters(BC.BreakCount,
+ subtractCounters(CondCount, BodyCount));
+
if (OutCount != ParentCount) {
pushRegion(OutCount);
GapRegionCounter = OutCount;
@@ -1503,38 +1527,53 @@ struct CounterCoverageMappingBuilder
}
// Create Branch Region around condition.
- createBranchRegion(S->getCond(), BodyCount,
- subtractCounters(CondCount, BodyCount));
+ if (!llvm::EnableSingleByteCoverage)
+ createBranchRegion(S->getCond(), BodyCount,
+ subtractCounters(CondCount, BodyCount));
}
void VisitDoStmt(const DoStmt *S) {
extendRegion(S);
Counter ParentCount = getRegion().getCounter();
- Counter BodyCount = getRegionCounter(S);
+ Counter BodyCount = llvm::EnableSingleByteCoverage
+ ? getRegionCounter(S->getBody())
+ : getRegionCounter(S);
BreakContinueStack.push_back(BreakContinue());
extendRegion(S->getBody());
- Counter BackedgeCount =
- propagateCounts(addCounters(ParentCount, BodyCount), S->getBody());
+
+ Counter BackedgeCount;
+ if (llvm::EnableSingleByteCoverage)
+ propagateCounts(BodyCount, S->getBody());
+ else
+ BackedgeCount =
+ propagateCounts(addCounters(ParentCount, BodyCount), S->getBody());
+
BreakContinue BC = BreakContinueStack.pop_back_val();
bool BodyHasTerminateStmt = HasTerminateStmt;
HasTerminateStmt = false;
- Counter CondCount = addCounters(BackedgeCount, BC.ContinueCount);
+ Counter CondCount = llvm::EnableSingleByteCoverage
+ ? getRegionCounter(S->getCond())
+ : addCounters(BackedgeCount, BC.ContinueCount);
propagateCounts(CondCount, S->getCond());
Counter OutCount =
- addCounters(BC.BreakCount, subtractCounters(CondCount, BodyCount));
+ llvm::EnableSingleByteCoverage
+ ? getRegionCounter(S)
+ : addCounters(BC.BreakCount,
+ subtractCounters(CondCount, BodyCount));
if (OutCount != ParentCount) {
pushRegion(OutCount);
GapRegionCounter = OutCount;
}
// Create Branch Region around condition.
- createBranchRegion(S->getCond(), BodyCount,
- subtractCounters(CondCount, BodyCount));
+ if (!llvm::EnableSingleByteCoverage)
+ createBranchRegion(S->getCond(), BodyCount,
+ subtractCounters(CondCount, BodyCount));
if (BodyHasTerminateStmt)
HasTerminateStmt = true;
@@ -1546,7 +1585,9 @@ struct CounterCoverageMappingBuilder
Visit(S->getInit());
Counter ParentCount = getRegion().getCounter();
- Counter BodyCount = getRegionCounter(S);
+ Counter BodyCount = llvm::EnableSingleByteCoverage
+ ? getRegionCounter(S->getBody())
+ : getRegionCounter(S);
// The loop increment may contain a break or continue.
if (S->getInc())
@@ -1565,14 +1606,23 @@ struct CounterCoverageMappingBuilder
// the count for all the continue statements.
BreakContinue IncrementBC;
if (const Stmt *Inc = S->getInc()) {
- propagateCounts(addCounters(BackedgeCount, BodyBC.ContinueCount), Inc);
+ Counter IncCount;
+ if (llvm::EnableSingleByteCoverage)
+ IncCount = getRegionCounter(S->getInc());
+ else
+ IncCount = addCounters(BackedgeCount, BodyBC.ContinueCount);
+ propagateCounts(IncCount, Inc);
IncrementBC = BreakContinueStack.pop_back_val();
}
// Go back to handle the condition.
- Counter CondCount = addCounters(
- addCounters(ParentCount, BackedgeCount, BodyBC.ContinueCount),
- IncrementBC.ContinueCount);
+ Counter CondCount =
+ llvm::EnableSingleByteCoverage
+ ? getRegionCounter(S->getCond())
+ : addCounters(
+ addCounters(ParentCount, BackedgeCount, BodyBC.ContinueCount),
+ IncrementBC.ContinueCount);
+
if (const Expr *Cond = S->getCond()) {
propagateCounts(CondCount, Cond);
adjustForOutOfOrderTraversal(getEnd(S));
@@ -1583,8 +1633,11 @@ struct CounterCoverageMappingBuilder
if (Gap)
fillGapAreaWithCount(Gap->getBegin(), Gap->getEnd(), BodyCount);
- Counter OutCount = addCounters(BodyBC.BreakCount, IncrementBC.BreakCount,
- subtractCounters(CondCount, BodyCount));
+ Counter OutCount =
+ llvm::EnableSingleByteCoverage
+ ? getRegionCounter(S)
+ : addCounters(BodyBC.BreakCount, IncrementBC.BreakCount,
+ subtractCounters(CondCount, BodyCount));
if (OutCount != ParentCount) {
pushRegion(OutCount);
GapRegionCounter = OutCount;
@@ -1593,8 +1646,9 @@ struct CounterCoverageMappingBuilder
}
// Create Branch Region around condition.
- createBranchRegion(S->getCond(), BodyCount,
- subtractCounters(CondCount, BodyCount));
+ if (!llvm::EnableSingleByteCoverage)
+ createBranchRegion(S->getCond(), BodyCount,
+ subtractCounters(CondCount, BodyCount));
}
void VisitCXXForRangeStmt(const CXXForRangeStmt *S) {
@@ -1605,7 +1659,9 @@ struct CounterCoverageMappingBuilder
Visit(S->getRangeStmt());
Counter ParentCount = getRegion().getCounter();
- Counter BodyCount = getRegionCounter(S);
+ Counter BodyCount = llvm::EnableSingleByteCoverage
+ ? getRegionCounter(S->getBody())
+ : getRegionCounter(S);
BreakContinueStack.push_back(BreakContinue());
extendRegion(S->getBody());
@@ -1620,10 +1676,15 @@ struct CounterCoverageMappingBuilder
if (Gap)
fillGapAreaWithCount(Gap->getBegin(), Gap->getEnd(), BodyCount);
- Counter LoopCount =
- addCounters(ParentCount, BackedgeCount, BC.ContinueCount);
- Counter OutCount =
- addCounters(BC.BreakCount, subtractCounters(LoopCount, BodyCount));
+ Counter OutCount;
+ Counter LoopCount;
+ if (llvm::EnableSingleByteCoverage)
+ OutCount = getRegionCounter(S);
+ else {
+ LoopCount = addCounters(ParentCount, BackedgeCount, BC.ContinueCount);
+ OutCount =
+ addCounters(BC.BreakCount, subtractCounters(LoopCount, BodyCount));
+ }
if (OutCount != ParentCount) {
pushRegion(OutCount);
GapRegionCounter = OutCount;
@@ -1632,8 +1693,9 @@ struct CounterCoverageMappingBuilder
}
// Create Branch Region around condition.
- createBranchRegion(S->getCond(), BodyCount,
- subtractCounters(LoopCount, BodyCount));
+ if (!llvm::EnableSingleByteCoverage)
+ createBranchRegion(S->getCond(), BodyCount,
+ subtractCounters(LoopCount, BodyCount));
}
void VisitObjCForCollectionStmt(const ObjCForCollectionStmt *S) {
@@ -1694,7 +1756,7 @@ struct CounterCoverageMappingBuilder
propagateCounts(Counter::getZero(), Body);
BreakContinue BC = BreakContinueStack.pop_back_val();
- if (!BreakContinueStack.empty())
+ if (!BreakContinueStack.empty() && !llvm::EnableSingleByteCoverage)
BreakContinueStack.back().ContinueCount = addCounters(
BreakContinueStack.back().ContinueCount, BC.ContinueCount);
@@ -1709,6 +1771,11 @@ struct CounterCoverageMappingBuilder
MostRecentLocation = getStart(S);
handleFileExit(ExitLoc);
+ // When single byte coverage mode is enabled, do not create branch region by
+ // early returning.
+ if (llvm::EnableSingleByteCoverage)
+ return;
+
// Create a Branch Region around each Case. Subtract the case's
// counter from the Parent counter to track the "False" branch count.
Counter CaseCountSum;
@@ -1741,8 +1808,10 @@ struct CounterCoverageMappingBuilder
extendRegion(S);
SourceMappingRegion &Parent = getRegion();
+ Counter Count = llvm::EnableSingleByteCoverage
+ ? getRegionCounter(S)
+ : addCounters(Parent.getCounter(), getRegionCounter(S));
- Counter Count = addCounters(Parent.getCounter(), getRegionCounter(S));
// Reuse the existing region if it starts at our label. This is typical of
// the first case in a switch.
if (Parent.hasStartLoc() && Parent.getBeginLoc() == getStart(S))
@@ -1860,7 +1929,9 @@ struct CounterCoverageMappingBuilder
extendRegion(S->getCond());
Counter ParentCount = getRegion().getCounter();
- Counter ThenCount = getRegionCounter(S);
+ Counter ThenCount = llvm::EnableSingleByteCoverage
+ ? getRegionCounter(S->getThen())
+ : getRegionCounter(S);
// Emitting a counter for the condition makes it easier to interpret the
// counter for the body when looking at the coverage.
@@ -1874,7 +1945,12 @@ struct CounterCoverageMappingBuilder
extendRegion(S->getThen());
Counter OutCount = propagateCounts(ThenCount, S->getThen());
- Counter ElseCount = subtractCounters(ParentCount, ThenCount);
+
+ Counter ElseCount;
+ if (!llvm::EnableSingleByteCoverage)
+ ElseCount = subtractCounters(ParentCount, ThenCount);
+ else if (S->getElse())
+ ElseCount = getRegionCounter(S->getElse());
if (const Stmt *Else = S->getElse()) {
bool ThenHasTerminateStmt = HasTerminateStmt;
@@ -1885,21 +1961,28 @@ struct CounterCoverageMappingBuilder
if (Gap)
fillGapAreaWithCount(Gap->getBegin(), Gap->getEnd(), ElseCount);
extendRegion(Else);
- OutCount = addCounters(OutCount, propagateCounts(ElseCount, Else));
+
+ Counter ElseOutCount = propagateCounts(ElseCount, Else);
+ if (!llvm::EnableSingleByteCoverage)
+ OutCount = addCounters(OutCount, ElseOutCount);
if (ThenHasTerminateStmt)
HasTerminateStmt = true;
- } else
+ } else if (!llvm::EnableSingleByteCoverage)
OutCount = addCounters(OutCount, ElseCount);
+ if (llvm::EnableSingleByteCoverage)
+ OutCount = getRegionCounter(S);
+
if (OutCount != ParentCount) {
pushRegion(OutCount);
GapRegionCounter = OutCount;
}
- // Create Branch Region around condition.
- createBranchRegion(S->getCond(), ThenCount,
- subtractCounters(ParentCount, ThenCount));
+ if (!S->isConsteval() && !llvm::EnableSingleByteCoverage)
+ // Create Branch Region around condition.
+ createBranchRegion(S->getCond(), ThenCount,
+ subtractCounters(ParentCount, ThenCount));
}
void VisitCXXTryStmt(const CXXTryStmt *S) {
@@ -1925,7 +2008,9 @@ struct CounterCoverageMappingBuilder
extendRegion(E);
Counter ParentCount = getRegion().getCounter();
- Counter TrueCount = getRegionCounter(E);
+ Counter TrueCount = llvm::EnableSingleByteCoverage
+ ? getRegionCounter(E->getTrueExpr())
+ : getRegionCounter(E);
propagateCounts(ParentCount, E->getCond());
Counter OutCount;
@@ -1944,9 +2029,15 @@ struct CounterCoverageMappingBuilder
}
extendRegion(E->getFalseExpr());
- OutCount = addCounters(
- OutCount, propagateCounts(subtractCounters(ParentCount, TrueCount),
- E->getFalseExpr()));
+ Counter FalseCount = llvm::EnableSingleByteCoverage
+ ? getRegionCounter(E->getFalseExpr())
+ : subtractCounters(ParentCount, TrueCount);
+
+ Counter FalseOutCount = propagateCounts(FalseCount, E->getFalseExpr());
+ if (llvm::EnableSingleByteCoverage)
+ OutCount = getRegionCounter(E);
+ else
+ OutCount = addCounters(OutCount, FalseOutCount);
if (OutCount != ParentCount) {
pushRegion(OutCount);
@@ -1954,8 +2045,9 @@ struct CounterCoverageMappingBuilder
}
// Create Branch Region around condition.
- createBranchRegion(E->getCond(), TrueCount,
- subtractCounters(ParentCount, TrueCount));
+ if (!llvm::EnableSingleByteCoverage)
+ createBranchRegion(E->getCond(), TrueCount,
+ subtractCounters(ParentCount, TrueCount));
}
void createDecision(const BinaryOperator *E) {
@@ -2002,12 +2094,14 @@ struct CounterCoverageMappingBuilder
Counter ParentCnt = getRegion().getCounter();
// Create Branch Region around LHS condition.
- createBranchRegion(E->getLHS(), RHSExecCnt,
- subtractCounters(ParentCnt, RHSExecCnt), DecisionLHS);
+ if (!llvm::EnableSingleByteCoverage)
+ createBranchRegion(E->getLHS(), RHSExecCnt,
+ subtractCounters(ParentCnt, RHSExecCnt), DecisionLHS);
// Create Branch Region around RHS condition.
- createBranchRegion(E->getRHS(), RHSTrueCnt,
- subtractCounters(RHSExecCnt, RHSTrueCnt), DecisionRHS);
+ if (!llvm::EnableSingleByteCoverage)
+ createBranchRegion(E->getRHS(), RHSTrueCnt,
+ subtractCounters(RHSExecCnt, RHSTrueCnt), DecisionRHS);
// Create MCDC Decision Region if at top-level (root).
if (IsRootNode)
@@ -2058,12 +2152,14 @@ struct CounterCoverageMappingBuilder
Counter ParentCnt = getRegion().getCounter();
// Create Branch Region around LHS condition.
- createBranchRegion(E->getLHS(), subtractCounters(ParentCnt, RHSExecCnt),
- RHSExecCnt, DecisionLHS);
+ if (!llvm::EnableSingleByteCoverage)
+ createBranchRegion(E->getLHS(), subtractCounters(ParentCnt, RHSExecCnt),
+ RHSExecCnt, DecisionLHS);
// Create Branch Region around RHS condition.
- createBranchRegion(E->getRHS(), subtractCounters(RHSExecCnt, RHSFalseCnt),
- RHSFalseCnt, DecisionRHS);
+ if (!llvm::EnableSingleByteCoverage)
+ createBranchRegion(E->getRHS(), subtractCounters(RHSExecCnt, RHSFalseCnt),
+ RHSFalseCnt, DecisionRHS);
// Create MCDC Decision Region if at top-level (root).
if (IsRootNode)
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index 94f8e7be2ee6..adfdd5163519 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -9,6 +9,7 @@
#include "ABIInfoImpl.h"
#include "TargetInfo.h"
#include "clang/Basic/DiagnosticFrontend.h"
+#include "llvm/TargetParser/AArch64TargetParser.h"
using namespace clang;
using namespace clang::CodeGen;
@@ -75,6 +76,12 @@ private:
bool allowBFloatArgsAndRet() const override {
return getTarget().hasBFloat16Type();
}
+
+ using ABIInfo::appendAttributeMangling;
+ void appendAttributeMangling(TargetClonesAttr *Attr, unsigned Index,
+ raw_ostream &Out) const override;
+ void appendAttributeMangling(StringRef AttrStr,
+ raw_ostream &Out) const override;
};
class AArch64SwiftABIInfo : public SwiftABIInfo {
@@ -857,6 +864,39 @@ void AArch64TargetCodeGenInfo::checkFunctionCallABI(
<< Callee->getDeclName();
}
+void AArch64ABIInfo::appendAttributeMangling(TargetClonesAttr *Attr,
+ unsigned Index,
+ raw_ostream &Out) const {
+ appendAttributeMangling(Attr->getFeatureStr(Index), Out);
+}
+
+void AArch64ABIInfo::appendAttributeMangling(StringRef AttrStr,
+ raw_ostream &Out) const {
+ if (AttrStr == "default") {
+ Out << ".default";
+ return;
+ }
+
+ Out << "._";
+ SmallVector<StringRef, 8> Features;
+ AttrStr.split(Features, "+");
+ for (auto &Feat : Features)
+ Feat = Feat.trim();
+
+ // FIXME: It was brought up in #79316 that sorting the features which are
+ // used for mangling based on their multiversion priority is not a good
+ // practice. Changing the feature priorities will break the ABI. Perhaps
+ // it would be preferable to perform a lexicographical sort instead.
+ const TargetInfo &TI = CGT.getTarget();
+ llvm::sort(Features, [&TI](const StringRef LHS, const StringRef RHS) {
+ return TI.multiVersionSortPriority(LHS) < TI.multiVersionSortPriority(RHS);
+ });
+
+ for (auto &Feat : Features)
+ if (auto Ext = llvm::AArch64::parseArchExtension(Feat))
+ Out << 'M' << Ext->Name;
+}
+
std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAArch64TargetCodeGenInfo(CodeGenModule &CGM,
AArch64ABIKind Kind) {
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index bd854aae35d4..08b1fd01b3c0 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -681,19 +681,29 @@ std::string ToolChain::getCompilerRT(const ArgList &Args, StringRef Component,
// Check for runtime files in the new layout without the architecture first.
std::string CRTBasename =
buildCompilerRTBasename(Args, Component, Type, /*AddArch=*/false);
+ SmallString<128> Path;
for (const auto &LibPath : getLibraryPaths()) {
SmallString<128> P(LibPath);
llvm::sys::path::append(P, CRTBasename);
if (getVFS().exists(P))
return std::string(P);
+ if (Path.empty())
+ Path = P;
}
+ if (getTriple().isOSAIX())
+ Path.clear();
- // Fall back to the old expected compiler-rt name if the new one does not
- // exist.
+ // Check the filename for the old layout if the new one does not exist.
CRTBasename =
buildCompilerRTBasename(Args, Component, Type, /*AddArch=*/true);
- SmallString<128> Path(getCompilerRTPath());
- llvm::sys::path::append(Path, CRTBasename);
+ SmallString<128> OldPath(getCompilerRTPath());
+ llvm::sys::path::append(OldPath, CRTBasename);
+ if (Path.empty() || getVFS().exists(OldPath))
+ return std::string(OldPath);
+
+ // If none is found, use a file name from the new layout, which may get
+ // printed in an error message, aiding users in knowing what Clang is
+ // looking for.
return std::string(Path);
}
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 6e1b7e8657d0..66c3a237c121 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -5958,7 +5958,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
if (Arg *A = Args.getLastArg(options::OPT_fbasic_block_address_map,
options::OPT_fno_basic_block_address_map)) {
- if (Triple.isX86() && Triple.isOSBinFormatELF()) {
+ if ((Triple.isX86() || Triple.isAArch64()) && Triple.isOSBinFormatELF()) {
if (A->getOption().matches(options::OPT_fbasic_block_address_map))
A->render(Args, CmdArgs);
} else {
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 347b250260c4..faceee85a2f8 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1316,13 +1316,16 @@ void tools::addFortranRuntimeLibs(const ToolChain &TC, const ArgList &Args,
// add the correct libraries to link against as dependents in the object
// file.
if (!TC.getTriple().isKnownWindowsMSVCEnvironment()) {
- StringRef f128LibName = TC.getDriver().getFlangF128MathLibrary();
- f128LibName.consume_front_insensitive("lib");
- if (!f128LibName.empty()) {
+ StringRef F128LibName = TC.getDriver().getFlangF128MathLibrary();
+ F128LibName.consume_front_insensitive("lib");
+ if (!F128LibName.empty()) {
+ bool AsNeeded = !TC.getTriple().isOSAIX();
CmdArgs.push_back("-lFortranFloat128Math");
- addAsNeededOption(TC, Args, CmdArgs, /*as_needed=*/true);
- CmdArgs.push_back(Args.MakeArgString("-l" + f128LibName));
- addAsNeededOption(TC, Args, CmdArgs, /*as_needed=*/false);
+ if (AsNeeded)
+ addAsNeededOption(TC, Args, CmdArgs, /*as_needed=*/true);
+ CmdArgs.push_back(Args.MakeArgString("-l" + F128LibName));
+ if (AsNeeded)
+ addAsNeededOption(TC, Args, CmdArgs, /*as_needed=*/false);
}
CmdArgs.push_back("-lFortranRuntime");
CmdArgs.push_back("-lFortranDecimal");
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index 794e326fb1c9..e64ba7eebc1c 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -917,6 +917,8 @@ template <> struct MappingTraits<FormatStyle> {
Style.AlignConsecutiveShortCaseStatements);
IO.mapOptional("AlignConsecutiveTableGenCondOperatorColons",
Style.AlignConsecutiveTableGenCondOperatorColons);
+ IO.mapOptional("AlignConsecutiveTableGenDefinitionColons",
+ Style.AlignConsecutiveTableGenDefinitionColons);
IO.mapOptional("AlignEscapedNewlines", Style.AlignEscapedNewlines);
IO.mapOptional("AlignOperands", Style.AlignOperands);
IO.mapOptional("AlignTrailingComments", Style.AlignTrailingComments);
@@ -1423,6 +1425,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) {
LLVMStyle.AlignConsecutiveMacros = {};
LLVMStyle.AlignConsecutiveShortCaseStatements = {};
LLVMStyle.AlignConsecutiveTableGenCondOperatorColons = {};
+ LLVMStyle.AlignConsecutiveTableGenDefinitionColons = {};
LLVMStyle.AlignEscapedNewlines = FormatStyle::ENAS_Right;
LLVMStyle.AlignOperands = FormatStyle::OAS_Align;
LLVMStyle.AlignTrailingComments = {};
@@ -3923,7 +3926,7 @@ FormatStyle::LanguageKind guessLanguage(StringRef FileName, StringRef Code) {
auto Extension = llvm::sys::path::extension(FileName);
// If there's no file extension (or it's .h), we need to check the contents
// of the code to see if it contains Objective-C.
- if (Extension.empty() || Extension == ".h") {
+ if (!Code.empty() && (Extension.empty() || Extension == ".h")) {
auto NonEmptyFileName = FileName.empty() ? "guess.h" : FileName;
Environment Env(Code, NonEmptyFileName, /*Ranges=*/{});
ObjCHeaderStyleGuesser Guesser(Env, getLLVMStyle());
diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp
index dd9d5847a10d..753be25bfd67 100644
--- a/clang/lib/Format/WhitespaceManager.cpp
+++ b/clang/lib/Format/WhitespaceManager.cpp
@@ -111,8 +111,10 @@ const tooling::Replacements &WhitespaceManager::generateReplacements() {
alignConsecutiveDeclarations();
alignConsecutiveBitFields();
alignConsecutiveAssignments();
- if (Style.isTableGen())
+ if (Style.isTableGen()) {
alignConsecutiveTableGenCondOperatorColons();
+ alignConsecutiveTableGenDefinitions();
+ }
alignChainedConditionals();
alignTrailingComments();
alignEscapedNewlines();
@@ -984,6 +986,11 @@ void WhitespaceManager::alignConsecutiveTableGenCondOperatorColons() {
TT_TableGenCondOperatorColon);
}
+void WhitespaceManager::alignConsecutiveTableGenDefinitions() {
+ alignConsecutiveColons(Style.AlignConsecutiveTableGenDefinitionColons,
+ TT_InheritanceColon);
+}
+
void WhitespaceManager::alignConsecutiveDeclarations() {
if (!Style.AlignConsecutiveDeclarations.Enabled)
return;
diff --git a/clang/lib/Format/WhitespaceManager.h b/clang/lib/Format/WhitespaceManager.h
index c604cdb6f185..9942e0f35738 100644
--- a/clang/lib/Format/WhitespaceManager.h
+++ b/clang/lib/Format/WhitespaceManager.h
@@ -243,6 +243,9 @@ private:
/// Align consecutive TableGen cond operator colon over all \c Changes.
void alignConsecutiveTableGenCondOperatorColons();
+ /// Align consecutive TableGen definitions over all \c Changes.
+ void alignConsecutiveTableGenDefinitions();
+
/// Align trailing comments over all \c Changes.
void alignTrailingComments();
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 96e3ebdecbdf..1d451b5f5b25 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -2099,9 +2099,11 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
}
/// Adds, with saturation, the corresponding elements of two 128-bit
-/// signed [16 x i8] vectors, saving each sum in the corresponding element of
-/// a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
-/// saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
+/// signed [16 x i8] vectors, saving each sum in the corresponding element
+/// of a 128-bit result vector of [16 x i8].
+///
+/// Positive sums greater than 0x7F are saturated to 0x7F. Negative sums
+/// less than 0x80 are saturated to 0x80.
///
/// \headerfile <x86intrin.h>
///
@@ -2119,10 +2121,11 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
}
/// Adds, with saturation, the corresponding elements of two 128-bit
-/// signed [8 x i16] vectors, saving each sum in the corresponding element of
-/// a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
-/// are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
-/// 0x8000.
+/// signed [8 x i16] vectors, saving each sum in the corresponding element
+/// of a 128-bit result vector of [8 x i16].
+///
+/// Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
+/// less than 0x8000 are saturated to 0x8000.
///
/// \headerfile <x86intrin.h>
///
@@ -2141,8 +2144,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
/// Adds, with saturation, the corresponding elements of two 128-bit
/// unsigned [16 x i8] vectors, saving each sum in the corresponding element
-/// of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF
-/// are saturated to 0xFF. Negative sums are saturated to 0x00.
+/// of a 128-bit result vector of [16 x i8].
+///
+/// Positive sums greater than 0xFF are saturated to 0xFF. Negative sums are
+/// saturated to 0x00.
///
/// \headerfile <x86intrin.h>
///
@@ -2161,8 +2166,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
/// Adds, with saturation, the corresponding elements of two 128-bit
/// unsigned [8 x i16] vectors, saving each sum in the corresponding element
-/// of a 128-bit result vector of [8 x i16]. Positive sums greater than
-/// 0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000.
+/// of a 128-bit result vector of [8 x i16].
+///
+/// Positive sums greater than 0xFFFF are saturated to 0xFFFF. Negative sums
+/// are saturated to 0x0000.
///
/// \headerfile <x86intrin.h>
///
@@ -2518,10 +2525,12 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
return (__m128i)((__v2du)__a - (__v2du)__b);
}
-/// Subtracts corresponding 8-bit signed integer values in the input and
-/// returns the differences in the corresponding bytes in the destination.
-/// Differences greater than 0x7F are saturated to 0x7F, and differences less
-/// than 0x80 are saturated to 0x80.
+/// Subtracts, with saturation, corresponding 8-bit signed integer values in
+/// the input and returns the differences in the corresponding bytes in the
+/// destination.
+///
+/// Differences greater than 0x7F are saturated to 0x7F, and differences
+/// less than 0x80 are saturated to 0x80.
///
/// \headerfile <x86intrin.h>
///
@@ -2538,8 +2547,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
}
-/// Subtracts corresponding 16-bit signed integer values in the input and
-/// returns the differences in the corresponding bytes in the destination.
+/// Subtracts, with saturation, corresponding 16-bit signed integer values in
+/// the input and returns the differences in the corresponding bytes in the
+/// destination.
+///
/// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
/// than 0x8000 are saturated to 0x8000.
///
@@ -2558,9 +2569,11 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
}
-/// Subtracts corresponding 8-bit unsigned integer values in the input
-/// and returns the differences in the corresponding bytes in the
-/// destination. Differences less than 0x00 are saturated to 0x00.
+/// Subtracts, with saturation, corresponding 8-bit unsigned integer values in
+/// the input and returns the differences in the corresponding bytes in the
+/// destination.
+///
+/// Differences less than 0x00 are saturated to 0x00.
///
/// \headerfile <x86intrin.h>
///
@@ -2577,9 +2590,11 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
}
-/// Subtracts corresponding 16-bit unsigned integer values in the input
-/// and returns the differences in the corresponding bytes in the
-/// destination. Differences less than 0x0000 are saturated to 0x0000.
+/// Subtracts, with saturation, corresponding 16-bit unsigned integer values in
+/// the input and returns the differences in the corresponding bytes in the
+/// destination.
+///
+/// Differences less than 0x0000 are saturated to 0x0000.
///
/// \headerfile <x86intrin.h>
///
@@ -4050,26 +4065,22 @@ void _mm_mfence(void);
} // extern "C"
#endif
-/// Converts 16-bit signed integers from both 128-bit integer vector
-/// operands into 8-bit signed integers, and packs the results into the
-/// destination. Positive values greater than 0x7F are saturated to 0x7F.
-/// Negative values less than 0x80 are saturated to 0x80.
+/// Converts, with saturation, 16-bit signed integers from both 128-bit integer
+/// vector operands into 8-bit signed integers, and packs the results into
+/// the destination.
+///
+/// Positive values greater than 0x7F are saturated to 0x7F. Negative values
+/// less than 0x80 are saturated to 0x80.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
///
/// \param __a
-/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
-/// a signed integer and is converted to a 8-bit signed integer with
-/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
-/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
+/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
/// written to the lower 64 bits of the result.
/// \param __b
-/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
-/// a signed integer and is converted to a 8-bit signed integer with
-/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
-/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
+/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
/// written to the higher 64 bits of the result.
/// \returns A 128-bit vector of [16 x i8] containing the converted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
@@ -4077,26 +4088,22 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
}
-/// Converts 32-bit signed integers from both 128-bit integer vector
-/// operands into 16-bit signed integers, and packs the results into the
-/// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
-/// Negative values less than 0x8000 are saturated to 0x8000.
+/// Converts, with saturation, 32-bit signed integers from both 128-bit integer
+/// vector operands into 16-bit signed integers, and packs the results into
+/// the destination.
+///
+/// Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative
+/// values less than 0x8000 are saturated to 0x8000.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
///
/// \param __a
-/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
-/// a signed integer and is converted to a 16-bit signed integer with
-/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
-/// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
+/// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
/// are written to the lower 64 bits of the result.
/// \param __b
-/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
-/// a signed integer and is converted to a 16-bit signed integer with
-/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
-/// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
+/// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
/// are written to the higher 64 bits of the result.
/// \returns A 128-bit vector of [8 x i16] containing the converted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
@@ -4104,26 +4111,22 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
}
-/// Converts 16-bit signed integers from both 128-bit integer vector
-/// operands into 8-bit unsigned integers, and packs the results into the
-/// destination. Values greater than 0xFF are saturated to 0xFF. Values less
-/// than 0x00 are saturated to 0x00.
+/// Converts, with saturation, 16-bit signed integers from both 128-bit integer
+/// vector operands into 8-bit unsigned integers, and packs the results into
+/// the destination.
+///
+/// Values greater than 0xFF are saturated to 0xFF. Values less than 0x00
+/// are saturated to 0x00.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
///
/// \param __a
-/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
-/// a signed integer and is converted to an 8-bit unsigned integer with
-/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
-/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
+/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
/// written to the lower 64 bits of the result.
/// \param __b
-/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
-/// a signed integer and is converted to an 8-bit unsigned integer with
-/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
-/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
+/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
/// written to the higher 64 bits of the result.
/// \returns A 128-bit vector of [16 x i8] containing the converted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
diff --git a/clang/lib/Headers/fmaintrin.h b/clang/lib/Headers/fmaintrin.h
index ea832fac4f99..22d1a780bbfd 100644
--- a/clang/lib/Headers/fmaintrin.h
+++ b/clang/lib/Headers/fmaintrin.h
@@ -60,7 +60,8 @@ _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
/// Computes a scalar multiply-add of the single-precision values in the
/// low 32 bits of 128-bit vectors of [4 x float].
-/// \code
+///
+/// \code{.operation}
/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
/// result[127:32] = __A[127:32]
/// \endcode
@@ -88,7 +89,8 @@ _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
/// Computes a scalar multiply-add of the double-precision values in the
/// low 64 bits of 128-bit vectors of [2 x double].
-/// \code
+///
+/// \code{.operation}
/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
/// result[127:64] = __A[127:64]
/// \endcode
@@ -156,7 +158,8 @@ _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
/// Computes a scalar multiply-subtract of the single-precision values in
/// the low 32 bits of 128-bit vectors of [4 x float].
-/// \code
+///
+/// \code{.operation}
/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
/// result[127:32] = __A[127:32]
/// \endcode
@@ -184,7 +187,8 @@ _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
/// Computes a scalar multiply-subtract of the double-precision values in
/// the low 64 bits of 128-bit vectors of [2 x double].
-/// \code
+///
+/// \code{.operation}
/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
/// result[127:64] = __A[127:64]
/// \endcode
@@ -252,7 +256,8 @@ _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
/// Computes a scalar negated multiply-add of the single-precision values in
/// the low 32 bits of 128-bit vectors of [4 x float].
-/// \code
+///
+/// \code{.operation}
/// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
/// result[127:32] = __A[127:32]
/// \endcode
@@ -280,7 +285,8 @@ _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
/// Computes a scalar negated multiply-add of the double-precision values
/// in the low 64 bits of 128-bit vectors of [2 x double].
-/// \code
+///
+/// \code{.operation}
/// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
/// result[127:64] = __A[127:64]
/// \endcode
@@ -348,7 +354,8 @@ _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
/// Computes a scalar negated multiply-subtract of the single-precision
/// values in the low 32 bits of 128-bit vectors of [4 x float].
-/// \code
+///
+/// \code{.operation}
/// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
/// result[127:32] = __A[127:32]
/// \endcode
@@ -376,7 +383,8 @@ _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
/// Computes a scalar negated multiply-subtract of the double-precision
/// values in the low 64 bits of 128-bit vectors of [2 x double].
-/// \code
+///
+/// \code{.operation}
/// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
/// result[127:64] = __A[127:64]
/// \endcode
@@ -404,7 +412,8 @@ _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
/// Computes a multiply with alternating add/subtract of 128-bit vectors of
/// [4 x float].
-/// \code
+///
+/// \code{.operation}
/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
/// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
/// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
@@ -430,7 +439,8 @@ _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
/// Computes a multiply with alternating add/subtract of 128-bit vectors of
/// [2 x double].
-/// \code
+///
+/// \code{.operation}
/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
/// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
/// \endcode
@@ -454,7 +464,8 @@ _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
/// Computes a multiply with alternating add/subtract of 128-bit vectors of
/// [4 x float].
-/// \code
+///
+/// \code{.operation}
/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
/// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
/// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
@@ -480,7 +491,8 @@ _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
/// Computes a multiply with alternating add/subtract of 128-bit vectors of
/// [2 x double].
-/// \code
+///
+/// \code{.operation}
/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
/// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
/// \endcode
@@ -664,7 +676,8 @@ _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
/// Computes a multiply with alternating add/subtract of 256-bit vectors of
/// [8 x float].
-/// \code
+///
+/// \code{.operation}
/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
/// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
/// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
@@ -694,7 +707,8 @@ _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
/// Computes a multiply with alternating add/subtract of 256-bit vectors of
/// [4 x double].
-/// \code
+///
+/// \code{.operation}
/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
/// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
/// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
@@ -720,7 +734,8 @@ _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
/// Computes a vector multiply with alternating add/subtract of 256-bit
/// vectors of [8 x float].
-/// \code
+///
+/// \code{.operation}
/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
/// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
/// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
@@ -750,7 +765,8 @@ _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
/// Computes a vector multiply with alternating add/subtract of 256-bit
/// vectors of [4 x double].
-/// \code
+///
+/// \code{.operation}
/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
/// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
/// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h
index 08849f01071a..962d24738e7a 100644
--- a/clang/lib/Headers/mmintrin.h
+++ b/clang/lib/Headers/mmintrin.h
@@ -105,28 +105,23 @@ _mm_cvtm64_si64(__m64 __m)
return (long long)__m;
}
-/// Converts 16-bit signed integers from both 64-bit integer vector
-/// parameters of [4 x i16] into 8-bit signed integer values, and constructs
-/// a 64-bit integer vector of [8 x i8] as the result. Positive values
-/// greater than 0x7F are saturated to 0x7F. Negative values less than 0x80
-/// are saturated to 0x80.
+/// Converts, with saturation, 16-bit signed integers from both 64-bit integer
+/// vector parameters of [4 x i16] into 8-bit signed integer values, and
+/// constructs a 64-bit integer vector of [8 x i8] as the result.
+///
+/// Positive values greater than 0x7F are saturated to 0x7F. Negative values
+/// less than 0x80 are saturated to 0x80.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PACKSSWB </c> instruction.
///
/// \param __m1
-/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
-/// 16-bit signed integer and is converted to an 8-bit signed integer with
-/// saturation. Positive values greater than 0x7F are saturated to 0x7F.
-/// Negative values less than 0x80 are saturated to 0x80. The converted
-/// [4 x i8] values are written to the lower 32 bits of the result.
+/// A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
+/// written to the lower 32 bits of the result.
/// \param __m2
-/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
-/// 16-bit signed integer and is converted to an 8-bit signed integer with
-/// saturation. Positive values greater than 0x7F are saturated to 0x7F.
-/// Negative values less than 0x80 are saturated to 0x80. The converted
-/// [4 x i8] values are written to the upper 32 bits of the result.
+/// A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
+/// written to the upper 32 bits of the result.
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
/// values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
@@ -135,28 +130,23 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
}
-/// Converts 32-bit signed integers from both 64-bit integer vector
-/// parameters of [2 x i32] into 16-bit signed integer values, and constructs
-/// a 64-bit integer vector of [4 x i16] as the result. Positive values
-/// greater than 0x7FFF are saturated to 0x7FFF. Negative values less than
-/// 0x8000 are saturated to 0x8000.
+/// Converts, with saturation, 32-bit signed integers from both 64-bit integer
+/// vector parameters of [2 x i32] into 16-bit signed integer values, and
+/// constructs a 64-bit integer vector of [4 x i16] as the result.
+///
+/// Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative
+/// values less than 0x8000 are saturated to 0x8000.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PACKSSDW </c> instruction.
///
/// \param __m1
-/// A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
-/// 32-bit signed integer and is converted to a 16-bit signed integer with
-/// saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
-/// Negative values less than 0x8000 are saturated to 0x8000. The converted
-/// [2 x i16] values are written to the lower 32 bits of the result.
+/// A 64-bit integer vector of [2 x i32]. The converted [2 x i16] values are
+/// written to the lower 32 bits of the result.
/// \param __m2
-/// A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
-/// 32-bit signed integer and is converted to a 16-bit signed integer with
-/// saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
-/// Negative values less than 0x8000 are saturated to 0x8000. The converted
-/// [2 x i16] values are written to the upper 32 bits of the result.
+/// A 64-bit integer vector of [2 x i32]. The converted [2 x i16] values are
+/// written to the upper 32 bits of the result.
/// \returns A 64-bit integer vector of [4 x i16] containing the converted
/// values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
@@ -165,28 +155,23 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
}
-/// Converts 16-bit signed integers from both 64-bit integer vector
-/// parameters of [4 x i16] into 8-bit unsigned integer values, and
-/// constructs a 64-bit integer vector of [8 x i8] as the result. Values
-/// greater than 0xFF are saturated to 0xFF. Values less than 0 are saturated
-/// to 0.
+/// Converts, with saturation, 16-bit signed integers from both 64-bit integer
+/// vector parameters of [4 x i16] into 8-bit unsigned integer values, and
+/// constructs a 64-bit integer vector of [8 x i8] as the result.
+///
+/// Values greater than 0xFF are saturated to 0xFF. Values less than 0 are
+/// saturated to 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PACKUSWB </c> instruction.
///
/// \param __m1
-/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
-/// 16-bit signed integer and is converted to an 8-bit unsigned integer with
-/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
-/// than 0 are saturated to 0. The converted [4 x i8] values are written to
-/// the lower 32 bits of the result.
+/// A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
+/// written to the lower 32 bits of the result.
/// \param __m2
-/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
-/// 16-bit signed integer and is converted to an 8-bit unsigned integer with
-/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
-/// than 0 are saturated to 0. The converted [4 x i8] values are written to
-/// the upper 32 bits of the result.
+/// A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
+/// written to the upper 32 bits of the result.
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
/// values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
@@ -400,11 +385,13 @@ _mm_add_pi32(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
}
-/// Adds each 8-bit signed integer element of the first 64-bit integer
-/// vector of [8 x i8] to the corresponding 8-bit signed integer element of
-/// the second 64-bit integer vector of [8 x i8]. Positive sums greater than
-/// 0x7F are saturated to 0x7F. Negative sums less than 0x80 are saturated to
-/// 0x80. The results are packed into a 64-bit integer vector of [8 x i8].
+/// Adds, with saturation, each 8-bit signed integer element of the first
+/// 64-bit integer vector of [8 x i8] to the corresponding 8-bit signed
+/// integer element of the second 64-bit integer vector of [8 x i8].
+///
+/// Positive sums greater than 0x7F are saturated to 0x7F. Negative sums
+/// less than 0x80 are saturated to 0x80. The results are packed into a
+/// 64-bit integer vector of [8 x i8].
///
/// \headerfile <x86intrin.h>
///
@@ -422,12 +409,13 @@ _mm_adds_pi8(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
}
-/// Adds each 16-bit signed integer element of the first 64-bit integer
-/// vector of [4 x i16] to the corresponding 16-bit signed integer element of
-/// the second 64-bit integer vector of [4 x i16]. Positive sums greater than
-/// 0x7FFF are saturated to 0x7FFF. Negative sums less than 0x8000 are
-/// saturated to 0x8000. The results are packed into a 64-bit integer vector
-/// of [4 x i16].
+/// Adds, with saturation, each 16-bit signed integer element of the first
+/// 64-bit integer vector of [4 x i16] to the corresponding 16-bit signed
+/// integer element of the second 64-bit integer vector of [4 x i16].
+///
+/// Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
+/// less than 0x8000 are saturated to 0x8000. The results are packed into a
+/// 64-bit integer vector of [4 x i16].
///
/// \headerfile <x86intrin.h>
///
@@ -445,11 +433,12 @@ _mm_adds_pi16(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
}
-/// Adds each 8-bit unsigned integer element of the first 64-bit integer
-/// vector of [8 x i8] to the corresponding 8-bit unsigned integer element of
-/// the second 64-bit integer vector of [8 x i8]. Sums greater than 0xFF are
-/// saturated to 0xFF. The results are packed into a 64-bit integer vector of
-/// [8 x i8].
+/// Adds, with saturation, each 8-bit unsigned integer element of the first
+/// 64-bit integer vector of [8 x i8] to the corresponding 8-bit unsigned
+/// integer element of the second 64-bit integer vector of [8 x i8].
+///
+/// Sums greater than 0xFF are saturated to 0xFF. The results are packed
+/// into a 64-bit integer vector of [8 x i8].
///
/// \headerfile <x86intrin.h>
///
@@ -467,11 +456,12 @@ _mm_adds_pu8(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
}
-/// Adds each 16-bit unsigned integer element of the first 64-bit integer
-/// vector of [4 x i16] to the corresponding 16-bit unsigned integer element
-/// of the second 64-bit integer vector of [4 x i16]. Sums greater than
-/// 0xFFFF are saturated to 0xFFFF. The results are packed into a 64-bit
-/// integer vector of [4 x i16].
+/// Adds, with saturation, each 16-bit unsigned integer element of the first
+/// 64-bit integer vector of [4 x i16] to the corresponding 16-bit unsigned
+/// integer element of the second 64-bit integer vector of [4 x i16].
+///
+/// Sums greater than 0xFFFF are saturated to 0xFFFF. The results are packed
+/// into a 64-bit integer vector of [4 x i16].
///
/// \headerfile <x86intrin.h>
///
@@ -552,12 +542,13 @@ _mm_sub_pi32(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
}
-/// Subtracts each 8-bit signed integer element of the second 64-bit
-/// integer vector of [8 x i8] from the corresponding 8-bit signed integer
-/// element of the first 64-bit integer vector of [8 x i8]. Positive results
-/// greater than 0x7F are saturated to 0x7F. Negative results less than 0x80
-/// are saturated to 0x80. The results are packed into a 64-bit integer
-/// vector of [8 x i8].
+/// Subtracts, with saturation, each 8-bit signed integer element of the second
+/// 64-bit integer vector of [8 x i8] from the corresponding 8-bit signed
+/// integer element of the first 64-bit integer vector of [8 x i8].
+///
+/// Positive results greater than 0x7F are saturated to 0x7F. Negative
+/// results less than 0x80 are saturated to 0x80. The results are packed
+/// into a 64-bit integer vector of [8 x i8].
///
/// \headerfile <x86intrin.h>
///
@@ -575,12 +566,13 @@ _mm_subs_pi8(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
}
-/// Subtracts each 16-bit signed integer element of the second 64-bit
-/// integer vector of [4 x i16] from the corresponding 16-bit signed integer
-/// element of the first 64-bit integer vector of [4 x i16]. Positive results
-/// greater than 0x7FFF are saturated to 0x7FFF. Negative results less than
-/// 0x8000 are saturated to 0x8000. The results are packed into a 64-bit
-/// integer vector of [4 x i16].
+/// Subtracts, with saturation, each 16-bit signed integer element of the
+/// second 64-bit integer vector of [4 x i16] from the corresponding 16-bit
+/// signed integer element of the first 64-bit integer vector of [4 x i16].
+///
+/// Positive results greater than 0x7FFF are saturated to 0x7FFF. Negative
+/// results less than 0x8000 are saturated to 0x8000. The results are packed
+/// into a 64-bit integer vector of [4 x i16].
///
/// \headerfile <x86intrin.h>
///
diff --git a/clang/lib/Headers/prfchwintrin.h b/clang/lib/Headers/prfchwintrin.h
index d2f91aa0123e..8a13784543c5 100644
--- a/clang/lib/Headers/prfchwintrin.h
+++ b/clang/lib/Headers/prfchwintrin.h
@@ -15,9 +15,10 @@
#define __PRFCHWINTRIN_H
/// Loads a memory sequence containing the specified memory address into
-/// all data cache levels. The cache-coherency state is set to exclusive.
-/// Data can be read from and written to the cache line without additional
-/// delay.
+/// all data cache levels.
+///
+/// The cache-coherency state is set to exclusive. Data can be read from
+/// and written to the cache line without additional delay.
///
/// \headerfile <x86intrin.h>
///
@@ -32,10 +33,11 @@ _m_prefetch(void *__P)
}
/// Loads a memory sequence containing the specified memory address into
-/// the L1 data cache and sets the cache-coherency to modified. This
-/// provides a hint to the processor that the cache line will be modified.
-/// It is intended for use when the cache line will be written to shortly
-/// after the prefetch is performed.
+/// the L1 data cache and sets the cache-coherency state to modified.
+///
+/// This provides a hint to the processor that the cache line will be
+/// modified. It is intended for use when the cache line will be written to
+/// shortly after the prefetch is performed.
///
/// Note that the effect of this intrinsic is dependent on the processor
/// implementation.
diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h
index 005d7db9c3c3..c52ffb77e33d 100644
--- a/clang/lib/Headers/smmintrin.h
+++ b/clang/lib/Headers/smmintrin.h
@@ -1431,8 +1431,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) {
}
/* SSE4 Pack with Unsigned Saturation. */
-/// Converts 32-bit signed integers from both 128-bit integer vector
-/// operands into 16-bit unsigned integers, and returns the packed result.
+/// Converts, with saturation, 32-bit signed integers from both 128-bit integer
+/// vector operands into 16-bit unsigned integers, and returns the packed
+/// result.
+///
/// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
/// 0x0000 are saturated to 0x0000.
///
@@ -1441,17 +1443,11 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) {
/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
///
/// \param __V1
-/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
-/// signed integer and is converted to a 16-bit unsigned integer with
-/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
-/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
-/// are written to the lower 64 bits of the result.
+/// A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
+/// written to the lower 64 bits of the result.
/// \param __V2
-/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
-/// signed integer and is converted to a 16-bit unsigned integer with
-/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
-/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
-/// are written to the higher 64 bits of the result.
+/// A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
+/// written to the higher 64 bits of the result.
/// \returns A 128-bit vector of [8 x i16] containing the converted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1,
__m128i __V2) {
diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
index 7d8dc46c57bf..bf8327b692d1 100644
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@@ -271,10 +271,11 @@ _mm_hadd_pi32(__m64 __a, __m64 __b)
return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
}
-/// Horizontally adds the adjacent pairs of values contained in 2 packed
-/// 128-bit vectors of [8 x i16]. Positive sums greater than 0x7FFF are
-/// saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
-/// 0x8000.
+/// Horizontally adds, with saturation, the adjacent pairs of values contained
+/// in two packed 128-bit vectors of [8 x i16].
+///
+/// Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
+/// less than 0x8000 are saturated to 0x8000.
///
/// \headerfile <x86intrin.h>
///
@@ -296,10 +297,11 @@ _mm_hadds_epi16(__m128i __a, __m128i __b)
return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
}
-/// Horizontally adds the adjacent pairs of values contained in 2 packed
-/// 64-bit vectors of [4 x i16]. Positive sums greater than 0x7FFF are
-/// saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
-/// 0x8000.
+/// Horizontally adds, with saturation, the adjacent pairs of values contained
+/// in two packed 64-bit vectors of [4 x i16].
+///
+/// Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
+/// less than 0x8000 are saturated to 0x8000.
///
/// \headerfile <x86intrin.h>
///
@@ -413,10 +415,11 @@ _mm_hsub_pi32(__m64 __a, __m64 __b)
return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
}
-/// Horizontally subtracts the adjacent pairs of values contained in 2
-/// packed 128-bit vectors of [8 x i16]. Positive differences greater than
-/// 0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
-/// saturated to 0x8000.
+/// Horizontally subtracts, with saturation, the adjacent pairs of values
+/// contained in two packed 128-bit vectors of [8 x i16].
+///
+/// Positive differences greater than 0x7FFF are saturated to 0x7FFF.
+/// Negative differences less than 0x8000 are saturated to 0x8000.
///
/// \headerfile <x86intrin.h>
///
@@ -438,10 +441,11 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b)
return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
}
-/// Horizontally subtracts the adjacent pairs of values contained in 2
-/// packed 64-bit vectors of [4 x i16]. Positive differences greater than
-/// 0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
-/// saturated to 0x8000.
+/// Horizontally subtracts, with saturation, the adjacent pairs of values
+/// contained in two packed 64-bit vectors of [4 x i16].
+///
+/// Positive differences greater than 0x7FFF are saturated to 0x7FFF.
+/// Negative differences less than 0x8000 are saturated to 0x8000.
///
/// \headerfile <x86intrin.h>
///
diff --git a/clang/lib/InstallAPI/CMakeLists.txt b/clang/lib/InstallAPI/CMakeLists.txt
index fdc4f064f29e..19fc4c3abde5 100644
--- a/clang/lib/InstallAPI/CMakeLists.txt
+++ b/clang/lib/InstallAPI/CMakeLists.txt
@@ -1,11 +1,14 @@
set(LLVM_LINK_COMPONENTS
Support
TextAPI
+ Core
)
add_clang_library(clangInstallAPI
FileList.cpp
+ Frontend.cpp
HeaderFile.cpp
+ Visitor.cpp
LINK_LIBS
clangAST
diff --git a/clang/lib/InstallAPI/Frontend.cpp b/clang/lib/InstallAPI/Frontend.cpp
new file mode 100644
index 000000000000..9f675ef7d1bd
--- /dev/null
+++ b/clang/lib/InstallAPI/Frontend.cpp
@@ -0,0 +1,58 @@
+//===- Frontend.cpp ---------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/InstallAPI/Frontend.h"
+#include "clang/AST/Availability.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+
+using namespace llvm;
+using namespace llvm::MachO;
+
+namespace clang::installapi {
+
+static StringRef getFileExtension(clang::Language Lang) {
+ switch (Lang) {
+ default:
+ llvm_unreachable("Unexpected language option.");
+ case clang::Language::C:
+ return ".c";
+ case clang::Language::CXX:
+ return ".cpp";
+ case clang::Language::ObjC:
+ return ".m";
+ case clang::Language::ObjCXX:
+ return ".mm";
+ }
+}
+
+std::unique_ptr<MemoryBuffer> createInputBuffer(const InstallAPIContext &Ctx) {
+ assert(Ctx.Type != HeaderType::Unknown &&
+ "unexpected access level for parsing");
+ SmallString<4096> Contents;
+ raw_svector_ostream OS(Contents);
+ for (const HeaderFile &H : Ctx.InputHeaders) {
+ if (H.getType() != Ctx.Type)
+ continue;
+ if (Ctx.LangMode == Language::C || Ctx.LangMode == Language::CXX)
+ OS << "#include ";
+ else
+ OS << "#import ";
+ if (H.useIncludeName())
+ OS << "<" << H.getIncludeName() << ">";
+ else
+ OS << "\"" << H.getPath() << "\"";
+ }
+ if (Contents.empty())
+ return nullptr;
+
+ return llvm::MemoryBuffer::getMemBufferCopy(
+ Contents, "installapi-includes" + getFileExtension(Ctx.LangMode));
+}
+
+} // namespace clang::installapi
diff --git a/clang/lib/InstallAPI/Visitor.cpp b/clang/lib/InstallAPI/Visitor.cpp
new file mode 100644
index 000000000000..9b333a6142ae
--- /dev/null
+++ b/clang/lib/InstallAPI/Visitor.cpp
@@ -0,0 +1,94 @@
+//===- Visitor.cpp ---------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/InstallAPI/Visitor.h"
+#include "clang/AST/Availability.h"
+#include "clang/Basic/Linkage.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Mangler.h"
+
+using namespace llvm;
+using namespace llvm::MachO;
+
+namespace clang::installapi {
+
+// Exported NamedDecl needs to have externally visible linkage and
+// default visibility from LinkageComputer.
+static bool isExported(const NamedDecl *D) {
+ auto LV = D->getLinkageAndVisibility();
+ return isExternallyVisible(LV.getLinkage()) &&
+ (LV.getVisibility() == DefaultVisibility);
+}
+
+static SymbolFlags getFlags(bool WeakDef, bool ThreadLocal) {
+ SymbolFlags Result = SymbolFlags::None;
+ if (WeakDef)
+ Result |= SymbolFlags::WeakDefined;
+ if (ThreadLocal)
+ Result |= SymbolFlags::ThreadLocalValue;
+
+ return Result;
+}
+
+void InstallAPIVisitor::HandleTranslationUnit(ASTContext &ASTCtx) {
+ if (ASTCtx.getDiagnostics().hasErrorOccurred())
+ return;
+
+ auto *D = ASTCtx.getTranslationUnitDecl();
+ TraverseDecl(D);
+}
+
+std::string InstallAPIVisitor::getMangledName(const NamedDecl *D) const {
+ SmallString<256> Name;
+ if (MC->shouldMangleDeclName(D)) {
+ raw_svector_ostream NStream(Name);
+ MC->mangleName(D, NStream);
+ } else
+ Name += D->getNameAsString();
+
+ return getBackendMangledName(Name);
+}
+
+std::string InstallAPIVisitor::getBackendMangledName(Twine Name) const {
+ SmallString<256> FinalName;
+ Mangler::getNameWithPrefix(FinalName, Name, DataLayout(Layout));
+ return std::string(FinalName);
+}
+
+/// Collect all global variables.
+bool InstallAPIVisitor::VisitVarDecl(const VarDecl *D) {
+ // Skip function parameters.
+ if (isa<ParmVarDecl>(D))
+ return true;
+
+  // Skip variables in records. They are handled separately for C++.
+ if (D->getDeclContext()->isRecord())
+ return true;
+
+ // Skip anything inside functions or methods.
+ if (!D->isDefinedOutsideFunctionOrMethod())
+ return true;
+
+ // If this is a template but not specialization or instantiation, skip.
+ if (D->getASTContext().getTemplateOrSpecializationInfo(D) &&
+ D->getTemplateSpecializationKind() == TSK_Undeclared)
+ return true;
+
+ // TODO: Capture SourceLocation & Availability for Decls.
+ const RecordLinkage Linkage =
+ isExported(D) ? RecordLinkage::Exported : RecordLinkage::Internal;
+ const bool WeakDef = D->hasAttr<WeakAttr>();
+ const bool ThreadLocal = D->getTLSKind() != VarDecl::TLS_None;
+ Slice.addGlobal(getMangledName(D), Linkage, GlobalRecord::Kind::Variable,
+ getFlags(WeakDef, ThreadLocal));
+ return true;
+}
+
+} // namespace clang::installapi
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index 571a98488402..438c6d772e6e 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -1513,8 +1513,10 @@ NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
: APFloat::opInvalidOp;
}
-static inline bool IsExponentPart(char c) {
- return c == 'p' || c == 'P' || c == 'e' || c == 'E';
+static inline bool IsExponentPart(char c, bool isHex) {
+ if (isHex)
+ return c == 'p' || c == 'P';
+ return c == 'e' || c == 'E';
}
bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) {
@@ -1533,7 +1535,8 @@ bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Sc
if (saw_exponent) {
const char *Ptr = DigitsBegin;
- while (!IsExponentPart(*Ptr)) ++Ptr;
+ while (!IsExponentPart(*Ptr, radix == 16))
+ ++Ptr;
ExponentBegin = Ptr;
++Ptr;
NegativeExponent = *Ptr == '-';
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 984088e345c8..0de76ee119cf 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2189,6 +2189,23 @@ static bool SemaBuiltinCpu(Sema &S, const TargetInfo &TI, CallExpr *TheCall,
return false;
}
+/// Checks that __builtin_popcountg was called with a single argument, which is
+/// an integer.
+static bool SemaBuiltinPopcountg(Sema &S, CallExpr *TheCall) {
+ if (checkArgCount(S, TheCall, 1))
+ return true;
+
+ Expr *Arg = TheCall->getArg(0);
+ QualType ArgTy = Arg->getType();
+
+ if (!ArgTy->isIntegerType()) {
+ S.Diag(Arg->getBeginLoc(), diag::err_builtin_invalid_arg_type)
+ << 1 << /*integer ty*/ 7 << ArgTy;
+ return true;
+ }
+ return false;
+}
+
ExprResult
Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
CallExpr *TheCall) {
@@ -2959,7 +2976,12 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
diag::err_hip_invalid_args_builtin_mangled_name);
return ExprError();
}
+ break;
}
+ case Builtin::BI__builtin_popcountg:
+ if (SemaBuiltinPopcountg(*this, TheCall))
+ return ExprError();
+ break;
}
if (getLangOpts().HLSL && CheckHLSLBuiltinFunctionCall(BuiltinID, TheCall))
@@ -16126,15 +16148,8 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T,
// Diagnose conversions between different enumeration types.
// In C, we pretend that the type of an EnumConstantDecl is its enumeration
// type, to give us better diagnostics.
- QualType SourceType = E->getType();
- if (!S.getLangOpts().CPlusPlus) {
- if (DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(E))
- if (EnumConstantDecl *ECD = dyn_cast<EnumConstantDecl>(DRE->getDecl())) {
- EnumDecl *Enum = cast<EnumDecl>(ECD->getDeclContext());
- SourceType = S.Context.getTypeDeclType(Enum);
- Source = S.Context.getCanonicalType(SourceType).getTypePtr();
- }
- }
+ QualType SourceType = E->getEnumCoercedType(S.Context);
+ Source = S.Context.getCanonicalType(SourceType).getTypePtr();
if (const EnumType *SourceEnum = Source->getAs<EnumType>())
if (const EnumType *TargetEnum = Target->getAs<EnumType>())
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 816ee9e28135..2a0e86c37f1b 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -1497,7 +1497,8 @@ static void checkEnumArithmeticConversions(Sema &S, Expr *LHS, Expr *RHS,
//
// Warn on this in all language modes. Produce a deprecation warning in C++20.
// Eventually we will presumably reject these cases (in C++23 onwards?).
- QualType L = LHS->getType(), R = RHS->getType();
+ QualType L = LHS->getEnumCoercedType(S.Context),
+ R = RHS->getEnumCoercedType(S.Context);
bool LEnum = L->isUnscopedEnumerationType(),
REnum = R->isUnscopedEnumerationType();
bool IsCompAssign = ACK == Sema::ACK_CompAssign;
@@ -2653,7 +2654,7 @@ recoverFromMSUnqualifiedLookup(Sema &S, ASTContext &Context,
RD = ThisType->getPointeeType()->getAsCXXRecordDecl();
else if (auto *MD = dyn_cast<CXXMethodDecl>(S.CurContext))
RD = MD->getParent();
- if (!RD || !RD->hasAnyDependentBases())
+ if (!RD || !RD->hasDefinition() || !RD->hasAnyDependentBases())
return nullptr;
// Diagnose this as unqualified lookup into a dependent base class. If 'this'
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 322bd1c87b1d..59758d3bd6d1 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -4843,7 +4843,7 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType,
.get();
break;
case ICK_Floating_Integral:
- if (ToType->isRealFloatingType())
+ if (ToType->hasFloatingRepresentation())
From =
ImpCastExprToType(From, ToType, CK_IntegralToFloating, VK_PRValue,
/*BasePath=*/nullptr, CCK)
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index f7645422348b..7d38043890ca 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -1884,6 +1884,13 @@ static bool IsVectorElementConversion(Sema &S, QualType FromType,
return true;
}
+ if ((FromType->isRealFloatingType() && ToType->isIntegralType(S.Context)) ||
+ (FromType->isIntegralOrUnscopedEnumerationType() &&
+ ToType->isRealFloatingType())) {
+ ICK = ICK_Floating_Integral;
+ return true;
+ }
+
if (S.IsIntegralPromotion(From, FromType, ToType)) {
ICK = ICK_Integral_Promotion;
return true;
@@ -1895,13 +1902,6 @@ static bool IsVectorElementConversion(Sema &S, QualType FromType,
return true;
}
- if ((FromType->isRealFloatingType() && ToType->isIntegralType(S.Context)) ||
- (FromType->isIntegralOrUnscopedEnumerationType() &&
- ToType->isRealFloatingType())) {
- ICK = ICK_Floating_Integral;
- return true;
- }
-
return false;
}
@@ -14571,6 +14571,23 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc,
CurFPFeatureOverrides());
}
+ // If this is the .* operator, which is not overloadable, just
+ // create a built-in binary operator.
+ if (Opc == BO_PtrMemD) {
+ auto CheckPlaceholder = [&](Expr *&Arg) {
+ ExprResult Res = CheckPlaceholderExpr(Arg);
+ if (Res.isUsable())
+ Arg = Res.get();
+ return !Res.isUsable();
+ };
+
+ // CreateBuiltinBinOp() doesn't like it if we tell it to create a '.*'
+ // expression that contains placeholders (in either the LHS or RHS).
+ if (CheckPlaceholder(Args[0]) || CheckPlaceholder(Args[1]))
+ return ExprError();
+ return CreateBuiltinBinOp(OpLoc, Opc, Args[0], Args[1]);
+ }
+
// Always do placeholder-like conversions on the RHS.
if (checkPlaceholderForOverload(*this, Args[1]))
return ExprError();
@@ -14590,11 +14607,6 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc,
if (Opc == BO_Assign && !Args[0]->getType()->isOverloadableType())
return CreateBuiltinBinOp(OpLoc, Opc, Args[0], Args[1]);
- // If this is the .* operator, which is not overloadable, just
- // create a built-in binary operator.
- if (Opc == BO_PtrMemD)
- return CreateBuiltinBinOp(OpLoc, Opc, Args[0], Args[1]);
-
// Build the overload set.
OverloadCandidateSet CandidateSet(OpLoc, OverloadCandidateSet::CSK_Operator,
OverloadCandidateSet::OperatorRewriteInfo(
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
index defd83ec8e17..01b191ab0eea 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
@@ -310,8 +310,12 @@ public:
return true;
if (isa<EnumConstantDecl>(decl))
return true;
- if (auto *VD = dyn_cast<VarDecl>(decl))
- return VD->hasConstantInitialization() && VD->getEvaluatedValue();
+ if (auto *VD = dyn_cast<VarDecl>(decl)) {
+ if (VD->hasConstantInitialization() && VD->getEvaluatedValue())
+ return true;
+ auto *Init = VD->getInit();
+ return !Init || Visit(Init);
+ }
}
return false;
}
diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c
index a6244c3af202..2a72c24b43d1 100644
--- a/clang/test/AST/Interp/c.c
+++ b/clang/test/AST/Interp/c.c
@@ -180,3 +180,19 @@ void test4(void) {
t1 = sizeof(int);
}
+void localCompoundLiteral(void) {
+ struct S { int x, y; } s = {}; // pedantic-expected-warning {{use of an empty initializer}} \
+ // pedantic-ref-warning {{use of an empty initializer}}
+ struct T {
+ int i;
+ struct S s;
+ } t1 = { 1, {} }; // pedantic-expected-warning {{use of an empty initializer}} \
+ // pedantic-ref-warning {{use of an empty initializer}}
+
+ struct T t3 = {
+ (int){}, // pedantic-expected-warning {{use of an empty initializer}} \
+ // pedantic-ref-warning {{use of an empty initializer}}
+ {} // pedantic-expected-warning {{use of an empty initializer}} \
+ // pedantic-ref-warning {{use of an empty initializer}}
+ };
+}
diff --git a/clang/test/AST/Interp/cxx11.cpp b/clang/test/AST/Interp/cxx11.cpp
index 0a1e0f3fd28e..993e3618a378 100644
--- a/clang/test/AST/Interp/cxx11.cpp
+++ b/clang/test/AST/Interp/cxx11.cpp
@@ -22,3 +22,11 @@ int array2[recurse2]; // both-warning {{variable length arrays in C++}} \
// both-note {{initializer of 'recurse2' is not a constant expression}} \
// expected-error {{variable length array declaration not allowed at file scope}} \
// ref-warning {{variable length array folded to constant array as an extension}}
+
+struct S {
+ int m;
+};
+constexpr S s = { 5 };
+constexpr const int *p = &s.m + 1;
+
+constexpr const int *np2 = &(*(int(*)[4])nullptr)[0]; // ok
diff --git a/clang/test/AST/Interp/cxx20.cpp b/clang/test/AST/Interp/cxx20.cpp
index 78c09661c6dd..000ffe39eb94 100644
--- a/clang/test/AST/Interp/cxx20.cpp
+++ b/clang/test/AST/Interp/cxx20.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -fcxx-exceptions -fexperimental-new-constant-interpreter -std=c++20 -verify %s
-// RUN: %clang_cc1 -fcxx-exceptions -std=c++20 -verify=ref %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexperimental-new-constant-interpreter -std=c++20 -verify=both,expected %s
+// RUN: %clang_cc1 -fcxx-exceptions -std=c++20 -verify=both,ref %s
void test_alignas_operand() {
alignas(8) char dummy;
@@ -58,13 +58,10 @@ static_assert(pointerAssign2() == 12, "");
constexpr int unInitLocal() {
int a;
- return a; // ref-note {{read of uninitialized object}} \
- // expected-note {{read of uninitialized object}}
+ return a; // both-note {{read of uninitialized object}}
}
-static_assert(unInitLocal() == 0, ""); // ref-error {{not an integral constant expression}} \
- // ref-note {{in call to 'unInitLocal()'}} \
- // expected-error {{not an integral constant expression}} \
- // expected-note {{in call to 'unInitLocal()'}} \
+static_assert(unInitLocal() == 0, ""); // both-error {{not an integral constant expression}} \
+ // both-note {{in call to 'unInitLocal()'}}
constexpr int initializedLocal() {
int a;
@@ -75,25 +72,19 @@ static_assert(initializedLocal() == 20);
constexpr int initializedLocal2() {
int a[2];
- return *a; // expected-note {{read of uninitialized object is not allowed in a constant expression}} \
- // ref-note {{read of uninitialized object is not allowed in a constant expression}}
+ return *a; // both-note {{read of uninitialized object is not allowed in a constant expression}}
}
-static_assert(initializedLocal2() == 20); // expected-error {{not an integral constant expression}} \
- // expected-note {{in call to}} \
- // ref-error {{not an integral constant expression}} \
- // ref-note {{in call to}}
+static_assert(initializedLocal2() == 20); // both-error {{not an integral constant expression}} \
+ // both-note {{in call to}}
struct Int { int a; };
constexpr int initializedLocal3() {
Int i;
- return i.a; // ref-note {{read of uninitialized object is not allowed in a constant expression}} \
- // expected-note {{read of uninitialized object}}
+ return i.a; // both-note {{read of uninitialized object is not allowed in a constant expression}}
}
-static_assert(initializedLocal3() == 20); // expected-error {{not an integral constant expression}} \
- // expected-note {{in call to}} \
- // ref-error {{not an integral constant expression}} \
- // ref-note {{in call to}}
+static_assert(initializedLocal3() == 20); // both-error {{not an integral constant expression}} \
+ // both-note {{in call to}}
@@ -137,22 +128,16 @@ static_assert(!b4); // ref-error {{not an integral constant expression}} \
namespace UninitializedFields {
class A {
public:
- int a; // expected-note 4{{subobject declared here}} \
- // ref-note 4{{subobject declared here}}
+ int a; // both-note 4{{subobject declared here}}
constexpr A() {}
};
- constexpr A a; // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{subobject 'a' is not initialized}} \
- // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{subobject 'a' is not initialized}}
- constexpr A aarr[2]; // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{subobject 'a' is not initialized}} \
- // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{subobject 'a' is not initialized}}
+ constexpr A a; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{subobject 'a' is not initialized}}
+ constexpr A aarr[2]; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{subobject 'a' is not initialized}}
class F {
public:
- int f; // expected-note 3{{subobject declared here}} \
- // ref-note 3{{subobject declared here}}
+ int f; // both-note 3{{subobject declared here}}
constexpr F() {}
constexpr F(bool b) {
@@ -161,26 +146,19 @@ namespace UninitializedFields {
}
};
- constexpr F foo[2] = {true}; // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{subobject 'f' is not initialized}} \
- // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{subobject 'f' is not initialized}}
- constexpr F foo2[3] = {true, false, true}; // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{subobject 'f' is not initialized}} \
- // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{subobject 'f' is not initialized}}
- constexpr F foo3[3] = {true, true, F()}; // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{subobject 'f' is not initialized}} \
- // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{subobject 'f' is not initialized}}
+ constexpr F foo[2] = {true}; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{subobject 'f' is not initialized}}
+ constexpr F foo2[3] = {true, false, true}; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{subobject 'f' is not initialized}}
+ constexpr F foo3[3] = {true, true, F()}; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{subobject 'f' is not initialized}}
class Base {
public:
bool b;
- int a; // expected-note {{subobject declared here}} \
- // ref-note {{subobject declared here}}
+ int a; // both-note {{subobject declared here}}
constexpr Base() : b(true) {}
};
@@ -188,56 +166,44 @@ namespace UninitializedFields {
public:
constexpr Derived() : Base() {} };
- constexpr Derived D; // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{subobject 'a' is not initialized}} \
- // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{subobject 'a' is not initialized}}
+ constexpr Derived D; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{subobject 'a' is not initialized}}
class C2 {
public:
A a;
constexpr C2() {} };
- constexpr C2 c2; // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{subobject 'a' is not initialized}} \
- // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{subobject 'a' is not initialized}}
+ constexpr C2 c2; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{subobject 'a' is not initialized}}
class C3 {
public:
A a[2];
constexpr C3() {}
};
- constexpr C3 c3; // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{subobject 'a' is not initialized}} \
- // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{subobject 'a' is not initialized}}
+ constexpr C3 c3; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{subobject 'a' is not initialized}}
class C4 {
public:
- bool B[2][3]; // expected-note {{subobject declared here}} \
- // ref-note {{subobject declared here}}
+ bool B[2][3]; // both-note {{subobject declared here}}
constexpr C4(){}
};
- constexpr C4 c4; // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{subobject 'B' is not initialized}} \
- // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{subobject 'B' is not initialized}}
+ constexpr C4 c4; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{subobject 'B' is not initialized}}
};
namespace ConstThis {
class Foo {
- const int T = 12; // expected-note {{declared const here}} \
- // ref-note {{declared const here}}
+ const int T = 12; // both-note {{declared const here}}
int a;
public:
constexpr Foo() {
this->a = 10;
- T = 13; // expected-error {{cannot assign to non-static data member 'T' with const-qualified type}} \
- // ref-error {{cannot assign to non-static data member 'T' with const-qualified type}}
+ T = 13; // both-error {{cannot assign to non-static data member 'T' with const-qualified type}}
}
};
- constexpr Foo F; // expected-error {{must be initialized by a constant expression}} \
- // ref-error {{must be initialized by a constant expression}}
+ constexpr Foo F; // both-error {{must be initialized by a constant expression}}
class FooDtor {
@@ -264,8 +230,7 @@ namespace ConstThis {
constexpr ctor_test() {
if (Good)
a = 10;
- int local = 100 / a; // expected-note {{division by zero}} \
- // ref-note {{division by zero}}
+ int local = 100 / a; // both-note {{division by zero}}
}
};
@@ -277,22 +242,17 @@ namespace ConstThis {
constexpr ~dtor_test() {
if (Good)
a = 10;
- int local = 100 / a; // expected-note {{division by zero}} \
- // ref-note {{division by zero}}
+ int local = 100 / a; // both-note {{division by zero}}
}
};
constexpr ctor_test<true> good_ctor;
constexpr dtor_test<true> good_dtor;
- constexpr ctor_test<false> bad_ctor; // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{in call to}} \
- // ref-error {{must be initialized by a constant expression}} \
- // ref-note {{in call to}}
- constexpr dtor_test<false> bad_dtor; // expected-error {{must have constant destruction}} \
- // expected-note {{in call to}} \
- // ref-error {{must have constant destruction}} \
- // ref-note {{in call to}}
+ constexpr ctor_test<false> bad_ctor; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{in call to}}
+ constexpr dtor_test<false> bad_dtor; // both-error {{must have constant destruction}} \
+ // both-note {{in call to}}
};
namespace BaseInit {
@@ -311,10 +271,8 @@ namespace BaseInit {
};
static_assert(Final{1, 2, 3}.c == 3, ""); // OK
- static_assert(Final{1, 2, 3}.a == 0, ""); // expected-error {{not an integral constant expression}} \
- // expected-note {{read of uninitialized object}} \
- // ref-error {{not an integral constant expression}} \
- // ref-note {{read of uninitialized object}}
+ static_assert(Final{1, 2, 3}.a == 0, ""); // both-error {{not an integral constant expression}} \
+ // both-note {{read of uninitialized object}}
struct Mixin {
@@ -333,10 +291,8 @@ namespace BaseInit {
static_assert(Final2{1, 2, 3}.c == 3, ""); // OK
static_assert(Final2{1, 2, 3}.b == 2, ""); // OK
- static_assert(Final2{1, 2, 3}.a == 0, ""); // expected-error {{not an integral constant expression}} \
- // expected-note {{read of uninitialized object}} \
- // ref-error {{not an integral constant expression}} \
- // ref-note {{read of uninitialized object}}
+ static_assert(Final2{1, 2, 3}.a == 0, ""); // both-error {{not an integral constant expression}} \
+ // both-note {{read of uninitialized object}}
struct Mixin3 {
@@ -352,10 +308,8 @@ namespace BaseInit {
static_assert(Final3{1, 2, 3}.c == 3, ""); // OK
static_assert(Final3{1, 2, 3}.b == 2, ""); // OK
- static_assert(Final3{1, 2, 3}.a == 0, ""); // expected-error {{not an integral constant expression}} \
- // expected-note {{read of uninitialized object}} \
- // ref-error {{not an integral constant expression}} \
- // ref-note {{read of uninitialized object}}
+ static_assert(Final3{1, 2, 3}.a == 0, ""); // both-error {{not an integral constant expression}} \
+ // both-note {{read of uninitialized object}}
};
namespace Destructors {
@@ -633,16 +587,13 @@ namespace ImplicitFunction {
/// The operator= call here will fail and the diagnostics should be fine.
b = a; // ref-note {{subobject 'a' is not initialized}} \
- // ref-note {{in call to}} \
// expected-note {{read of uninitialized object}} \
- // expected-note {{in call to}}
+ // both-note {{in call to}}
return 1;
}
- static_assert(callMe() == 1, ""); // ref-error {{not an integral constant expression}} \
- // ref-note {{in call to 'callMe()'}} \
- // expected-error {{not an integral constant expression}} \
- // expected-note {{in call to 'callMe()'}}
+ static_assert(callMe() == 1, ""); // both-error {{not an integral constant expression}} \
+ // both-note {{in call to 'callMe()'}}
}
/// FIXME: Unfortunately, the similar tests in test/SemaCXX/{compare-cxx2a.cpp use member pointers,
@@ -680,8 +631,7 @@ namespace ThreeWayCmp {
static_assert(1.0 <=> 2.f == -1, "");
static_assert(1.0 <=> 1.0 == 0, "");
static_assert(2.0 <=> 1.0 == 1, "");
- constexpr int k = (1 <=> 1, 0); // expected-warning {{comparison result unused}} \
- // ref-warning {{comparison result unused}}
+ constexpr int k = (1 <=> 1, 0); // both-warning {{comparison result unused}}
static_assert(k== 0, "");
/// Pointers.
@@ -690,10 +640,8 @@ namespace ThreeWayCmp {
constexpr const int *pa1 = &a[1];
constexpr const int *pa2 = &a[2];
constexpr const int *pb1 = &b[1];
- static_assert(pa1 <=> pb1 != 0, ""); // expected-error {{not an integral constant expression}} \
- // expected-note {{has unspecified value}} \
- // ref-error {{not an integral constant expression}} \
- // ref-note {{has unspecified value}}
+ static_assert(pa1 <=> pb1 != 0, ""); // both-error {{not an integral constant expression}} \
+ // both-note {{has unspecified value}}
static_assert(pa1 <=> pa1 == 0, "");
static_assert(pa1 <=> pa2 == -1, "");
static_assert(pa2 <=> pa1 == 1, "");
@@ -776,3 +724,53 @@ namespace RewrittenBinaryOperators {
};
static_assert(X() < X(), "");
}
+
+namespace GH61417 {
+struct A {
+ unsigned x : 1;
+ unsigned : 0;
+ unsigned y : 1;
+
+ constexpr A() : x(0), y(0) {}
+ bool operator==(const A& rhs) const noexcept = default;
+};
+
+void f1() {
+ constexpr A a, b;
+ constexpr bool c = (a == b); // no diagnostic, we should not be comparing the
+ // unnamed bit-field which is indeterminate
+}
+
+void f2() {
+ A a, b;
+ bool c = (a == b); // no diagnostic nor crash during codegen attempting to
+ // access info for unnamed bit-field
+}
+}
+
+namespace FailingDestructor {
+ struct D {
+ int n;
+ bool can_destroy;
+
+ constexpr ~D() {
+ if (!can_destroy)
+ throw "oh no";
+ }
+ };
+ template<D d>
+ void f() {} // both-note {{invalid explicitly-specified argument}}
+
+ void g() {
+ f<D{0, false}>(); // both-error {{no matching function}}
+ }
+}
+
+
+void overflowInSwitchCase(int n) {
+ switch (n) {
+ case (int)(float)1e300: // both-error {{constant expression}} \
+ // both-note {{value +Inf is outside the range of representable values of type 'int'}}
+ break;
+ }
+}
diff --git a/clang/test/AST/Interp/spaceship.cpp b/clang/test/AST/Interp/spaceship.cpp
new file mode 100644
index 000000000000..7495b893772e
--- /dev/null
+++ b/clang/test/AST/Interp/spaceship.cpp
@@ -0,0 +1,232 @@
+// RUN: %clang_cc1 -std=c++2a -verify=both,ref %s -fcxx-exceptions
+// RUN: %clang_cc1 -std=c++2a -verify=both,expected %s -fcxx-exceptions -fexperimental-new-constant-interpreter
+
+namespace std {
+ struct strong_ordering { // both-note 6{{candidate}}
+ int n;
+ constexpr operator int() const { return n; }
+ static const strong_ordering less, equal, greater;
+ };
+ constexpr strong_ordering strong_ordering::less{-1},
+ strong_ordering::equal{0}, strong_ordering::greater{1};
+
+ struct weak_ordering {
+ int n;
+ constexpr weak_ordering(int n) : n(n) {}
+ constexpr weak_ordering(strong_ordering o) : n(o.n) {}
+ constexpr operator int() const { return n; }
+ static const weak_ordering less, equivalent, greater;
+ };
+ constexpr weak_ordering weak_ordering::less{-1},
+ weak_ordering::equivalent{0}, weak_ordering::greater{1};
+
+ struct partial_ordering {
+ double d;
+ constexpr partial_ordering(double d) : d(d) {}
+ constexpr partial_ordering(strong_ordering o) : d(o.n) {}
+ constexpr partial_ordering(weak_ordering o) : d(o.n) {}
+ constexpr operator double() const { return d; }
+ static const partial_ordering less, equivalent, greater, unordered;
+ };
+ constexpr partial_ordering partial_ordering::less{-1},
+ partial_ordering::equivalent{0}, partial_ordering::greater{1},
+ partial_ordering::unordered{__builtin_nan("")};
+
+ static_assert(!(partial_ordering::unordered < 0));
+ static_assert(!(partial_ordering::unordered == 0));
+ static_assert(!(partial_ordering::unordered > 0));
+}
+
+namespace Deletedness {
+ struct A {
+ std::strong_ordering operator<=>(const A&) const;
+ };
+ struct B {
+ bool operator==(const B&) const;
+ bool operator<(const B&) const;
+ };
+ struct C {
+ std::strong_ordering operator<=>(const C&) const = delete; // both-note 2{{deleted}}
+ };
+ struct D1 {
+ bool operator==(const D1&) const;
+ std::strong_ordering operator<=>(int) const; // both-note 2{{function not viable}} both-note 2{{function (with reversed parameter order) not viable}}
+ bool operator<(int) const; // both-note 2{{function not viable}}
+ };
+ struct D2 {
+ bool operator<(const D2&) const;
+ std::strong_ordering operator<=>(int) const; // both-note 2{{function not viable}} both-note 2{{function (with reversed parameter order) not viable}}
+ bool operator==(int) const; // both-note 2{{function not viable}}
+ };
+ struct E {
+ bool operator==(const E&) const;
+ bool operator<(const E&) const = delete; // both-note 2{{deleted}}
+ };
+ struct F {
+ std::strong_ordering operator<=>(const F&) const; // both-note 2{{candidate}}
+ std::strong_ordering operator<=>(F) const; // both-note 2{{candidate}}
+ };
+ struct G1 {
+ bool operator==(const G1&) const;
+ void operator<(const G1&) const;
+ };
+ struct G2 {
+ void operator==(const G2&) const;
+ bool operator<(const G2&) const;
+ };
+ struct H {
+ void operator<=>(const H&) const;
+ };
+
+ // both-note@#base {{deleted comparison function for base class 'C'}}
+ // both-note@#base {{no viable three-way comparison function for base class 'D1'}}
+ // both-note@#base {{three-way comparison cannot be synthesized because there is no viable function for '<' comparison}}
+ // both-note@#base {{no viable 'operator==' for base class 'D2'}}
+ // both-note@#base {{three-way comparison cannot be synthesized because there is no viable function for '==' comparison}}
+ // both-note@#base {{deleted comparison function for base class 'E'}}
+ // both-note@#base {{implied comparison for base class 'F' is ambiguous}}
+ template<typename T> struct Cmp : T { // #base
+ std::strong_ordering operator<=>(const Cmp&) const = default; // #cmp both-note 5{{here}}
+ };
+
+ void use(...);
+ void f() {
+ use(
+ Cmp<A>() <=> Cmp<A>(),
+ Cmp<B>() <=> Cmp<B>(),
+ Cmp<C>() <=> Cmp<C>(), // both-error {{deleted}}
+ Cmp<D1>() <=> Cmp<D1>(), // both-error {{deleted}}
+ Cmp<D2>() <=> Cmp<D2>(), // both-error {{deleted}}
+ Cmp<E>() <=> Cmp<E>(), // both-error {{deleted}}
+ Cmp<F>() <=> Cmp<F>(), // both-error {{deleted}}
+ // FIXME: The following three errors are not very good.
+ // both-error@#cmp {{value of type 'void' is not contextually convertible to 'bool'}}
+ Cmp<G1>() <=> Cmp<G1>(), // both-note-re {{in defaulted three-way comparison operator for '{{.*}}Cmp<{{.*}}G1>' first required here}}
+ // both-error@#cmp {{value of type 'void' is not contextually convertible to 'bool'}}
+ Cmp<G2>() <=> Cmp<G2>(), // both-note-re {{in defaulted three-way comparison operator for '{{.*}}Cmp<{{.*}}G2>' first required here}}
+ // both-error@#cmp {{no matching conversion for static_cast from 'void' to 'std::strong_ordering'}}
+ Cmp<H>() <=> Cmp<H>(), // both-note-re {{in defaulted three-way comparison operator for '{{.*}}Cmp<{{.*}}H>' first required here}}
+ 0
+ );
+ }
+
+ // both-note@#arr {{deleted comparison function for member 'arr'}}
+ // both-note@#arr {{no viable three-way comparison function for member 'arr'}}
+ // both-note@#arr {{three-way comparison cannot be synthesized because there is no viable function for '<' comparison}}
+ // both-note@#arr {{no viable 'operator==' for member 'arr'}}
+ // both-note@#arr {{three-way comparison cannot be synthesized because there is no viable function for '==' comparison}}
+ // both-note@#arr {{deleted comparison function for member 'arr'}}
+ // both-note@#arr {{implied comparison for member 'arr' is ambiguous}}
+ template<typename T> struct CmpArray {
+ T arr[3]; // #arr
+ std::strong_ordering operator<=>(const CmpArray&) const = default; // #cmparray both-note 5{{here}}
+ };
+ void g() {
+ use(
+ CmpArray<A>() <=> CmpArray<A>(),
+ CmpArray<B>() <=> CmpArray<B>(),
+ CmpArray<C>() <=> CmpArray<C>(), // both-error {{deleted}}
+ CmpArray<D1>() <=> CmpArray<D1>(), // both-error {{deleted}}
+ CmpArray<D2>() <=> CmpArray<D2>(), // both-error {{deleted}}
+ CmpArray<E>() <=> CmpArray<E>(), // both-error {{deleted}}
+ CmpArray<F>() <=> CmpArray<F>(), // both-error {{deleted}}
+ // FIXME: The following three errors are not very good.
+ // both-error@#cmparray {{value of type 'void' is not contextually convertible to 'bool'}}
+ CmpArray<G1>() <=> CmpArray<G1>(), // both-note-re {{in defaulted three-way comparison operator for '{{.*}}CmpArray<{{.*}}G1>' first required here}}
+ // both-error@#cmparray {{value of type 'void' is not contextually convertible to 'bool'}}
+ CmpArray<G2>() <=> CmpArray<G2>(), // both-note-re {{in defaulted three-way comparison operator for '{{.*}}CmpArray<{{.*}}G2>' first required here}}
+ // both-error@#cmparray {{no matching conversion for static_cast from 'void' to 'std::strong_ordering'}}
+ CmpArray<H>() <=> CmpArray<H>(), // both-note-re {{in defaulted three-way comparison operator for '{{.*}}CmpArray<{{.*}}H>' first required here}}
+ 0
+ );
+ }
+}
+
+namespace Access {
+ class A {
+ std::strong_ordering operator<=>(const A &) const; // both-note {{here}}
+ public:
+ bool operator==(const A &) const;
+ bool operator<(const A &) const;
+ };
+ struct B {
+ A a; // both-note {{would invoke a private 'operator<=>'}}
+ friend std::strong_ordering operator<=>(const B &, const B &) = default; // both-warning {{deleted}} both-note{{replace 'default'}}
+ };
+
+ class C {
+ std::strong_ordering operator<=>(const C &); // not viable (not const)
+ bool operator==(const C &) const; // both-note {{here}}
+ bool operator<(const C &) const;
+ };
+ struct D {
+ C c; // both-note {{would invoke a private 'operator=='}}
+ friend std::strong_ordering operator<=>(const D &, const D &) = default; // both-warning {{deleted}} both-note{{replace 'default'}}
+ };
+}
+
+namespace Synthesis {
+ enum Result { False, True, Mu };
+
+ constexpr bool toBool(Result R) {
+ if (R == Mu) throw "should not ask this question";
+ return R == True;
+ }
+
+ struct Val {
+ Result equal, less;
+ constexpr bool operator==(const Val&) const { return toBool(equal); }
+ constexpr bool operator<(const Val&) const { return toBool(less); }
+ };
+
+ template<typename T> struct Cmp {
+ Val val;
+ friend T operator<=>(const Cmp&, const Cmp&) = default; // both-note {{deleted}}
+ };
+
+ template<typename T> constexpr auto cmp(Result equal, Result less = Mu, Result reverse_less = Mu) {
+ return Cmp<T>{equal, less} <=> Cmp<T>{Mu, reverse_less};
+ }
+
+ static_assert(cmp<std::strong_ordering>(True) == 0);
+ static_assert(cmp<std::strong_ordering>(False, True) < 0);
+ static_assert(cmp<std::strong_ordering>(False, False) > 0);
+
+ static_assert(cmp<std::weak_ordering>(True) == 0);
+ static_assert(cmp<std::weak_ordering>(False, True) < 0);
+ static_assert(cmp<std::weak_ordering>(False, False) > 0);
+
+ static_assert(cmp<std::partial_ordering>(True) == 0);
+ static_assert(cmp<std::partial_ordering>(False, True) < 0);
+ static_assert(cmp<std::partial_ordering>(False, False, True) > 0);
+ static_assert(!(cmp<std::partial_ordering>(False, False, False) > 0));
+ static_assert(!(cmp<std::partial_ordering>(False, False, False) == 0));
+ static_assert(!(cmp<std::partial_ordering>(False, False, False) < 0));
+
+ // No synthesis is performed for a custom return type, even if it can be
+ // converted from a standard ordering.
+ struct custom_ordering {
+ custom_ordering(std::strong_ordering o);
+ };
+ void f(Cmp<custom_ordering> c) {
+ c <=> c; // both-error {{deleted}}
+ }
+}
+
+namespace Preference {
+ struct A {
+ A(const A&) = delete; // both-note {{deleted}}
+ // "usable" candidate that can't actually be called
+ friend void operator<=>(A, A); // both-note {{passing}}
+ // Callable candidates for synthesis not considered.
+ friend bool operator==(A, A);
+ friend bool operator<(A, A);
+ };
+
+ struct B {
+ B();
+ A a;
+ std::strong_ordering operator<=>(const B&) const = default; // both-error {{call to deleted constructor of 'A'}}
+ };
+ bool x = B() < B(); // both-note {{in defaulted three-way comparison operator for 'B' first required here}}
+}
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
index ac16a31293f3..80a9a263dab1 100644
--- a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
@@ -199,6 +199,8 @@ public:
bool trivial23() const { return OptionSet<Flags>::fromRaw(v).contains(Flags::Flag1); }
int trivial24() const { ASSERT(v); return v; }
unsigned trivial25() const { return __c11_atomic_load((volatile _Atomic(unsigned) *)&v, __ATOMIC_RELAXED); }
+ bool trivial26() { bool hasValue = v; return !hasValue; }
+ bool trivial27(int v) { bool value; value = v ? 1 : 0; return value; }
static RefCounted& singleton() {
static RefCounted s_RefCounted;
@@ -262,6 +264,15 @@ public:
return __c11_atomic_load((volatile _Atomic(unsigned) *)another(), __ATOMIC_RELAXED);
}
+ void nonTrivial11() {
+ Number num(0.3);
+ }
+
+ bool nonTrivial12() {
+ bool val = otherFunction();
+ return val;
+ }
+
unsigned v { 0 };
Number* number { nullptr };
Enum enumValue { Enum::Value1 };
@@ -309,6 +320,8 @@ public:
getFieldTrivial().trivial23(); // no-warning
getFieldTrivial().trivial24(); // no-warning
getFieldTrivial().trivial25(); // no-warning
+ getFieldTrivial().trivial26(); // no-warning
+ getFieldTrivial().trivial27(5); // no-warning
RefCounted::singleton().trivial18(); // no-warning
RefCounted::singleton().someFunction(); // no-warning
@@ -334,6 +347,10 @@ public:
// expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
getFieldTrivial().nonTrivial10();
// expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+ getFieldTrivial().nonTrivial11();
+ // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+ getFieldTrivial().nonTrivial12();
+ // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
}
};
diff --git a/clang/test/CodeGen/Mips/inline-asm-constraints.c b/clang/test/CodeGen/Mips/inline-asm-constraints.c
new file mode 100644
index 000000000000..88afe8735083
--- /dev/null
+++ b/clang/test/CodeGen/Mips/inline-asm-constraints.c
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -emit-llvm -triple mips -target-feature +soft-float %s -o - | FileCheck %s --check-prefix=SOFT_FLOAT
+
+// SOFT_FLOAT: call void asm sideeffect "", "r,~{$1}"(float %1)
+void read_float(float *p) {
+ __asm__("" ::"r"(*p));
+}
+
+// SOFT_FLOAT: call void asm sideeffect "", "r,~{$1}"(double %1)
+void read_double(double *p) {
+ __asm__("" :: "r"(*p));
+}
diff --git a/clang/test/CodeGen/builtins.c b/clang/test/CodeGen/builtins.c
index 88282120283b..73866116e07e 100644
--- a/clang/test/CodeGen/builtins.c
+++ b/clang/test/CodeGen/builtins.c
@@ -940,4 +940,47 @@ void test_builtin_os_log_long_double(void *buf, long double ld) {
// CHECK: %[[V3:.*]] = load i128, ptr %[[ARG0_ADDR]], align 16
// CHECK: store i128 %[[V3]], ptr %[[ARGDATA]], align 1
+// CHECK-LABEL: define{{.*}} void @test_builtin_popcountg
+void test_builtin_popcountg(unsigned char uc, unsigned short us,
+ unsigned int ui, unsigned long ul,
+ unsigned long long ull, unsigned __int128 ui128,
+ unsigned _BitInt(128) ubi128) {
+ volatile int pop;
+ pop = __builtin_popcountg(uc);
+ // CHECK: %1 = load i8, ptr %uc.addr, align 1
+ // CHECK-NEXT: %conv = zext i8 %1 to i32
+ // CHECK-NEXT: %2 = call i32 @llvm.ctpop.i32(i32 %conv)
+ // CHECK-NEXT: store volatile i32 %2, ptr %pop, align 4
+ pop = __builtin_popcountg(us);
+ // CHECK-NEXT: %3 = load i16, ptr %us.addr, align 2
+ // CHECK-NEXT: %conv1 = zext i16 %3 to i32
+ // CHECK-NEXT: %4 = call i32 @llvm.ctpop.i32(i32 %conv1)
+ // CHECK-NEXT: store volatile i32 %4, ptr %pop, align 4
+ pop = __builtin_popcountg(ui);
+ // CHECK-NEXT: %5 = load i32, ptr %ui.addr, align 4
+ // CHECK-NEXT: %6 = call i32 @llvm.ctpop.i32(i32 %5)
+ // CHECK-NEXT: store volatile i32 %6, ptr %pop, align 4
+ pop = __builtin_popcountg(ul);
+ // CHECK-NEXT: %7 = load i64, ptr %ul.addr, align 8
+ // CHECK-NEXT: %8 = call i64 @llvm.ctpop.i64(i64 %7)
+ // CHECK-NEXT: %cast = trunc i64 %8 to i32
+ // CHECK-NEXT: store volatile i32 %cast, ptr %pop, align 4
+ pop = __builtin_popcountg(ull);
+ // CHECK-NEXT: %9 = load i64, ptr %ull.addr, align 8
+ // CHECK-NEXT: %10 = call i64 @llvm.ctpop.i64(i64 %9)
+ // CHECK-NEXT: %cast2 = trunc i64 %10 to i32
+ // CHECK-NEXT: store volatile i32 %cast2, ptr %pop, align 4
+ pop = __builtin_popcountg(ui128);
+ // CHECK-NEXT: %11 = load i128, ptr %ui128.addr, align 16
+ // CHECK-NEXT: %12 = call i128 @llvm.ctpop.i128(i128 %11)
+ // CHECK-NEXT: %cast3 = trunc i128 %12 to i32
+ // CHECK-NEXT: store volatile i32 %cast3, ptr %pop, align 4
+ pop = __builtin_popcountg(ubi128);
+ // CHECK-NEXT: %13 = load i128, ptr %ubi128.addr, align 8
+ // CHECK-NEXT: %14 = call i128 @llvm.ctpop.i128(i128 %13)
+ // CHECK-NEXT: %cast4 = trunc i128 %14 to i32
+ // CHECK-NEXT: store volatile i32 %cast4, ptr %pop, align 4
+ // CHECK-NEXT: ret void
+}
+
#endif
diff --git a/clang/test/CodeGen/tbaa-struct.cpp b/clang/test/CodeGen/tbaa-struct.cpp
index e25fbc1a7781..883c982be26c 100644
--- a/clang/test/CodeGen/tbaa-struct.cpp
+++ b/clang/test/CodeGen/tbaa-struct.cpp
@@ -134,6 +134,23 @@ void copy9(NamedBitfields2 *a1, NamedBitfields2 *a2) {
*a1 = *a2;
}
+// Test with unnamed bitfield at the start and in between named ones..
+struct NamedBitfields3 {
+ unsigned : 11;
+ signed f0 : 9;
+ char : 2;
+ int f1 : 2;
+ double f2;
+};
+
+void copy10(NamedBitfields3 *a1, NamedBitfields3 *a2) {
+// CHECK-LABEL: _Z6copy10P15NamedBitfields3S0_
+// CHECK: tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) %a1, ptr noundef nonnull align 8 dereferenceable(16) %a2, i64 16, i1 false),
+// CHECK-OLD-SAME: !tbaa.struct [[TS8:!.*]]
+// CHECK-NEW-SAME: !tbaa [[TAG_NamedBitfields3:!.+]], !tbaa.struct
+ *a1 = *a2;
+}
+
// CHECK-OLD: [[TS]] = !{i64 0, i64 2, !{{.*}}, i64 4, i64 4, !{{.*}}, i64 8, i64 1, !{{.*}}, i64 12, i64 4, !{{.*}}}
// CHECK-OLD: [[CHAR:!.*]] = !{!"omnipotent char", !{{.*}}}
// CHECK-OLD: [[TAG_INT:!.*]] = !{[[INT:!.*]], [[INT]], i64 0}
@@ -145,10 +162,11 @@ void copy9(NamedBitfields2 *a1, NamedBitfields2 *a2) {
// CHECK-OLD: [[TS3]] = !{i64 0, i64 8, !{{.*}}, i64 0, i64 2, !{{.*}}, i64 4, i64 8, !{{.*}}}
// CHECK-OLD: [[TS4]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 1, i64 1, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]]}
// CHECK-OLD: [[TS5]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 4, i64 1, [[TAG_CHAR]], i64 5, i64 1, [[TAG_CHAR]]}
-// CHECK-OLD: [[TS6]] = !{i64 0, i64 4, [[TAG_INT]], i64 1, i64 4, [[TAG_INT]], i64 2, i64 1, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE:!.+]]}
+// CHECK-OLD: [[TS6]] = !{i64 0, i64 2, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE:!.+]]}
// CHECK-OLD: [[TAG_DOUBLE]] = !{[[DOUBLE:!.+]], [[DOUBLE]], i64 0}
// CHECK-OLD [[DOUBLE]] = !{!"double", [[CHAR]], i64 0}
-// CHECK-OLD: [[TS7]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 1, i64 1, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]], i64 3, i64 4, [[TAG_INT]], i64 3, i64 4, [[TAG_INT]], i64 4, i64 1, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE]], i64 16, i64 4, [[TAG_INT]]}
+// CHECK-OLD: [[TS7]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 1, i64 1, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]], i64 3, i64 1, [[TAG_CHAR]], i64 4, i64 1, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE]], i64 16, i64 1, [[TAG_CHAR]]}
+// CHECK-OLD: [[TS8]] = !{i64 0, i64 4, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE]]}
// CHECK-NEW-DAG: [[TYPE_char:!.*]] = !{{{.*}}, i64 1, !"omnipotent char"}
// CHECK-NEW-DAG: [[TAG_char]] = !{[[TYPE_char]], [[TYPE_char]], i64 0, i64 0}
@@ -168,3 +186,5 @@ void copy9(NamedBitfields2 *a1, NamedBitfields2 *a2) {
// CHECK-NEW-DAG: [[TYPE_double]] = !{[[TYPE_char]], i64 8, !"double"}
// CHECK-NEW-DAG: [[TAG_NamedBitfields2]] = !{[[TYPE_NamedBitfields2:!.+]], [[TYPE_NamedBitfields2]], i64 0, i64 24}
// CHECK-NEW-DAG: [[TYPE_NamedBitfields2]] = !{[[TYPE_char]], i64 24, !"_ZTS15NamedBitfields2", [[TYPE_char]], i64 0, i64 1, [[TYPE_char]], i64 1, i64 1, [[TYPE_char]], i64 2, i64 1, [[TYPE_int]], i64 3, i64 4, [[TYPE_int]], i64 3, i64 4, [[TYPE_char]], i64 4, i64 1, [[TYPE_double]], i64 8, i64 8, [[TYPE_int]], i64 16, i64 4}
+// CHECK-NEW-DAG: [[TAG_NamedBitfields3]] = !{[[TYPE_NamedBitfields3:!.+]], [[TYPE_NamedBitfields3]], i64 0, i64 16}
+// CHECK-NEW-DAG: [[TYPE_NamedBitfields3]] = !{[[TYPE_char]], i64 16, !"_ZTS15NamedBitfields3", [[TYPE_int]], i64 1, i64 4, [[TYPE_int]], i64 2, i64 4, [[TYPE_double]], i64 8, i64 8}
diff --git a/clang/test/CodeGenHLSL/semantics/GroupIndex-codegen.hlsl b/clang/test/CodeGenHLSL/semantics/GroupIndex-codegen.hlsl
index b8514b0d13f1..7e7ebe930bd9 100644
--- a/clang/test/CodeGenHLSL/semantics/GroupIndex-codegen.hlsl
+++ b/clang/test/CodeGenHLSL/semantics/GroupIndex-codegen.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -disable-llvm-passes -o - -hlsl-entry main %s
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -disable-llvm-passes -o - -hlsl-entry main %s | FileCheck %s
[numthreads(1,1,1)]
void main(unsigned GI : SV_GroupIndex) {
@@ -10,7 +10,7 @@ void main(unsigned GI : SV_GroupIndex) {
// semantic parameters and provides the expected void(void) signature that
// drivers expect for entry points.
-//CHECK: define void @main() #[[ENTRY_ATTR:#]]{
+//CHECK: define void @main() #[[#ENTRY_ATTR:]] {
//CHECK-NEXT: entry:
//CHECK-NEXT: %0 = call i32 @llvm.dx.flattened.thread.id.in.group()
//CHECK-NEXT: call void @"?main@@YAXI@Z"(i32 %0)
@@ -19,4 +19,4 @@ void main(unsigned GI : SV_GroupIndex) {
// Verify that the entry had the expected dx.shader attribute
-//CHECK: attributes #[[ENTRY_ATTR]] = { {{.*}}"dx.shader"="compute"{{.*}} }
+//CHECK: attributes #[[#ENTRY_ATTR]] = { {{.*}}"hlsl.shader"="compute"{{.*}} }
diff --git a/clang/test/CoverageMapping/single-byte-counters.cpp b/clang/test/CoverageMapping/single-byte-counters.cpp
new file mode 100644
index 000000000000..8e9b613dcc68
--- /dev/null
+++ b/clang/test/CoverageMapping/single-byte-counters.cpp
@@ -0,0 +1,169 @@
+// RUN: %clang_cc1 -mllvm -emptyline-comment-coverage=false -mllvm -enable-single-byte-coverage=true -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name single-byte-counters.cpp %s | FileCheck %s
+
+// CHECK: testIf
+int testIf(int x) { // CHECK-NEXT: File 0, [[@LINE]]:19 -> [[@LINE+10]]:2 = #0
+ // CHECK-NEXT: File 0, [[@LINE+5]]:7 -> [[@LINE+5]]:13 = #0
+ // CHECK-NEXT: Gap,File 0, [[@LINE+4]]:14 -> [[@LINE+5]]:5 = #1
+ // CHECK-NEXT: File 0, [[@LINE+4]]:5 -> [[@LINE+4]]:16 = #1
+ // CHECK-NEXT: File 0, [[@LINE+5]]:3 -> [[@LINE+5]]:16 = #2
+ int result = 0;
+ if (x == 0)
+ result = -1;
+
+ return result;
+}
+
+// CHECK-NEXT: testIfElse
+int testIfElse(int x) { // CHECK-NEXT: File 0, [[@LINE]]:23 -> [[@LINE+13]]:2 = #0
+ // CHECK-NEXT: File 0, [[@LINE+7]]:7 -> [[@LINE+7]]:12 = #0
+ // CHECK-NEXT: Gap,File 0, [[@LINE+6]]:13 -> [[@LINE+7]]:5 = #1
+ // CHECK-NEXT: File 0, [[@LINE+6]]:5 -> [[@LINE+6]]:15 = #1
+ // CHECK-NEXT: Gap,File 0, [[@LINE+5]]:16 -> [[@LINE+7]]:5 = #2
+ // CHECK-NEXT: File 0, [[@LINE+6]]:5 -> [[@LINE+6]]:19 = #2
+ // CHECK-NEXT: File 0, [[@LINE+6]]:3 -> [[@LINE+6]]:16 = #3
+ int result = 0;
+ if (x < 0)
+ result = 0;
+ else
+ result = x * x;
+ return result;
+}
+
+// CHECK-NEXT: testIfElseReturn
+int testIfElseReturn(int x) { // CHECK-NEXT: File 0, [[@LINE]]:29 -> [[@LINE+14]]:2 = #0
+ // CHECK-NEXT: File 0, [[@LINE+8]]:7 -> [[@LINE+8]]:12 = #0
+ // CHECK-NEXT: Gap,File 0, [[@LINE+7]]:13 -> [[@LINE+8]]:5 = #1
+ // CHECK-NEXT: File 0, [[@LINE+7]]:5 -> [[@LINE+7]]:19 = #1
+ // CHECK-NEXT: Gap,File 0, [[@LINE+6]]:20 -> [[@LINE+8]]:5 = #2
+ // CHECK-NEXT: File 0, [[@LINE+7]]:5 -> [[@LINE+7]]:13 = #2
+ // CHECK-NEXT: Gap,File 0, [[@LINE+6]]:14 -> [[@LINE+7]]:3 = #3
+ // CHECK-NEXT: File 0, [[@LINE+6]]:3 -> [[@LINE+6]]:16 = #3
+ int result = 0;
+ if (x > 0)
+ result = x * x;
+ else
+ return 0;
+ return result;
+}
+
+// CHECK-NEXT: testSwitch
+int testSwitch(int x) { // CHECK-NEXT: File 0, [[@LINE]]:23 -> [[@LINE+22]]:2 = #0
+ // CHECK-NEXT: Gap,File 0, [[@LINE+9]]:14 -> [[@LINE+17]]:15 = 0
+ // CHECK-NEXT: File 0, [[@LINE+9]]:3 -> [[@LINE+11]]:10 = #2
+ // CHECK-NEXT: Gap,File 0, [[@LINE+10]]:11 -> [[@LINE+11]]:3 = 0
+ // CHECK-NEXT: File 0, [[@LINE+10]]:3 -> [[@LINE+12]]:10 = #3
+ // CHECK-NEXT: Gap,File 0, [[@LINE+11]]:11 -> [[@LINE+12]]:3 = 0
+ // CHECK-NEXT: File 0, [[@LINE+11]]:3 -> [[@LINE+12]]:15 = #4
+ // CHECK-NEXT: Gap,File 0, [[@LINE+12]]:4 -> [[@LINE+14]]:3 = #1
+ // CHECK-NEXT: File 0, [[@LINE+13]]:3 -> [[@LINE+13]]:16 = #1
+ int result;
+ switch (x) {
+ case 1:
+ result = 1;
+ break;
+ case 2:
+ result = 2;
+ break;
+ default:
+ result = 0;
+ }
+
+ return result;
+}
+
+// CHECK-NEXT: testWhile
+int testWhile() { // CHECK-NEXT: File 0, [[@LINE]]:17 -> [[@LINE+13]]:2 = #0
+ // CHECK-NEXT: File 0, [[@LINE+6]]:10 -> [[@LINE+6]]:16 = #1
+ // CHECK-NEXT: Gap,File 0, [[@LINE+5]]:17 -> [[@LINE+5]]:18 = #2
+ // CHECK-NEXT: File 0, [[@LINE+4]]:18 -> [[@LINE+7]]:4 = #2
+ // CHECK-NEXT: File 0, [[@LINE+8]]:3 -> [[@LINE+8]]:13 = #3
+ int i = 0;
+ int sum = 0;
+ while (i < 10) {
+ sum += i;
+ i++;
+ }
+
+ return sum;
+}
+
+// CHECK-NEXT: testContinue
+int testContinue() { // CHECK-NEXT: File 0, [[@LINE]]:20 -> [[@LINE+21]]:2 = #0
+ // CHECK-NEXT: File 0, [[@LINE+12]]:10 -> [[@LINE+12]]:16 = #1
+ // CHECK-NEXT: Gap,File 0, [[@LINE+11]]:17 -> [[@LINE+11]]:18 = #2
+ // CHECK-NEXT: File 0, [[@LINE+10]]:18 -> [[@LINE+15]]:4 = #2
+ // CHECK-NEXT: File 0, [[@LINE+10]]:9 -> [[@LINE+10]]:15 = #2
+ // CHECK-NEXT: Gap,File 0, [[@LINE+9]]:16 -> [[@LINE+10]]:7 = #4
+ // CHECK-NEXT: File 0, [[@LINE+9]]:7 -> [[@LINE+9]]:15 = #4
+ // CHECK-NEXT: Gap,File 0, [[@LINE+8]]:16 -> [[@LINE+9]]:5 = #5
+ // CHECK-NEXT: File 0, [[@LINE+8]]:5 -> [[@LINE+10]]:4 = #5
+ // CHECK-NEXT: Gap,File 0, [[@LINE+9]]:4 -> [[@LINE+11]]:3 = #3
+ // CHECK-NEXT: File 0, [[@LINE+10]]:3 -> [[@LINE+10]]:13 = #3
+ int i = 0;
+ int sum = 0;
+ while (i < 10) {
+ if (i == 4)
+ continue;
+ sum += i;
+ i++;
+ }
+
+ return sum;
+}
+
+// CHECK-NEXT: testFor
+int testFor() { // CHECK-NEXT: File 0, [[@LINE]]:15 -> [[@LINE+13]]:2 = #0
+ // CHECK-NEXT: File 0, [[@LINE+7]]:19 -> [[@LINE+7]]:25 = #1
+ // CHECK-NEXT: File 0, [[@LINE+6]]:27 -> [[@LINE+6]]:30 = #2
+ // CHECK-NEXT: Gap,File 0, [[@LINE+5]]:31 -> [[@LINE+5]]:32 = #3
+ // CHECK-NEXT: File 0, [[@LINE+4]]:32 -> [[@LINE+6]]:4 = #3
+ // CHECK-NEXT: File 0, [[@LINE+7]]:3 -> [[@LINE+7]]:13 = #4
+ int i;
+ int sum = 0;
+ for (int i = 0; i < 10; i++) {
+ sum += i;
+ }
+
+ return sum;
+}
+
+// CHECK-NEXT: testForRange
+int testForRange() { // CHECK-NEXT: File 0, [[@LINE]]:20 -> [[@LINE+12]]:2 = #0
+ // CHECK-NEXT: Gap,File 0, [[@LINE+6]]:28 -> [[@LINE+6]]:29 = #1
+ // CHECK-NEXT: File 0, [[@LINE+5]]:29 -> [[@LINE+7]]:4 = #1
+ // CHECK-NEXT: File 0, [[@LINE+8]]:3 -> [[@LINE+8]]:13 = #2
+ int sum = 0;
+ int array[] = {1, 2, 3, 4, 5};
+
+ for (int element : array) {
+ sum += element;
+ }
+
+ return sum;
+}
+
+// CHECK-NEXT: testDo
+int testDo() { // CHECK-NEXT: File 0, [[@LINE]]:14 -> [[@LINE+12]]:2 = #0
+ // CHECK-NEXT: File 0, [[@LINE+5]]:6 -> [[@LINE+8]]:4 = #1
+ // CHECK-NEXT: File 0, [[@LINE+7]]:12 -> [[@LINE+7]]:17 = #2
+ // CHECK-NEXT: File 0, [[@LINE+8]]:3 -> [[@LINE+8]]:13 = #3
+ int i = 0;
+ int sum = 0;
+ do {
+ sum += i;
+ i++;
+ } while (i < 5);
+
+ return sum;
+}
+
+// CHECK-NEXT: testConditional
+int testConditional(int x) { // CHECK-NEXT: File 0, [[@LINE]]:28 -> [[@LINE+8]]:2 = #0
+ // CHECK-NEXT: File 0, [[@LINE+5]]:15 -> [[@LINE+5]]:22 = #0
+ // CHECK-NEXT: Gap,File 0, [[@LINE+4]]:24 -> [[@LINE+4]]:25 = #2
+ // CHECK-NEXT: File 0, [[@LINE+3]]:25 -> [[@LINE+3]]:26 = #2
+ // CHECK-NEXT: File 0, [[@LINE+2]]:29 -> [[@LINE+2]]:31 = #3
+ // CHECK-NEXT: File 0, [[@LINE+2]]:2 -> [[@LINE+2]]:15 = #1
+ int result = (x > 0) ? 1 : -1;
+ return result;
+}
diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.asan-i386.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/aarch64-unknown-linux/libclang_rt.hwasan.a
index e69de29bb2d1..e69de29bb2d1 100644
--- a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.asan-i386.a.syms
+++ b/clang/test/Driver/Inputs/resource_dir/lib/aarch64-unknown-linux/libclang_rt.hwasan.a
diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.asan-x86_64.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/aarch64-unknown-linux/libclang_rt.hwasan.a.syms
index e69de29bb2d1..e69de29bb2d1 100644
--- a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.asan-x86_64.a.syms
+++ b/clang/test/Driver/Inputs/resource_dir/lib/aarch64-unknown-linux/libclang_rt.hwasan.a.syms
diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.hwasan-aarch64.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/i386-unknown-linux/libclang_rt.asan.a
index e69de29bb2d1..e69de29bb2d1 100644
--- a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.hwasan-aarch64.a.syms
+++ b/clang/test/Driver/Inputs/resource_dir/lib/i386-unknown-linux/libclang_rt.asan.a
diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.hwasan-x86_64.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/i386-unknown-linux/libclang_rt.asan.a.syms
index e69de29bb2d1..e69de29bb2d1 100644
--- a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.hwasan-x86_64.a.syms
+++ b/clang/test/Driver/Inputs/resource_dir/lib/i386-unknown-linux/libclang_rt.asan.a.syms
diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.msan-x86_64.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/i686-unknown-linux/clang_rt.crtbegin.o
index e69de29bb2d1..e69de29bb2d1 100644
--- a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.msan-x86_64.a.syms
+++ b/clang/test/Driver/Inputs/resource_dir/lib/i686-unknown-linux/clang_rt.crtbegin.o
diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.msan_cxx-x86_64.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/i686-unknown-linux/clang_rt.crtend.o
index e69de29bb2d1..e69de29bb2d1 100644
--- a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.msan_cxx-x86_64.a.syms
+++ b/clang/test/Driver/Inputs/resource_dir/lib/i686-unknown-linux/clang_rt.crtend.o
diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.tsan-x86_64.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/clang_rt.crtbegin.o
index e69de29bb2d1..e69de29bb2d1 100644
--- a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.tsan-x86_64.a.syms
+++ b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/clang_rt.crtbegin.o
diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.tsan_cxx-x86_64.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/clang_rt.crtend.o
index e69de29bb2d1..e69de29bb2d1 100644
--- a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.tsan_cxx-x86_64.a.syms
+++ b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/clang_rt.crtend.o
diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.ubsan-i386.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.asan.a
index e69de29bb2d1..e69de29bb2d1 100644
--- a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.ubsan-i386.a.syms
+++ b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.asan.a
diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.ubsan-x86_64.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.asan.a.syms
index e69de29bb2d1..e69de29bb2d1 100644
--- a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.ubsan-x86_64.a.syms
+++ b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.asan.a.syms
diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.ubsan_cxx-i386.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.hwasan.a
index e69de29bb2d1..e69de29bb2d1 100644
--- a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.ubsan_cxx-i386.a.syms
+++ b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.hwasan.a
diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.ubsan_cxx-x86_64.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.hwasan.a.syms
index e69de29bb2d1..e69de29bb2d1 100644
--- a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.ubsan_cxx-x86_64.a.syms
+++ b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.hwasan.a.syms
diff --git a/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan.a b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan.a
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan.a
diff --git a/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan.a.syms
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan.a.syms
diff --git a/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan_cxx.a b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan_cxx.a
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan_cxx.a
diff --git a/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan_cxx.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan_cxx.a.syms
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan_cxx.a.syms
diff --git a/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan.a b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan.a
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan.a
diff --git a/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan.a.syms
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan.a.syms
diff --git a/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan_cxx.a b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan_cxx.a
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan_cxx.a
diff --git a/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan_cxx.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan_cxx.a.syms
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan_cxx.a.syms
diff --git a/clang/test/Driver/arch-specific-libdir.c b/clang/test/Driver/arch-specific-libdir.c
index be643d2c8b8a..162f9e424126 100644
--- a/clang/test/Driver/arch-specific-libdir.c
+++ b/clang/test/Driver/arch-specific-libdir.c
@@ -49,4 +49,4 @@
//
// Have a stricter check for no-archdir - that the driver doesn't add any
// subdirectory from the provided resource directory.
-// NO-ARCHDIR-NOT: -L[[FILE_PATH]]/Inputs/resource_dir
+// NO-ARCHDIR-NOT: -L[[FILE_PATH]]/Inputs/resource_dir"
diff --git a/clang/test/Driver/arm-compiler-rt.c b/clang/test/Driver/arm-compiler-rt.c
index adecacbcaabf..5e9e528400d0 100644
--- a/clang/test/Driver/arm-compiler-rt.c
+++ b/clang/test/Driver/arm-compiler-rt.c
@@ -3,7 +3,7 @@
// RUN: -resource-dir=%S/Inputs/resource_dir_with_arch_subdir \
// RUN: -rtlib=compiler-rt -### %s 2>&1 \
// RUN: | FileCheck %s -check-prefix ARM-EABI
-// ARM-EABI: "{{[^"]*}}libclang_rt.builtins-arm.a"
+// ARM-EABI: "{{[^"]*}}libclang_rt.builtins.a"
// RUN: %clang -target arm-linux-gnueabi \
// RUN: --sysroot=%S/Inputs/resource_dir_with_arch_subdir \
diff --git a/clang/test/Driver/baremetal-sysroot.cpp b/clang/test/Driver/baremetal-sysroot.cpp
index 46338185ffd9..bbc608809d0e 100644
--- a/clang/test/Driver/baremetal-sysroot.cpp
+++ b/clang/test/Driver/baremetal-sysroot.cpp
@@ -18,5 +18,5 @@
// CHECK-V6M-C-SAME: "-x" "c++" "{{.*}}baremetal-sysroot.cpp"
// CHECK-V6M-C-NEXT: "{{[^"]*}}ld{{(\.(lld|bfd|gold))?}}{{(\.exe)?}}" "{{.*}}.o" "-Bstatic"
// CHECK-V6M-C-SAME: "-L{{.*}}/baremetal_default_sysroot{{[/\\]+}}bin{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+}}armv6m-none-eabi{{[/\\]+}}lib"
-// CHECK-V6M-C-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a"
+// CHECK-V6M-C-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a"
// CHECK-V6M-C-SAME: "-o" "{{.*}}.o"
diff --git a/clang/test/Driver/baremetal.cpp b/clang/test/Driver/baremetal.cpp
index 8baf388894eb..657611bb3f38 100644
--- a/clang/test/Driver/baremetal.cpp
+++ b/clang/test/Driver/baremetal.cpp
@@ -18,7 +18,7 @@
// CHECK-V6M-C-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
// CHECK-V6M-C-SAME: "-T" "semihosted.lds" "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for"
// CHECK-V6M-C-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
-// CHECK-V6M-C-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a" "--target2=rel" "-o" "{{.*}}.tmp.out"
+// CHECK-V6M-C-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "--target2=rel" "-o" "{{.*}}.tmp.out"
// RUN: %clang %s -### --target=armv6m-none-eabi -nostdlibinc -nobuiltininc 2>&1 \
// RUN: --sysroot=%S/Inputs/baremetal_arm | FileCheck --check-prefix=CHECK-V6M-LIBINC %s
@@ -44,7 +44,7 @@
// CHECK-V6M-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
// CHECK-V6M-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
// CHECK-V6M-DEFAULTCXX-SAME: "-lc++" "-lc++abi" "-lunwind"
-// CHECK-V6M-DEFAULTCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a" "--target2=rel" "-o" "a.out"
+// CHECK-V6M-DEFAULTCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "--target2=rel" "-o" "a.out"
// RUN: %clangxx %s -### --target=armv6m-none-eabi -stdlib=libc++ 2>&1 \
// RUN: --sysroot=%S/Inputs/baremetal_arm | FileCheck --check-prefix=CHECK-V6M-LIBCXX %s
@@ -54,7 +54,7 @@
// CHECK-V6M-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
// CHECK-V6M-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
// CHECK-V6M-LIBCXX-SAME: "-lc++" "-lc++abi" "-lunwind"
-// CHECK-V6M-LIBCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a" "--target2=rel" "-o" "a.out"
+// CHECK-V6M-LIBCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "--target2=rel" "-o" "a.out"
// RUN: %clangxx %s -### --target=armv6m-none-eabi 2>&1 \
// RUN: --sysroot=%S/Inputs/baremetal_arm \
@@ -66,7 +66,7 @@
// CHECK-V6M-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
// CHECK-V6M-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
// CHECK-V6M-LIBSTDCXX-SAME: "-lstdc++" "-lsupc++" "-lunwind"
-// CHECK-V6M-LIBSTDCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a" "--target2=rel" "-o" "a.out"
+// CHECK-V6M-LIBSTDCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "--target2=rel" "-o" "a.out"
// RUN: %clangxx %s -### --target=armv6m-none-eabi 2>&1 \
// RUN: --sysroot=%S/Inputs/baremetal_arm \
@@ -89,7 +89,7 @@
// CHECK-V6M-LIBCXX-USR: "{{[^"]*}}-Bstatic"
// CHECK-V6M-LIBCXX-USR-SAME: "-L{{[^"]*}}{{[/\\]+}}baremetal_cxx_sysroot{{[/\\]+}}lib"
// CHECK-V6M-LIBCXX-USR-SAME: "-lc++" "-lc++abi" "-lunwind"
-// CHECK-V6M-LIBCXX-USR-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a"
+// CHECK-V6M-LIBCXX-USR-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a"
// RUN: %clangxx --target=arm-none-eabi -v 2>&1 \
// RUN: | FileCheck %s --check-prefix=CHECK-THREAD-MODEL
@@ -172,7 +172,7 @@
// CHECK-RV64-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
// CHECK-RV64-SAME: "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for"
// CHECK-RV64-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
-// CHECK-RV64-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv64.a" "-X" "-o" "{{.*}}.tmp.out"
+// CHECK-RV64-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-X" "-o" "{{.*}}.tmp.out"
// RUN: %clangxx %s -### --target=riscv64-unknown-elf 2>&1 \
// RUN: --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \
@@ -181,7 +181,7 @@
// CHECK-RV64-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
// CHECK-RV64-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib"
// CHECK-RV64-DEFAULTCXX-SAME: "-lc++" "-lc++abi" "-lunwind"
-// CHECK-RV64-DEFAULTCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv64.a" "-X" "-o" "a.out"
+// CHECK-RV64-DEFAULTCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-X" "-o" "a.out"
// RUN: %clangxx %s -### --target=riscv64-unknown-elf 2>&1 \
// RUN: --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \
@@ -193,7 +193,7 @@
// CHECK-RV64-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
// CHECK-RV64-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib"
// CHECK-RV64-LIBCXX-SAME: "-lc++" "-lc++abi" "-lunwind"
-// CHECK-RV64-LIBCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv64.a" "-X" "-o" "a.out"
+// CHECK-RV64-LIBCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-X" "-o" "a.out"
// RUN: %clangxx %s -### 2>&1 --target=riscv64-unknown-elf \
// RUN: --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \
@@ -205,7 +205,7 @@
// CHECK-RV64-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
// CHECK-RV64-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib"
// CHECK-RV64-LIBSTDCXX-SAME: "-lstdc++" "-lsupc++" "-lunwind"
-// CHECK-RV64-LIBSTDCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv64.a" "-X" "-o" "a.out"
+// CHECK-RV64-LIBSTDCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-X" "-o" "a.out"
// RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \
// RUN: -L some/directory/user/asked/for \
@@ -220,7 +220,7 @@
// CHECK-RV32-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
// CHECK-RV32-SAME: "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for"
// CHECK-RV32-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
-// CHECK-RV32-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv32.a" "-X" "-o" "a.out"
+// CHECK-RV32-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-X" "-o" "a.out"
// RUN: %clangxx %s -### 2>&1 --target=riscv32-unknown-elf \
// RUN: --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \
@@ -229,7 +229,7 @@
// CHECK-RV32-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
// CHECK-RV32-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib"
// CHECK-RV32-DEFAULTCXX-SAME: "-lc++" "-lc++abi" "-lunwind"
-// CHECK-RV32-DEFAULTCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv32.a" "-X" "-o" "a.out"
+// CHECK-RV32-DEFAULTCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-X" "-o" "a.out"
// RUN: %clangxx %s -### 2>&1 --target=riscv32-unknown-elf \
// RUN: --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \
@@ -241,7 +241,7 @@
// CHECK-RV32-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
// CHECK-RV32-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib"
// CHECK-RV32-LIBCXX-SAME: "-lc++" "-lc++abi" "-lunwind"
-// CHECK-RV32-LIBCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv32.a" "-X" "-o" "a.out"
+// CHECK-RV32-LIBCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-X" "-o" "a.out"
// RUN: %clangxx %s -### 2>&1 --target=riscv32-unknown-elf \
// RUN: --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \
@@ -253,7 +253,7 @@
// CHECK-RV32-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
// CHECK-RV32-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib"
// CHECK-RV32-LIBSTDCXX-SAME: "-lstdc++" "-lsupc++" "-lunwind"
-// CHECK-RV32-LIBSTDCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv32.a" "-X" "-o" "a.out"
+// CHECK-RV32-LIBSTDCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-X" "-o" "a.out"
// RUN: %clang %s -### 2>&1 --target=riscv64-unknown-elf \
// RUN: -nostdlibinc -nobuiltininc \
@@ -375,7 +375,7 @@
// CHECK-PPCEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
// CHECK-PPCEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
// CHECK-PPCEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
-// CHECK-PPCEABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-powerpc.a" "-o" "a.out"
+// CHECK-PPCEABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-o" "a.out"
// RUN: %clang -no-canonical-prefixes %s -### --target=powerpc64-unknown-eabi 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-PPC64EABI %s
@@ -387,7 +387,7 @@
// CHECK-PPC64EABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
// CHECK-PPC64EABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
// CHECK-PPC64EABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
-// CHECK-PPC64EABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-powerpc64.a" "-o" "a.out"
+// CHECK-PPC64EABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-o" "a.out"
// RUN: %clang -no-canonical-prefixes %s -### --target=powerpcle-unknown-eabi 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-PPCLEEABI %s
@@ -399,7 +399,7 @@
// CHECK-PPCLEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
// CHECK-PPCLEEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
// CHECK-PPCLEEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
-// CHECK-PPCLEEABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-powerpcle.a" "-o" "a.out"
+// CHECK-PPCLEEABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-o" "a.out"
// RUN: %clang -no-canonical-prefixes %s -### --target=powerpc64le-unknown-eabi 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-PPC64LEEABI %s
@@ -411,7 +411,7 @@
// CHECK-PPC64LEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
// CHECK-PPC64LEEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
// CHECK-PPC64LEEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
-// CHECK-PPC64LEEABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-powerpc64le.a" "-o" "a.out"
+// CHECK-PPC64LEEABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-o" "a.out"
// Check that compiler-rt library without the arch filename suffix will
// be used if present.
@@ -423,7 +423,7 @@
// RUN: --sysroot=%T/baremetal_clang_rt_noarch \
// RUN: | FileCheck --check-prefix=CHECK-CLANGRT-NOARCH %s
// CHECK-CLANGRT-NOARCH: "{{[^"]*}}libclang_rt.builtins.a"
-// CHECK-CLANGRT-NOARCH-NOT: "{{[^"]*}}libclang_rt.builtins-armv6m.a"
+// CHECK-CLANGRT-NOARCH-NOT: "{{[^"]*}}libclang_rt.builtins.a"
// Check that compiler-rt library with the arch filename suffix will be
// used if present.
@@ -434,7 +434,7 @@
// RUN: --target=armv6m-none-eabi \
// RUN: --sysroot=%T/baremetal_clang_rt_arch \
// RUN: | FileCheck --check-prefix=CHECK-CLANGRT-ARCH %s
-// CHECK-CLANGRT-ARCH: "{{[^"]*}}libclang_rt.builtins-armv6m.a"
+// CHECK-CLANGRT-ARCH: "{{[^"]*}}libclang_rt.builtins.a"
// CHECK-CLANGRT-ARCH-NOT: "{{[^"]*}}libclang_rt.builtins.a"
// Check that "--no-relax" is forwarded to the linker for RISC-V.
diff --git a/clang/test/Driver/basic-block-address-map.c b/clang/test/Driver/basic-block-address-map.c
index 022f972b412d..12393e8ebfd5 100644
--- a/clang/test/Driver/basic-block-address-map.c
+++ b/clang/test/Driver/basic-block-address-map.c
@@ -1,8 +1,9 @@
-// RUN: %clang -### -target x86_64 -fbasic-block-address-map %s -S 2>&1 | FileCheck -check-prefix=CHECK-PRESENT %s
+// RUN: %clang -### --target=x86_64 -fbasic-block-address-map %s -S 2>&1 | FileCheck -check-prefix=CHECK-PRESENT %s
+// RUN: %clang -### --target=aarch64 -fbasic-block-address-map %s -S 2>&1 | FileCheck -check-prefix=CHECK-PRESENT %s
// CHECK-PRESENT: -fbasic-block-address-map
-// RUN: %clang -### -target x86_64 -fno-basic-block-address-map %s -S 2>&1 | FileCheck %s --check-prefix=CHECK-ABSENT
+// RUN: %clang -### --target=x86_64 -fno-basic-block-address-map %s -S 2>&1 | FileCheck %s --check-prefix=CHECK-ABSENT
// CHECK-ABSENT-NOT: -fbasic-block-address-map
-// RUN: not %clang -c -target x86_64-apple-darwin10 -fbasic-block-address-map %s -S 2>&1 | FileCheck -check-prefix=CHECK-TRIPLE %s
+// RUN: not %clang -c --target=x86_64-apple-darwin10 -fbasic-block-address-map %s -S 2>&1 | FileCheck -check-prefix=CHECK-TRIPLE %s
// CHECK-TRIPLE: error: unsupported option '-fbasic-block-address-map' for target
diff --git a/clang/test/Driver/compiler-rt-unwind.c b/clang/test/Driver/compiler-rt-unwind.c
index adc09792a27f..4260dab9302c 100644
--- a/clang/test/Driver/compiler-rt-unwind.c
+++ b/clang/test/Driver/compiler-rt-unwind.c
@@ -52,7 +52,7 @@
// RUN: --target=x86_64-unknown-linux -rtlib=compiler-rt \
// RUN: --gcc-toolchain="" -resource-dir=%S/Inputs/resource_dir \
// RUN: | FileCheck --check-prefix=RTLIB-COMPILER-RT %s
-// RTLIB-COMPILER-RT: "{{.*}}libclang_rt.builtins-x86_64.a"
+// RTLIB-COMPILER-RT: "{{.*}}libclang_rt.builtins.a"
//
// RUN: %clang -### %s 2>&1 \
// RUN: --target=x86_64-unknown-linux -rtlib=compiler-rt --unwindlib=libunwind \
@@ -62,7 +62,7 @@
// RUN: --target=x86_64-unknown-linux -rtlib=compiler-rt --unwindlib=libunwind \
// RUN: --gcc-toolchain="" -resource-dir=%S/Inputs/resource_dir \
// RUN: | FileCheck --check-prefix=RTLIB-COMPILER-RT-UNWINDLIB-COMPILER-RT %s
-// RTLIB-COMPILER-RT-UNWINDLIB-COMPILER-RT: "{{.*}}libclang_rt.builtins-x86_64.a"
+// RTLIB-COMPILER-RT-UNWINDLIB-COMPILER-RT: "{{.*}}libclang_rt.builtins.a"
// RTLIB-COMPILER-RT-UNWINDLIB-COMPILER-RT-SAME: "--as-needed"
// RTLIB-COMPILER-RT-UNWINDLIB-COMPILER-RT-SAME: "-lunwind"
// RTLIB-COMPILER-RT-UNWINDLIB-COMPILER-RT-SAME: "--no-as-needed"
@@ -71,14 +71,14 @@
// RUN: --target=x86_64-unknown-linux -rtlib=compiler-rt --unwindlib=libgcc \
// RUN: --gcc-toolchain="" -resource-dir=%S/Inputs/resource_dir \
// RUN: | FileCheck --check-prefix=RTLIB-COMPILER-RT-UNWINDLIB-GCC %s
-// RTLIB-COMPILER-RT-UNWINDLIB-GCC: "{{.*}}libclang_rt.builtins-x86_64.a"
+// RTLIB-COMPILER-RT-UNWINDLIB-GCC: "{{.*}}libclang_rt.builtins.a"
// RTLIB-COMPILER-RT-UNWINDLIB-GCC-SAME: "-lgcc_s"
//
// RUN: %clang -### %s 2>&1 \
// RUN: --target=x86_64-unknown-linux -rtlib=compiler-rt --unwindlib=libgcc \
// RUN: -static --gcc-toolchain="" -resource-dir=%S/Inputs/resource_dir \
// RUN: | FileCheck --check-prefix=RTLIB-COMPILER-RT-UNWINDLIB-GCC-STATIC %s
-// RTLIB-COMPILER-RT-UNWINDLIB-GCC-STATIC: "{{.*}}libclang_rt.builtins-x86_64.a"
+// RTLIB-COMPILER-RT-UNWINDLIB-GCC-STATIC: "{{.*}}libclang_rt.builtins.a"
// RTLIB-COMPILER-RT-UNWINDLIB-GCC-STATIC-SAME: "-lgcc_eh"
//
// RUN: not %clang %s 2> %t.err \
diff --git a/clang/test/Driver/coverage-ld.c b/clang/test/Driver/coverage-ld.c
index e72bbf86bf4e..acb08eb5db59 100644
--- a/clang/test/Driver/coverage-ld.c
+++ b/clang/test/Driver/coverage-ld.c
@@ -13,7 +13,7 @@
// RUN: | FileCheck --check-prefix=CHECK-LINUX-I386 %s
//
// CHECK-LINUX-I386-NOT: "-u__llvm_profile_runtime"
-// CHECK-LINUX-I386: /Inputs/resource_dir{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}libclang_rt.profile-i386.a"
+// CHECK-LINUX-I386: /Inputs/resource_dir{{/|\\\\}}lib{{.*}}i386-unknown-linux{{.*}}libclang_rt.profile.a"
// CHECK-LINUX-I386-NOT: "-u__llvm_profile_runtime"
// CHECK-LINUX-I386: "-lc"
//
@@ -24,7 +24,7 @@
// RUN: | FileCheck --check-prefix=CHECK-LINUX-X86-64 %s
//
// CHECK-LINUX-X86-64: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
-// CHECK-LINUX-X86-64: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}libclang_rt.profile-x86_64.a" {{.*}} "-lc"
+// CHECK-LINUX-X86-64: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{.*}}linux{{.*}}libclang_rt.profile.a" {{.*}} "-lc"
//
// RUN: %clang -### %s 2>&1 \
// RUN: --target=x86_64-unknown-freebsd --coverage -fuse-ld=ld \
diff --git a/clang/test/Driver/fdefine-target-os-macros.c b/clang/test/Driver/fdefine-target-os-macros.c
index d7379dd3d539..a4de51e8e724 100644
--- a/clang/test/Driver/fdefine-target-os-macros.c
+++ b/clang/test/Driver/fdefine-target-os-macros.c
@@ -12,6 +12,7 @@
// RUN: -DIOS=0 \
// RUN: -DTV=0 \
// RUN: -DWATCH=0 \
+// RUN: -DVISION=0 \
// RUN: -DDRIVERKIT=0 \
// RUN: -DMACCATALYST=0 \
// RUN: -DEMBEDDED=0 \
@@ -27,6 +28,7 @@
// RUN: -DIOS=1 \
// RUN: -DTV=0 \
// RUN: -DWATCH=0 \
+// RUN: -DVISION=0 \
// RUN: -DDRIVERKIT=0 \
// RUN: -DMACCATALYST=0 \
// RUN: -DEMBEDDED=1 \
@@ -42,6 +44,7 @@
// RUN: -DIOS=1 \
// RUN: -DTV=0 \
// RUN: -DWATCH=0 \
+// RUN: -DVISION=0 \
// RUN: -DDRIVERKIT=0 \
// RUN: -DMACCATALYST=1 \
// RUN: -DEMBEDDED=0 \
@@ -57,6 +60,7 @@
// RUN: -DIOS=1 \
// RUN: -DTV=0 \
// RUN: -DWATCH=0 \
+// RUN: -DVISION=0 \
// RUN: -DDRIVERKIT=0 \
// RUN: -DMACCATALYST=0 \
// RUN: -DEMBEDDED=0 \
@@ -72,6 +76,7 @@
// RUN: -DIOS=0 \
// RUN: -DTV=1 \
// RUN: -DWATCH=0 \
+// RUN: -DVISION=0 \
// RUN: -DDRIVERKIT=0 \
// RUN: -DMACCATALYST=0 \
// RUN: -DEMBEDDED=1 \
@@ -87,6 +92,7 @@
// RUN: -DIOS=0 \
// RUN: -DTV=1 \
// RUN: -DWATCH=0 \
+// RUN: -DVISION=0 \
// RUN: -DDRIVERKIT=0 \
// RUN: -DMACCATALYST=0 \
// RUN: -DEMBEDDED=0 \
@@ -102,6 +108,7 @@
// RUN: -DIOS=0 \
// RUN: -DTV=0 \
// RUN: -DWATCH=1 \
+// RUN: -DVISION=0 \
// RUN: -DDRIVERKIT=0 \
// RUN: -DMACCATALYST=0 \
// RUN: -DEMBEDDED=1 \
@@ -117,6 +124,39 @@
// RUN: -DIOS=0 \
// RUN: -DTV=0 \
// RUN: -DWATCH=1 \
+// RUN: -DVISION=0 \
+// RUN: -DDRIVERKIT=0 \
+// RUN: -DMACCATALYST=0 \
+// RUN: -DEMBEDDED=0 \
+// RUN: -DSIMULATOR=1 \
+// RUN: -DWINDOWS=0 \
+// RUN: -DLINUX=0 \
+// RUN: -DUNIX=0
+
+// RUN: %clang -dM -E --target=arm64-apple-xros %s 2>&1 \
+// RUN: | FileCheck %s -DMAC=1 \
+// RUN: -DOSX=0 \
+// RUN: -DIPHONE=1 \
+// RUN: -DIOS=0 \
+// RUN: -DTV=0 \
+// RUN: -DWATCH=0 \
+// RUN: -DVISION=1 \
+// RUN: -DDRIVERKIT=0 \
+// RUN: -DMACCATALYST=0 \
+// RUN: -DEMBEDDED=1 \
+// RUN: -DSIMULATOR=0 \
+// RUN: -DWINDOWS=0 \
+// RUN: -DLINUX=0 \
+// RUN: -DUNIX=0
+
+// RUN: %clang -dM -E --target=arm64-apple-xros-simulator %s 2>&1 \
+// RUN: | FileCheck %s -DMAC=1 \
+// RUN: -DOSX=0 \
+// RUN: -DIPHONE=1 \
+// RUN: -DIOS=0 \
+// RUN: -DTV=0 \
+// RUN: -DWATCH=0 \
+// RUN: -DVISION=1 \
// RUN: -DDRIVERKIT=0 \
// RUN: -DMACCATALYST=0 \
// RUN: -DEMBEDDED=0 \
@@ -132,6 +172,7 @@
// RUN: -DIOS=0 \
// RUN: -DTV=0 \
// RUN: -DWATCH=0 \
+// RUN: -DVISION=0 \
// RUN: -DDRIVERKIT=1 \
// RUN: -DMACCATALYST=0 \
// RUN: -DEMBEDDED=0 \
@@ -148,6 +189,7 @@
// RUN: -DIOS=0 \
// RUN: -DTV=0 \
// RUN: -DWATCH=0 \
+// RUN: -DVISION=0 \
// RUN: -DDRIVERKIT=0 \
// RUN: -DMACCATALYST=0 \
// RUN: -DEMBEDDED=0 \
@@ -164,6 +206,7 @@
// RUN: -DIOS=0 \
// RUN: -DTV=0 \
// RUN: -DWATCH=0 \
+// RUN: -DVISION=0 \
// RUN: -DDRIVERKIT=0 \
// RUN: -DMACCATALYST=0 \
// RUN: -DEMBEDDED=0 \
@@ -180,6 +223,7 @@
// RUN: -DIOS=0 \
// RUN: -DTV=0 \
// RUN: -DWATCH=0 \
+// RUN: -DVISION=0 \
// RUN: -DDRIVERKIT=0 \
// RUN: -DMACCATALYST=0 \
// RUN: -DEMBEDDED=0 \
@@ -196,6 +240,7 @@
// RUN: -DIOS=0 \
// RUN: -DTV=0 \
// RUN: -DWATCH=0 \
+// RUN: -DVISION=0 \
// RUN: -DDRIVERKIT=0 \
// RUN: -DMACCATALYST=0 \
// RUN: -DEMBEDDED=0 \
@@ -226,6 +271,7 @@
// CHECK-DAG: #define TARGET_OS_IOS [[IOS]]
// CHECK-DAG: #define TARGET_OS_TV [[TV]]
// CHECK-DAG: #define TARGET_OS_WATCH [[WATCH]]
+// CHECK-DAG: #define TARGET_OS_VISION [[VISION]]
// CHECK-DAG: #define TARGET_OS_DRIVERKIT [[DRIVERKIT]]
// CHECK-DAG: #define TARGET_OS_MACCATALYST [[MACCATALYST]]
// CHECK-DAG: #define TARGET_OS_SIMULATOR [[SIMULATOR]]
diff --git a/clang/test/Driver/fuchsia.c b/clang/test/Driver/fuchsia.c
index cfcb8ad0d3ab..ca53f0d107a0 100644
--- a/clang/test/Driver/fuchsia.c
+++ b/clang/test/Driver/fuchsia.c
@@ -187,7 +187,7 @@
// CHECK-SCUDO-X86: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
// CHECK-SCUDO-X86: "-fsanitize=safe-stack,scudo"
// CHECK-SCUDO-X86: "-pie"
-// CHECK-SCUDO-X86: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}fuchsia{{/|\\\\}}libclang_rt.scudo_standalone-x86_64.so"
+// CHECK-SCUDO-X86: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{.*}}fuchsia{{.*}}libclang_rt.scudo_standalone.so"
// RUN: %clang -### %s --target=aarch64-unknown-fuchsia \
// RUN: -fsanitize=scudo 2>&1 \
@@ -197,7 +197,7 @@
// CHECK-SCUDO-AARCH64: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
// CHECK-SCUDO-AARCH64: "-fsanitize=shadow-call-stack,scudo"
// CHECK-SCUDO-AARCH64: "-pie"
-// CHECK-SCUDO-AARCH64: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}fuchsia{{/|\\\\}}libclang_rt.scudo_standalone-aarch64.so"
+// CHECK-SCUDO-AARCH64: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{.*}}fuchsia{{.*}}libclang_rt.scudo_standalone.so"
// RUN: %clang -### %s --target=x86_64-unknown-fuchsia \
// RUN: -fsanitize=scudo -fPIC -shared 2>&1 \
@@ -206,7 +206,7 @@
// RUN: | FileCheck %s -check-prefix=CHECK-SCUDO-SHARED
// CHECK-SCUDO-SHARED: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
// CHECK-SCUDO-SHARED: "-fsanitize=safe-stack,scudo"
-// CHECK-SCUDO-SHARED: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}fuchsia{{/|\\\\}}libclang_rt.scudo_standalone-x86_64.so"
+// CHECK-SCUDO-SHARED: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{.*}}fuchsia{{.*}}libclang_rt.scudo_standalone.so"
// RUN: %clang -### %s --target=aarch64-unknown-fuchsia \
// RUN: -fsanitize=leak 2>&1 \
diff --git a/clang/test/Driver/instrprof-ld.c b/clang/test/Driver/instrprof-ld.c
index 9a58cd3a0be7..674580b349d4 100644
--- a/clang/test/Driver/instrprof-ld.c
+++ b/clang/test/Driver/instrprof-ld.c
@@ -7,7 +7,7 @@
// RUN: | FileCheck --check-prefix=CHECK-LINUX-I386 %s
//
// CHECK-LINUX-I386: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
-// CHECK-LINUX-I386: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}libclang_rt.profile-i386.a" {{.*}} "-lc"
+// CHECK-LINUX-I386: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{/|\\\\}}i386-unknown-linux{{/|\\\\}}libclang_rt.profile.a" {{.*}} "-lc"
//
// RUN: %clang -### %s 2>&1 \
// RUN: --target=x86_64-unknown-linux -fprofile-instr-generate -fuse-ld=ld \
@@ -16,7 +16,7 @@
// RUN: | FileCheck --check-prefix=CHECK-LINUX-X86-64 %s
//
// CHECK-LINUX-X86-64: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
-// CHECK-LINUX-X86-64: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}libclang_rt.profile-x86_64.a" {{.*}} "-lc"
+// CHECK-LINUX-X86-64: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{.*}}linux{{.*}}libclang_rt.profile.a" {{.*}} "-lc"
//
// RUN: %clang -### %s 2>&1 \
// RUN: --target=x86_64-unknown-linux -fprofile-instr-generate -nostdlib -fuse-ld=ld \
@@ -25,7 +25,7 @@
// RUN: | FileCheck --check-prefix=CHECK-LINUX-NOSTDLIB-X86-64 %s
//
// CHECK-LINUX-NOSTDLIB-X86-64: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
-// CHECK-LINUX-NOSTDLIB-X86-64: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}libclang_rt.profile-x86_64.a"
+// CHECK-LINUX-NOSTDLIB-X86-64: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{.*}}linux{{.*}}libclang_rt.profile.a"
//
// RUN: %clang -### %s 2>&1 \
// RUN: --target=x86_64-unknown-freebsd -fprofile-instr-generate -fuse-ld=ld \
@@ -62,7 +62,7 @@
// RUN: | FileCheck --check-prefix=CHECK-LINUX-I386-SHARED %s
//
// CHECK-LINUX-I386-SHARED: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
-// CHECK-LINUX-I386-SHARED: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}libclang_rt.profile-i386.a" {{.*}} "-lc"
+// CHECK-LINUX-I386-SHARED: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{.*}}i386-unknown-linux{{.*}}libclang_rt.profile.a" {{.*}} "-lc"
//
// RUN: %clang -### %s 2>&1 \
// RUN: -shared \
@@ -72,7 +72,7 @@
// RUN: | FileCheck --check-prefix=CHECK-LINUX-X86-64-SHARED %s
//
// CHECK-LINUX-X86-64-SHARED: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
-// CHECK-LINUX-X86-64-SHARED: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}libclang_rt.profile-x86_64.a" {{.*}} "-lc"
+// CHECK-LINUX-X86-64-SHARED: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{.*}}linux{{.*}}libclang_rt.profile.a" {{.*}} "-lc"
//
// RUN: %clang -### %s 2>&1 \
// RUN: -shared \
diff --git a/clang/test/Driver/linux-ld.c b/clang/test/Driver/linux-ld.c
index b3ce5ca307a6..d77367b4ece7 100644
--- a/clang/test/Driver/linux-ld.c
+++ b/clang/test/Driver/linux-ld.c
@@ -61,15 +61,15 @@
// CHECK-LD-RT: "--eh-frame-hdr"
// CHECK-LD-RT: "-m" "elf_x86_64"
// CHECK-LD-RT: "-dynamic-linker"
-// CHECK-LD-RT: "[[RESDIR]]{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}clang_rt.crtbegin-x86_64.o"
+// CHECK-LD-RT: "[[RESDIR]]{{/|\\\\}}lib{{/|\\\\}}x86_64-unknown-linux{{/|\\\\}}clang_rt.crtbegin.o"
// CHECK-LD-RT: "-L[[SYSROOT]]/usr/lib/gcc/x86_64-unknown-linux/10.2.0"
// CHECK-LD-RT: "-L[[SYSROOT]]/usr/lib/gcc/x86_64-unknown-linux/10.2.0/../../../../x86_64-unknown-linux/lib"
// CHECK-LD-RT: "-L[[SYSROOT]]/lib"
// CHECK-LD-RT: "-L[[SYSROOT]]/usr/lib"
-// CHECK-LD-RT: libclang_rt.builtins-x86_64.a"
+// CHECK-LD-RT: libclang_rt.builtins.a"
// CHECK-LD-RT: "-lc"
-// CHECK-LD-RT: libclang_rt.builtins-x86_64.a"
-// CHECK-LD-RT: "[[RESDIR]]{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}clang_rt.crtend-x86_64.o"
+// CHECK-LD-RT: libclang_rt.builtins.a"
+// CHECK-LD-RT: "[[RESDIR]]{{/|\\\\}}lib{{/|\\\\}}x86_64-unknown-linux{{/|\\\\}}clang_rt.crtend.o"
//
// RUN: %clang -### %s -no-pie 2>&1 \
// RUN: --target=i686-unknown-linux \
@@ -84,15 +84,15 @@
// CHECK-LD-RT-I686: "--eh-frame-hdr"
// CHECK-LD-RT-I686: "-m" "elf_i386"
// CHECK-LD-RT-I686: "-dynamic-linker"
-// CHECK-LD-RT-I686: "[[RESDIR]]{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}clang_rt.crtbegin-i386.o"
+// CHECK-LD-RT-I686: "[[RESDIR]]{{/|\\\\}}lib{{/|\\\\}}i686-unknown-linux{{/|\\\\}}clang_rt.crtbegin.o"
// CHECK-LD-RT-I686: "-L[[SYSROOT]]/usr/lib/gcc/i686-unknown-linux/10.2.0"
// CHECK-LD-RT-I686: "-L[[SYSROOT]]/usr/lib/gcc/i686-unknown-linux/10.2.0/../../../../i686-unknown-linux/lib"
// CHECK-LD-RT-I686: "-L[[SYSROOT]]/lib"
// CHECK-LD-RT-I686: "-L[[SYSROOT]]/usr/lib"
-// CHECK-LD-RT-I686: libclang_rt.builtins-i386.a"
+// CHECK-LD-RT-I686: libclang_rt.builtins.a"
// CHECK-LD-RT-I686: "-lc"
-// CHECK-LD-RT-I686: libclang_rt.builtins-i386.a"
-// CHECK-LD-RT-I686: "[[RESDIR]]{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}clang_rt.crtend-i386.o"
+// CHECK-LD-RT-I686: libclang_rt.builtins.a"
+// CHECK-LD-RT-I686: "[[RESDIR]]{{/|\\\\}}lib{{/|\\\\}}i686-unknown-linux{{/|\\\\}}clang_rt.crtend.o"
//
// RUN: %clang -### %s -no-pie 2>&1 \
// RUN: --target=arm-linux-androideabi \
diff --git a/clang/test/Driver/print-libgcc-file-name-clangrt.c b/clang/test/Driver/print-libgcc-file-name-clangrt.c
index 19f9a3c28c31..ed740e0d2917 100644
--- a/clang/test/Driver/print-libgcc-file-name-clangrt.c
+++ b/clang/test/Driver/print-libgcc-file-name-clangrt.c
@@ -55,7 +55,7 @@
// RUN: --sysroot=%S/Inputs/resource_dir_with_arch_subdir \
// RUN: -resource-dir=%S/Inputs/resource_dir_with_arch_subdir 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-CLANGRT-ARM-BAREMETAL %s
-// CHECK-CLANGRT-ARM-BAREMETAL: libclang_rt.builtins-armv7m.a
+// CHECK-CLANGRT-ARM-BAREMETAL: libclang_rt.builtins.a
// RUN: %clang -rtlib=compiler-rt -print-libgcc-file-name \
// RUN: --target=armv7m-vendor-none-eabi \
diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c
index 79ce70e19ff4..53e536d77292 100644
--- a/clang/test/Driver/sanitizer-ld.c
+++ b/clang/test/Driver/sanitizer-ld.c
@@ -8,9 +8,9 @@
//
// CHECK-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
// CHECK-ASAN-LINUX-NOT: "-lc"
-// CHECK-ASAN-LINUX: libclang_rt.asan-i386.a"
+// CHECK-ASAN-LINUX: libclang_rt.asan.a"
// CHECK-ASAN-LINUX-NOT: "--export-dynamic"
-// CHECK-ASAN-LINUX: "--dynamic-list={{.*}}libclang_rt.asan-i386.a.syms"
+// CHECK-ASAN-LINUX: "--dynamic-list={{.*}}libclang_rt.asan.a.syms"
// CHECK-ASAN-LINUX-NOT: "--export-dynamic"
// CHECK-ASAN-LINUX: "-lpthread"
// CHECK-ASAN-LINUX: "-lrt"
@@ -41,8 +41,8 @@
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-ASAN-EXECUTABLE-LINUX %s
//
-// CHECK-ASAN-EXECUTABLE-LINUX: libclang_rt.asan_static-x86_64
-// CHECK-ASAN-EXECUTABLE-LINUX: libclang_rt.asan-x86_64
+// CHECK-ASAN-EXECUTABLE-LINUX: libclang_rt.asan_static
+// CHECK-ASAN-EXECUTABLE-LINUX: libclang_rt.asan
// RUN: %clang -fsanitize=address -shared -### %s 2>&1 \
// RUN: --target=x86_64-unknown-linux -fuse-ld=ld \
@@ -50,8 +50,8 @@
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-ASAN-SHARED-LINUX %s
//
-// CHECK-ASAN-SHARED-LINUX: libclang_rt.asan_static-x86_64
-// CHECK-ASAN-SHARED-LINUX-NOT: libclang_rt.asan-x86_64
+// CHECK-ASAN-SHARED-LINUX: libclang_rt.asan_static
+// CHECK-ASAN-SHARED-LINUX-NOT: libclang_rt.asan
// RUN: %clang -### %s 2>&1 \
// RUN: --target=i386-unknown-linux -fuse-ld=ld -fsanitize=address -shared-libsan \
@@ -74,9 +74,9 @@
//
// CHECK-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
// CHECK-SHARED-ASAN-LINUX-NOT: "-lc"
-// CHECK-SHARED-ASAN-LINUX-NOT: libclang_rt.asan-i386.a"
-// CHECK-SHARED-ASAN-LINUX: libclang_rt.asan-i386.so"
-// CHECK-SHARED-ASAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan-preinit-i386.a" "--no-whole-archive"
+// CHECK-SHARED-ASAN-LINUX-NOT: libclang_rt.asan.a"
+// CHECK-SHARED-ASAN-LINUX: libclang_rt.asan.so"
+// CHECK-SHARED-ASAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan-preinit.a" "--no-whole-archive"
// CHECK-SHARED-ASAN-LINUX-NOT: "-lpthread"
// CHECK-SHARED-ASAN-LINUX-NOT: "-lrt"
// CHECK-SHARED-ASAN-LINUX-NOT: "-ldl"
@@ -92,9 +92,9 @@
//
// CHECK-DSO-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
// CHECK-DSO-SHARED-ASAN-LINUX-NOT: "-lc"
-// CHECK-DSO-SHARED-ASAN-LINUX-NOT: libclang_rt.asan-i386.a"
-// CHECK-DSO-SHARED-ASAN-LINUX-NOT: "libclang_rt.asan-preinit-i386.a"
-// CHECK-DSO-SHARED-ASAN-LINUX: libclang_rt.asan-i386.so"
+// CHECK-DSO-SHARED-ASAN-LINUX-NOT: libclang_rt.asan.a"
+// CHECK-DSO-SHARED-ASAN-LINUX-NOT: "libclang_rt.asan-preinit.a"
+// CHECK-DSO-SHARED-ASAN-LINUX: libclang_rt.asan.so"
// CHECK-DSO-SHARED-ASAN-LINUX-NOT: "-lpthread"
// CHECK-DSO-SHARED-ASAN-LINUX-NOT: "-lrt"
// CHECK-DSO-SHARED-ASAN-LINUX-NOT: "-ldl"
@@ -153,7 +153,7 @@
//
// CHECK-ASAN-LINUX-CXX-STATIC: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
// CHECK-ASAN-LINUX-CXX-STATIC-NOT: stdc++
-// CHECK-ASAN-LINUX-CXX-STATIC: "--whole-archive" "{{.*}}libclang_rt.asan-i386.a" "--no-whole-archive"
+// CHECK-ASAN-LINUX-CXX-STATIC: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
// CHECK-ASAN-LINUX-CXX-STATIC: stdc++
// RUN: %clang -### %s 2>&1 \
@@ -270,10 +270,10 @@
//
// CHECK-TSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
// CHECK-TSAN-LINUX-CXX-NOT: stdc++
-// CHECK-TSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan-x86_64.a" "--no-whole-archive"
-// CHECK-TSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.tsan-x86_64.a.syms"
-// CHECK-TSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan_cxx-x86_64.a" "--no-whole-archive"
-// CHECK-TSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.tsan_cxx-x86_64.a.syms"
+// CHECK-TSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan.a" "--no-whole-archive"
+// CHECK-TSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.tsan.a.syms"
+// CHECK-TSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan_cxx.a" "--no-whole-archive"
+// CHECK-TSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.tsan_cxx.a.syms"
// CHECK-TSAN-LINUX-CXX-NOT: "--export-dynamic"
// CHECK-TSAN-LINUX-CXX: stdc++
// CHECK-TSAN-LINUX-CXX: "-lpthread"
@@ -306,10 +306,10 @@
//
// CHECK-MSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
// CHECK-MSAN-LINUX-CXX-NOT: stdc++
-// CHECK-MSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan-x86_64.a" "--no-whole-archive"
-// CHECK-MSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.msan-x86_64.a.syms"
-// CHECK-MSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan_cxx-x86_64.a" "--no-whole-archive"
-// CHECK-MSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.msan_cxx-x86_64.a.syms"
+// CHECK-MSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive"
+// CHECK-MSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.msan.a.syms"
+// CHECK-MSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan_cxx.a" "--no-whole-archive"
+// CHECK-MSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.msan_cxx.a.syms"
// CHECK-MSAN-LINUX-CXX-NOT: "--export-dynamic"
// CHECK-MSAN-LINUX-CXX: stdc++
// CHECK-MSAN-LINUX-CXX: "-lpthread"
@@ -400,7 +400,7 @@
// RUN: | FileCheck --check-prefix=CHECK-UBSAN-LINUX-SHAREDLIBASAN %s
// CHECK-UBSAN-LINUX-SHAREDLIBASAN: "{{.*}}ld{{(.exe)?}}"
-// CHECK-UBSAN-LINUX-SHAREDLIBASAN: "{{.*}}libclang_rt.ubsan_standalone-i386.so{{.*}}"
+// CHECK-UBSAN-LINUX-SHAREDLIBASAN: "{{.*}}libclang_rt.ubsan_standalone.so{{.*}}"
// RUN: %clang -fsanitize=undefined -fsanitize-link-c++-runtime -### %s 2>&1 \
// RUN: --target=i386-unknown-linux \
@@ -408,7 +408,7 @@
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-UBSAN-LINUX-LINK-CXX %s
// CHECK-UBSAN-LINUX-LINK-CXX-NOT: "-lstdc++"
-// CHECK-UBSAN-LINUX-LINK-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone_cxx-i386.a" "--no-whole-archive"
+// CHECK-UBSAN-LINUX-LINK-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone_cxx.a" "--no-whole-archive"
// CHECK-UBSAN-LINUX-LINK-CXX-NOT: "-lstdc++"
// RUN: %clangxx -fsanitize=undefined -### %s 2>&1 \
@@ -418,9 +418,9 @@
// RUN: | FileCheck --check-prefix=CHECK-UBSAN-LINUX-CXX %s
// CHECK-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}"
// CHECK-UBSAN-LINUX-CXX-NOT: libclang_rt.asan
-// CHECK-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone-i386.a" "--no-whole-archive"
+// CHECK-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive"
// CHECK-UBSAN-LINUX-CXX-NOT: libclang_rt.asan
-// CHECK-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone_cxx-i386.a" "--no-whole-archive"
+// CHECK-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone_cxx.a" "--no-whole-archive"
// CHECK-UBSAN-LINUX-CXX-NOT: libclang_rt.asan
// CHECK-UBSAN-LINUX-CXX: "-lstdc++"
// CHECK-UBSAN-LINUX-CXX-NOT: libclang_rt.asan
@@ -433,7 +433,7 @@
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-UBSAN-MINIMAL-LINUX %s
// CHECK-UBSAN-MINIMAL-LINUX: "{{.*}}ld{{(.exe)?}}"
-// CHECK-UBSAN-MINIMAL-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_minimal-i386.a" "--no-whole-archive"
+// CHECK-UBSAN-MINIMAL-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_minimal.a" "--no-whole-archive"
// CHECK-UBSAN-MINIMAL-LINUX: "-lpthread"
// CHECK-UBSAN-MINIMAL-LINUX: "-lresolv"
@@ -468,7 +468,7 @@
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-ASAN-UBSAN-LINUX %s
// CHECK-ASAN-UBSAN-LINUX: "{{.*}}ld{{(.exe)?}}"
-// CHECK-ASAN-UBSAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan-i386.a" "--no-whole-archive"
+// CHECK-ASAN-UBSAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
// CHECK-ASAN-UBSAN-LINUX-NOT: libclang_rt.ubsan
// CHECK-ASAN-UBSAN-LINUX-NOT: "-lstdc++"
// CHECK-ASAN-UBSAN-LINUX: "-lpthread"
@@ -480,8 +480,8 @@
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-ASAN-UBSAN-LINUX-CXX %s
// CHECK-ASAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}"
-// CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan-i386.a" "--no-whole-archive"
-// CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan_cxx-i386.a" "--no-whole-archive"
+// CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
+// CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan_cxx.a" "--no-whole-archive"
// CHECK-ASAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan
// CHECK-ASAN-UBSAN-LINUX-CXX: "-lstdc++"
// CHECK-ASAN-UBSAN-LINUX-CXX: "-lpthread"
@@ -493,7 +493,7 @@
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-MSAN-UBSAN-LINUX-CXX %s
// CHECK-MSAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}"
-// CHECK-MSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan-x86_64.a" "--no-whole-archive"
+// CHECK-MSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive"
// CHECK-MSAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan
// RUN: %clangxx -fsanitize=thread,undefined -### %s 2>&1 \
@@ -502,7 +502,7 @@
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-TSAN-UBSAN-LINUX-CXX %s
// CHECK-TSAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}"
-// CHECK-TSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan-x86_64.a" "--no-whole-archive"
+// CHECK-TSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan.a" "--no-whole-archive"
// CHECK-TSAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan
// RUN: %clang -fsanitize=undefined -### %s 2>&1 \
@@ -525,7 +525,7 @@
// CHECK-LSAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
// CHECK-LSAN-LINUX-NOT: "-lc"
// CHECK-LSAN-LINUX-NOT: libclang_rt.ubsan
-// CHECK-LSAN-LINUX: libclang_rt.lsan-x86_64.a"
+// CHECK-LSAN-LINUX: libclang_rt.lsan.a"
// CHECK-LSAN-LINUX: "-lpthread"
// CHECK-LSAN-LINUX: "-ldl"
// CHECK-LSAN-LINUX: "-lresolv"
@@ -560,7 +560,7 @@
// RUN: | FileCheck --check-prefix=CHECK-LSAN-ASAN-LINUX %s
// CHECK-LSAN-ASAN-LINUX: "{{.*}}ld{{(.exe)?}}"
// CHECK-LSAN-ASAN-LINUX-NOT: libclang_rt.lsan
-// CHECK-LSAN-ASAN-LINUX: libclang_rt.asan-x86_64
+// CHECK-LSAN-ASAN-LINUX: libclang_rt.asan
// CHECK-LSAN-ASAN-LINUX-NOT: libclang_rt.lsan
// RUN: %clang -fsanitize=address -fsanitize-coverage=func -### %s 2>&1 \
@@ -569,7 +569,7 @@
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-ASAN-COV-LINUX %s
// CHECK-ASAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
-// CHECK-ASAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan-x86_64.a" "--no-whole-archive"
+// CHECK-ASAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive"
// CHECK-ASAN-COV-LINUX-NOT: libclang_rt.ubsan
// CHECK-ASAN-COV-LINUX-NOT: "-lstdc++"
// CHECK-ASAN-COV-LINUX: "-lpthread"
@@ -581,7 +581,7 @@
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-MSAN-COV-LINUX %s
// CHECK-MSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
-// CHECK-MSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.msan-x86_64.a" "--no-whole-archive"
+// CHECK-MSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive"
// CHECK-MSAN-COV-LINUX-NOT: libclang_rt.ubsan
// CHECK-MSAN-COV-LINUX-NOT: "-lstdc++"
// CHECK-MSAN-COV-LINUX: "-lpthread"
@@ -593,7 +593,7 @@
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-DFSAN-COV-LINUX %s
// CHECK-DFSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
-// CHECK-DFSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.dfsan-x86_64.a" "--no-whole-archive"
+// CHECK-DFSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.dfsan.a" "--no-whole-archive"
// CHECK-DFSAN-COV-LINUX-NOT: libclang_rt.ubsan
// CHECK-DFSAN-COV-LINUX-NOT: "-lstdc++"
// CHECK-DFSAN-COV-LINUX: "-lpthread"
@@ -605,7 +605,7 @@
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-UBSAN-COV-LINUX %s
// CHECK-UBSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
-// CHECK-UBSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone-x86_64.a" "--no-whole-archive"
+// CHECK-UBSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive"
// CHECK-UBSAN-COV-LINUX-NOT: "-lstdc++"
// CHECK-UBSAN-COV-LINUX: "-lpthread"
// CHECK-UBSAN-COV-LINUX: "-lresolv"
@@ -616,7 +616,7 @@
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-COV-LINUX %s
// CHECK-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
-// CHECK-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone-x86_64.a" "--no-whole-archive"
+// CHECK-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive"
// CHECK-COV-LINUX-NOT: "-lstdc++"
// CHECK-COV-LINUX: "-lpthread"
// CHECK-COV-LINUX: "-lresolv"
@@ -638,7 +638,7 @@
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-CFI-DIAG-LINUX %s
// CHECK-CFI-DIAG-LINUX: "{{.*}}ld{{(.exe)?}}"
-// CHECK-CFI-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.ubsan_standalone-x86_64.a" "--no-whole-archive"
+// CHECK-CFI-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive"
// Cross-DSO CFI links the CFI runtime.
// RUN: not %clang -fsanitize=cfi -fsanitize-cfi-cross-dso -### %s 2>&1 \
@@ -647,7 +647,7 @@
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-CFI-CROSS-DSO-LINUX %s
// CHECK-CFI-CROSS-DSO-LINUX: "{{.*}}ld{{(.exe)?}}"
-// CHECK-CFI-CROSS-DSO-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.cfi-x86_64.a" "--no-whole-archive"
+// CHECK-CFI-CROSS-DSO-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.cfi.a" "--no-whole-archive"
// CHECK-CFI-CROSS-DSO-LINUX: -export-dynamic
// Cross-DSO CFI with diagnostics links just the CFI runtime.
@@ -658,7 +658,7 @@
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-CFI-CROSS-DSO-DIAG-LINUX %s
// CHECK-CFI-CROSS-DSO-DIAG-LINUX: "{{.*}}ld{{(.exe)?}}"
-// CHECK-CFI-CROSS-DSO-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.cfi_diag-x86_64.a" "--no-whole-archive"
+// CHECK-CFI-CROSS-DSO-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.cfi_diag.a" "--no-whole-archive"
// CHECK-CFI-CROSS-DSO-DIAG-LINUX: -export-dynamic
// Cross-DSO CFI on Android does not link runtime libraries.
@@ -710,7 +710,7 @@
// CHECK-SAFESTACK-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
// CHECK-SAFESTACK-LINUX-NOT: "-lc"
// CHECK-SAFESTACK-LINUX-NOT: whole-archive
-// CHECK-SAFESTACK-LINUX: libclang_rt.safestack-x86_64.a"
+// CHECK-SAFESTACK-LINUX: libclang_rt.safestack.a"
// CHECK-SAFESTACK-LINUX: "-u" "__safestack_init"
// CHECK-SAFESTACK-LINUX: "-lpthread"
// CHECK-SAFESTACK-LINUX: "-ldl"
@@ -772,9 +772,9 @@
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-CFI-STATS-LINUX %s
// CHECK-CFI-STATS-LINUX: "{{.*}}ld{{(.exe)?}}"
-// CHECK-CFI-STATS-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.stats_client-x86_64.a" "--no-whole-archive"
+// CHECK-CFI-STATS-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.stats_client.a" "--no-whole-archive"
// CHECK-CFI-STATS-LINUX-NOT: "--whole-archive"
-// CHECK-CFI-STATS-LINUX: "{{[^"]*}}libclang_rt.stats-x86_64.a"
+// CHECK-CFI-STATS-LINUX: "{{[^"]*}}libclang_rt.stats.a"
// RUN: not %clang -fsanitize=cfi -fsanitize-stats -### %s 2>&1 \
// RUN: --target=x86_64-apple-darwin -fuse-ld=ld \
@@ -896,7 +896,7 @@
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-SCUDO-LINUX %s
// CHECK-SCUDO-LINUX: "{{.*}}ld{{(.exe)?}}"
-// CHECK-SCUDO-LINUX: "--whole-archive" "{{.*}}libclang_rt.scudo_standalone-i386.a" "--no-whole-archive"
+// CHECK-SCUDO-LINUX: "--whole-archive" "{{.*}}libclang_rt.scudo_standalone.a" "--no-whole-archive"
// CHECK-SCUDO-LINUX-NOT: "-lstdc++"
// CHECK-SCUDO-LINUX: "-lpthread"
// CHECK-SCUDO-LINUX: "-ldl"
@@ -910,8 +910,8 @@
//
// CHECK-SCUDO-SHARED-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
// CHECK-SCUDO-SHARED-LINUX-NOT: "-lc"
-// CHECK-SCUDO-SHARED-LINUX-NOT: libclang_rt.scudo_standalone-i386.a"
-// CHECK-SCUDO-SHARED-LINUX: libclang_rt.scudo_standalone-i386.so"
+// CHECK-SCUDO-SHARED-LINUX-NOT: libclang_rt.scudo_standalone.a"
+// CHECK-SCUDO-SHARED-LINUX: libclang_rt.scudo_standalone.so"
// CHECK-SCUDO-SHARED-LINUX-NOT: "-lpthread"
// CHECK-SCUDO-SHARED-LINUX-NOT: "-lrt"
// CHECK-SCUDO-SHARED-LINUX-NOT: "-ldl"
@@ -954,9 +954,9 @@
//
// CHECK-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
// CHECK-HWASAN-X86-64-LINUX-NOT: "-lc"
-// CHECK-HWASAN-X86-64-LINUX: libclang_rt.hwasan-x86_64.a"
+// CHECK-HWASAN-X86-64-LINUX: libclang_rt.hwasan.a"
// CHECK-HWASAN-X86-64-LINUX-NOT: "--export-dynamic"
-// CHECK-HWASAN-X86-64-LINUX: "--dynamic-list={{.*}}libclang_rt.hwasan-x86_64.a.syms"
+// CHECK-HWASAN-X86-64-LINUX: "--dynamic-list={{.*}}libclang_rt.hwasan.a.syms"
// CHECK-HWASAN-X86-64-LINUX-NOT: "--export-dynamic"
// CHECK-HWASAN-X86-64-LINUX: "-lpthread"
// CHECK-HWASAN-X86-64-LINUX: "-lrt"
@@ -971,7 +971,7 @@
//
// CHECK-SHARED-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
// CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: "-lc"
-// CHECK-SHARED-HWASAN-X86-64-LINUX: libclang_rt.hwasan-x86_64.so"
+// CHECK-SHARED-HWASAN-X86-64-LINUX: libclang_rt.hwasan.so"
// CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: "-lpthread"
// CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: "-lrt"
// CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: "-ldl"
@@ -987,7 +987,7 @@
//
// CHECK-DSO-SHARED-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
// CHECK-DSO-SHARED-HWASAN-X86-64-LINUX-NOT: "-lc"
-// CHECK-DSO-SHARED-HWASAN-X86-64-LINUX: libclang_rt.hwasan-x86_64.so"
+// CHECK-DSO-SHARED-HWASAN-X86-64-LINUX: libclang_rt.hwasan.so"
// CHECK-DSO-SHARED-HWASAN-X86-64-LINUX-NOT: "-lpthread"
// CHECK-DSO-SHARED-HWASAN-X86-64-LINUX-NOT: "-lrt"
// CHECK-DSO-SHARED-HWASAN-X86-64-LINUX-NOT: "-ldl"
@@ -1003,9 +1003,9 @@
//
// CHECK-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
// CHECK-HWASAN-AARCH64-LINUX-NOT: "-lc"
-// CHECK-HWASAN-AARCH64-LINUX: libclang_rt.hwasan-aarch64.a"
+// CHECK-HWASAN-AARCH64-LINUX: libclang_rt.hwasan.a"
// CHECK-HWASAN-AARCH64-LINUX-NOT: "--export-dynamic"
-// CHECK-HWASAN-AARCH64-LINUX: "--dynamic-list={{.*}}libclang_rt.hwasan-aarch64.a.syms"
+// CHECK-HWASAN-AARCH64-LINUX: "--dynamic-list={{.*}}libclang_rt.hwasan.a.syms"
// CHECK-HWASAN-AARCH64-LINUX-NOT: "--export-dynamic"
// CHECK-HWASAN-AARCH64-LINUX: "-lpthread"
// CHECK-HWASAN-AARCH64-LINUX: "-lrt"
@@ -1021,7 +1021,7 @@
//
// CHECK-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
// CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lc"
-// CHECK-SHARED-HWASAN-AARCH64-LINUX: libclang_rt.hwasan-aarch64.so"
+// CHECK-SHARED-HWASAN-AARCH64-LINUX: libclang_rt.hwasan.so"
// CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lpthread"
// CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lrt"
// CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-ldl"
@@ -1037,7 +1037,7 @@
//
// CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
// CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lc"
-// CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX: libclang_rt.hwasan-aarch64.so"
+// CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX: libclang_rt.hwasan.so"
// CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lpthread"
// CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lrt"
// CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX-NOT: "-ldl"
diff --git a/clang/test/Frontend/fixed_point_declarations.c b/clang/test/Frontend/fixed_point_declarations.c
index 04ed25d227ca..c80457674427 100644
--- a/clang/test/Frontend/fixed_point_declarations.c
+++ b/clang/test/Frontend/fixed_point_declarations.c
@@ -125,3 +125,21 @@ long _Fract long_fract_zero = 0.0lr; // CHECK-DAG: @long_fract_
unsigned short _Fract u_short_fract_zero = 0.0uhr; // CHECK-DAG: @u_short_fract_zero = {{.*}}global i8 0, align 1
unsigned _Fract u_fract_zero = 0.0ur; // CHECK-DAG: @u_fract_zero = {{.*}}global i16 0, align 2
unsigned long _Fract u_long_fract_zero = 0.0ulr; // CHECK-DAG: @u_long_fract_zero = {{.*}}global i32 0, align 4
+
+// Hex exponent suffix with E in hex digit sequence.
+unsigned short _Fract e1 = 0x1.e8p-1uhr; // CHECK-DAG: @e1 = {{.*}}global i8 -12
+unsigned short _Fract e2 = 0x1.8ep-1uhr; // CHECK-DAG: @e2 = {{.*}}global i8 -57
+unsigned short _Fract e3 = 0x1.ep-1uhr; // CHECK-DAG: @e3 = {{.*}}global i8 -16
+unsigned _Accum e4 = 0xep-1uk; // CHECK-DAG: @e4 = {{.*}}global i32 458752
+unsigned _Accum e5 = 0xe.1p-1uk; // CHECK-DAG: @e5 = {{.*}}global i32 460800
+unsigned _Accum e6 = 0xe.ep-1uk; // CHECK-DAG: @e6 = {{.*}}global i32 487424
+unsigned _Accum e7 = 0xe.e8p-1uk; // CHECK-DAG: @e7 = {{.*}}global i32 488448
+unsigned _Accum e8 = 0xe.8ep-1uk; // CHECK-DAG: @e8 = {{.*}}global i32 476928
+unsigned short _Fract E1 = 0x1.E8p-1uhr; // CHECK-DAG: @E1 = {{.*}}global i8 -12
+unsigned short _Fract E2 = 0x1.8Ep-1uhr; // CHECK-DAG: @E2 = {{.*}}global i8 -57
+unsigned short _Fract E3 = 0x1.Ep-1uhr; // CHECK-DAG: @E3 = {{.*}}global i8 -16
+unsigned _Accum E4 = 0xEp-1uk; // CHECK-DAG: @E4 = {{.*}}global i32 458752
+unsigned _Accum E5 = 0xE.1p-1uk; // CHECK-DAG: @E5 = {{.*}}global i32 460800
+unsigned _Accum E6 = 0xE.Ep-1uk; // CHECK-DAG: @E6 = {{.*}}global i32 487424
+unsigned _Accum E7 = 0xE.E8p-1uk; // CHECK-DAG: @E7 = {{.*}}global i32 488448
+unsigned _Accum E8 = 0xE.8Ep-1uk; // CHECK-DAG: @E8 = {{.*}}global i32 476928
diff --git a/clang/test/InstallAPI/basic.test b/clang/test/InstallAPI/basic.test
index 22b04792ca2c..5b41ccd517b0 100644
--- a/clang/test/InstallAPI/basic.test
+++ b/clang/test/InstallAPI/basic.test
@@ -16,6 +16,11 @@
// CHECK-NOT: warning:
//--- basic_inputs.json
+{
+ "headers": [
+ ],
+ "version": "3"
+}
//--- expected.tbd
{
diff --git a/clang/test/InstallAPI/variables.test b/clang/test/InstallAPI/variables.test
new file mode 100644
index 000000000000..6272867911f1
--- /dev/null
+++ b/clang/test/InstallAPI/variables.test
@@ -0,0 +1,63 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: sed -e "s|SRC_DIR|%/t|g" %t/vars_inputs.json.in > %t/vars_inputs.json
+
+/// Check multiple targets are captured.
+// RUN: clang-installapi -target arm64-apple-macos13.1 -target arm64e-apple-macos13.1 \
+// RUN: -fapplication-extension -install_name /usr/lib/vars.dylib \
+// RUN: %t/vars_inputs.json -o %t/vars.tbd 2>&1 | FileCheck %s --allow-empty
+// RUN: llvm-readtapi -compare %t/vars.tbd %t/expected.tbd 2>&1 | FileCheck %s --allow-empty
+
+// CHECK-NOT: error:
+// CHECK-NOT: warning:
+
+//--- vars.h
+extern int foo;
+
+//--- vars_inputs.json.in
+{
+ "headers": [ {
+ "path" : "SRC_DIR/vars.h",
+ "type" : "public"
+ }],
+ "version": "3"
+}
+
+//--- expected.tbd
+{
+ "main_library": {
+ "compatibility_versions": [
+ {
+ "version": "0"
+ }],
+ "current_versions": [
+ {
+ "version": "0"
+ }],
+ "install_names": [
+ {
+ "name": "/usr/lib/vars.dylib"
+ }
+ ],
+ "exported_symbols": [
+ {
+ "data": {
+ "global": [
+ "_foo"
+ ]
+ }
+ }
+ ],
+ "target_info": [
+ {
+ "min_deployment": "13.1",
+ "target": "arm64-macos"
+ },
+ {
+ "min_deployment": "13.1",
+ "target": "arm64e-macos"
+ }
+ ]
+ },
+ "tapi_tbd_version": 5
+}
diff --git a/clang/test/Sema/builtin-popcountg.c b/clang/test/Sema/builtin-popcountg.c
new file mode 100644
index 000000000000..e18b910046ff
--- /dev/null
+++ b/clang/test/Sema/builtin-popcountg.c
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -triple=x86_64-pc-linux-gnu -fsyntax-only -verify -Wpedantic %s
+
+typedef int int2 __attribute__((ext_vector_type(2)));
+
+void test_builtin_popcountg(int i, double d, int2 i2) {
+ __builtin_popcountg();
+ // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
+ __builtin_popcountg(i, i);
+ // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
+ __builtin_popcountg(d);
+ // expected-error@-1 {{1st argument must be a type of integer (was 'double')}}
+ __builtin_popcountg(i2);
+ // expected-error@-1 {{1st argument must be a type of integer (was 'int2' (vector of 2 'int' values))}}
+}
diff --git a/clang/test/Sema/builtins-elementwise-math.c b/clang/test/Sema/builtins-elementwise-math.c
index e7b36285fa7d..2e05337273ee 100644
--- a/clang/test/Sema/builtins-elementwise-math.c
+++ b/clang/test/Sema/builtins-elementwise-math.c
@@ -76,6 +76,7 @@ void test_builtin_elementwise_add_sat(int i, short s, double d, float4 v, int3 i
enum f { three };
enum f x = __builtin_elementwise_add_sat(one, three);
+ // expected-warning@-1 {{comparison of different enumeration types ('enum e' and 'enum f')}}
_BitInt(32) ext; // expected-warning {{'_BitInt' in C17 and earlier is a Clang extension}}
ext = __builtin_elementwise_add_sat(ext, ext);
@@ -134,6 +135,7 @@ void test_builtin_elementwise_sub_sat(int i, short s, double d, float4 v, int3 i
enum f { three };
enum f x = __builtin_elementwise_sub_sat(one, three);
+ // expected-warning@-1 {{comparison of different enumeration types ('enum e' and 'enum f')}}
_BitInt(32) ext; // expected-warning {{'_BitInt' in C17 and earlier is a Clang extension}}
ext = __builtin_elementwise_sub_sat(ext, ext);
@@ -189,6 +191,7 @@ void test_builtin_elementwise_max(int i, short s, double d, float4 v, int3 iv, u
enum f { three };
enum f x = __builtin_elementwise_max(one, three);
+ // expected-warning@-1 {{comparison of different enumeration types ('enum e' and 'enum f')}}
_BitInt(32) ext; // expected-warning {{'_BitInt' in C17 and earlier is a Clang extension}}
ext = __builtin_elementwise_max(ext, ext);
@@ -244,6 +247,7 @@ void test_builtin_elementwise_min(int i, short s, double d, float4 v, int3 iv, u
enum f { three };
enum f x = __builtin_elementwise_min(one, three);
+ // expected-warning@-1 {{comparison of different enumeration types ('enum e' and 'enum f')}}
_BitInt(32) ext; // expected-warning {{'_BitInt' in C17 and earlier is a Clang extension}}
ext = __builtin_elementwise_min(ext, ext);
diff --git a/clang/test/Sema/format-fixed-point.c b/clang/test/Sema/format-fixed-point.c
new file mode 100644
index 000000000000..47b22f1a7a5f
--- /dev/null
+++ b/clang/test/Sema/format-fixed-point.c
@@ -0,0 +1,148 @@
+// RUN: %clang_cc1 -ffixed-point -fsyntax-only -verify -Wformat -isystem %S/Inputs %s
+// RUN: %clang_cc1 -fsyntax-only -verify -Wformat -isystem %S/Inputs %s -DWITHOUT_FIXED_POINT
+
+int printf(const char *restrict, ...);
+
+short s;
+unsigned short us;
+int i;
+unsigned int ui;
+long l;
+unsigned long ul;
+float fl;
+double d;
+char c;
+unsigned char uc;
+
+#ifndef WITHOUT_FIXED_POINT
+short _Fract sf;
+_Fract f;
+long _Fract lf;
+unsigned short _Fract usf;
+unsigned _Fract uf;
+unsigned long _Fract ulf;
+short _Accum sa;
+_Accum a;
+long _Accum la;
+unsigned short _Accum usa;
+unsigned _Accum ua;
+unsigned long _Accum ula;
+_Sat short _Fract sat_sf;
+_Sat _Fract sat_f;
+_Sat long _Fract sat_lf;
+_Sat unsigned short _Fract sat_usf;
+_Sat unsigned _Fract sat_uf;
+_Sat unsigned long _Fract sat_ulf;
+_Sat short _Accum sat_sa;
+_Sat _Accum sat_a;
+_Sat long _Accum sat_la;
+_Sat unsigned short _Accum sat_usa;
+_Sat unsigned _Accum sat_ua;
+_Sat unsigned long _Accum sat_ula;
+
+void test_invalid_args(void) {
+ /// None of these should match against a fixed point type.
+ printf("%r", s); // expected-warning{{format specifies type '_Fract' but the argument has type 'short'}}
+ printf("%r", us); // expected-warning{{format specifies type '_Fract' but the argument has type 'unsigned short'}}
+ printf("%r", i); // expected-warning{{format specifies type '_Fract' but the argument has type 'int'}}
+ printf("%r", ui); // expected-warning{{format specifies type '_Fract' but the argument has type 'unsigned int'}}
+ printf("%r", l); // expected-warning{{format specifies type '_Fract' but the argument has type 'long'}}
+ printf("%r", ul); // expected-warning{{format specifies type '_Fract' but the argument has type 'unsigned long'}}
+ printf("%r", fl); // expected-warning{{format specifies type '_Fract' but the argument has type 'float'}}
+ printf("%r", d); // expected-warning{{format specifies type '_Fract' but the argument has type 'double'}}
+ printf("%r", c); // expected-warning{{format specifies type '_Fract' but the argument has type 'char'}}
+ printf("%r", uc); // expected-warning{{format specifies type '_Fract' but the argument has type 'unsigned char'}}
+}
+
+void test_fixed_point_specifiers(void) {
+ printf("%r", f);
+ printf("%R", uf);
+ printf("%k", a);
+ printf("%K", ua);
+
+ /// Test different sizes.
+ printf("%r", sf); // expected-warning{{format specifies type '_Fract' but the argument has type 'short _Fract'}}
+ printf("%r", lf); // expected-warning{{format specifies type '_Fract' but the argument has type 'long _Fract'}}
+ printf("%R", usf); // expected-warning{{format specifies type 'unsigned _Fract' but the argument has type 'unsigned short _Fract'}}
+ printf("%R", ulf); // expected-warning{{format specifies type 'unsigned _Fract' but the argument has type 'unsigned long _Fract'}}
+ printf("%k", sa); // expected-warning{{format specifies type '_Accum' but the argument has type 'short _Accum'}}
+ printf("%k", la); // expected-warning{{format specifies type '_Accum' but the argument has type 'long _Accum'}}
+ printf("%K", usa); // expected-warning{{format specifies type 'unsigned _Accum' but the argument has type 'unsigned short _Accum'}}
+ printf("%K", ula); // expected-warning{{format specifies type 'unsigned _Accum' but the argument has type 'unsigned long _Accum'}}
+
+ /// Test signs.
+ printf("%r", uf); // expected-warning{{format specifies type '_Fract' but the argument has type 'unsigned _Fract'}}
+ printf("%R", f); // expected-warning{{format specifies type 'unsigned _Fract' but the argument has type '_Fract'}}
+ printf("%k", ua); // expected-warning{{format specifies type '_Accum' but the argument has type 'unsigned _Accum'}}
+ printf("%K", a); // expected-warning{{format specifies type 'unsigned _Accum' but the argument has type '_Accum'}}
+
+ /// Test between types.
+ printf("%r", a); // expected-warning{{format specifies type '_Fract' but the argument has type '_Accum'}}
+ printf("%R", ua); // expected-warning{{format specifies type 'unsigned _Fract' but the argument has type 'unsigned _Accum'}}
+ printf("%k", f); // expected-warning{{format specifies type '_Accum' but the argument has type '_Fract'}}
+ printf("%K", uf); // expected-warning{{format specifies type 'unsigned _Accum' but the argument has type 'unsigned _Fract'}}
+
+ /// Test saturated types.
+ printf("%r", sat_f);
+ printf("%R", sat_uf);
+ printf("%k", sat_a);
+ printf("%K", sat_ua);
+}
+
+void test_length_modifiers_and_flags(void) {
+ printf("%hr", sf);
+ printf("%lr", lf);
+ printf("%hR", usf);
+ printf("%lR", ulf);
+ printf("%hk", sa);
+ printf("%lk", la);
+ printf("%hK", usa);
+ printf("%lK", ula);
+
+ printf("%hr", sat_sf);
+ printf("%lr", sat_lf);
+ printf("%hR", sat_usf);
+ printf("%lR", sat_ulf);
+ printf("%hk", sat_sa);
+ printf("%lk", sat_la);
+ printf("%hK", sat_usa);
+ printf("%lK", sat_ula);
+
+ printf("%10r", f);
+ printf("%10.10r", f);
+ printf("%010r", f);
+ printf("%-10r", f);
+ printf("%.10r", f);
+ printf("%+r", f);
+ printf("% r", f);
+ printf("%#r", f);
+ printf("%#.r", f);
+ printf("%#.0r", f);
+
+ /// Test some invalid length modifiers.
+ printf("%zr", f); // expected-warning{{length modifier 'z' results in undefined behavior or no effect with 'r' conversion specifier}}
+ printf("%llr", f); // expected-warning{{length modifier 'll' results in undefined behavior or no effect with 'r' conversion specifier}}
+ printf("%hhr", f); // expected-warning{{length modifier 'hh' results in undefined behavior or no effect with 'r' conversion specifier}}
+
+ // + on an unsigned fixed point type.
+ printf("%+hR", usf); // expected-warning{{flag '+' results in undefined behavior with 'R' conversion specifier}}
+ printf("%+R", uf); // expected-warning{{flag '+' results in undefined behavior with 'R' conversion specifier}}
+ printf("%+lR", ulf); // expected-warning{{flag '+' results in undefined behavior with 'R' conversion specifier}}
+ printf("%+hK", usa); // expected-warning{{flag '+' results in undefined behavior with 'K' conversion specifier}}
+ printf("%+K", ua); // expected-warning{{flag '+' results in undefined behavior with 'K' conversion specifier}}
+ printf("%+lK", ula); // expected-warning{{flag '+' results in undefined behavior with 'K' conversion specifier}}
+ printf("% hR", usf); // expected-warning{{flag ' ' results in undefined behavior with 'R' conversion specifier}}
+ printf("% R", uf); // expected-warning{{flag ' ' results in undefined behavior with 'R' conversion specifier}}
+ printf("% lR", ulf); // expected-warning{{flag ' ' results in undefined behavior with 'R' conversion specifier}}
+ printf("% hK", usa); // expected-warning{{flag ' ' results in undefined behavior with 'K' conversion specifier}}
+ printf("% K", ua); // expected-warning{{flag ' ' results in undefined behavior with 'K' conversion specifier}}
+ printf("% lK", ula); // expected-warning{{flag ' ' results in undefined behavior with 'K' conversion specifier}}
+}
+#else
+void test_fixed_point_specifiers_no_printf() {
+ printf("%k", i); // expected-warning{{invalid conversion specifier 'k'}}
+ printf("%K", i); // expected-warning{{invalid conversion specifier 'K'}}
+ printf("%r", i); // expected-warning{{invalid conversion specifier 'r'}}
+ printf("%R", i); // expected-warning{{invalid conversion specifier 'R'}}
+}
+#endif // WITHOUT_FIXED_POINT
diff --git a/clang/test/Sema/inline-asm-validate-mips.c b/clang/test/Sema/inline-asm-validate-mips.c
new file mode 100644
index 000000000000..7da248fe417b
--- /dev/null
+++ b/clang/test/Sema/inline-asm-validate-mips.c
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 -triple mips64 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple mips64 -target-feature +soft-float -fsyntax-only -verify=softfloat %s
+
+// expected-no-diagnostics
+
+void test_f(float p) {
+ float result = p;
+ __asm__("" :: "f"(result)); // softfloat-error{{invalid input constraint 'f' in asm}}
+}
diff --git a/clang/test/Sema/warn-compare-enum-types-mismatch.c b/clang/test/Sema/warn-compare-enum-types-mismatch.c
new file mode 100644
index 000000000000..2b72aae16b97
--- /dev/null
+++ b/clang/test/Sema/warn-compare-enum-types-mismatch.c
@@ -0,0 +1,42 @@
+// RUN: %clang_cc1 -x c -fsyntax-only -verify -Wenum-compare -Wno-unused-comparison %s
+// RUN: %clang_cc1 -x c++ -fsyntax-only -verify -Wenum-compare -Wno-unused-comparison %s
+
+typedef enum EnumA {
+ A
+} EnumA;
+
+enum EnumB {
+ B
+};
+
+enum {
+ C
+};
+
+void foo(void) {
+ enum EnumA a = A;
+ enum EnumB b = B;
+ A == B;
+ // expected-warning@-1 {{comparison of different enumeration types}}
+ a == (B);
+ // expected-warning@-1 {{comparison of different enumeration types}}
+ a == b;
+ // expected-warning@-1 {{comparison of different enumeration types}}
+ A > B;
+ // expected-warning@-1 {{comparison of different enumeration types}}
+ A >= b;
+ // expected-warning@-1 {{comparison of different enumeration types}}
+ a > b;
+ // expected-warning@-1 {{comparison of different enumeration types}}
+ (A) <= ((B));
+ // expected-warning@-1 {{comparison of different enumeration types}}
+ a < B;
+ // expected-warning@-1 {{comparison of different enumeration types}}
+ a < b;
+ // expected-warning@-1 {{comparison of different enumeration types}}
+
+  // In the following cases we purposefully differ from GCC and don't warn
+ a == C;
+ A < C;
+ b >= C;
+}
diff --git a/clang/test/Sema/warn-conditional-emum-types-mismatch.c b/clang/test/Sema/warn-conditional-enum-types-mismatch.c
index c9e2eddc7764..f039245b6fab 100644
--- a/clang/test/Sema/warn-conditional-emum-types-mismatch.c
+++ b/clang/test/Sema/warn-conditional-enum-types-mismatch.c
@@ -21,7 +21,7 @@ int get_flag(int cond) {
#ifdef __cplusplus
// expected-warning@-2 {{conditional expression between different enumeration types ('ro' and 'rw')}}
#else
- // expected-no-diagnostics
+ // expected-warning@-4 {{conditional expression between different enumeration types ('enum ro' and 'enum rw')}}
#endif
}
diff --git a/clang/test/Sema/warn-overlap.c b/clang/test/Sema/warn-overlap.c
index 1eddfd1077fd..2db07ebcd17b 100644
--- a/clang/test/Sema/warn-overlap.c
+++ b/clang/test/Sema/warn-overlap.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -Wtautological-overlap-compare %s
-// RUN: %clang_cc1 -fsyntax-only -verify -Wall -Wno-unused -Wno-loop-analysis %s
+// RUN: %clang_cc1 -fsyntax-only -verify -Wtautological-overlap-compare -Wno-enum-compare %s
+// RUN: %clang_cc1 -fsyntax-only -verify -Wall -Wno-unused -Wno-loop-analysis -Wno-enum-compare %s
#define mydefine 2
diff --git a/clang/test/SemaCXX/gh53815.cpp b/clang/test/SemaCXX/gh53815.cpp
new file mode 100644
index 000000000000..326c911c7bfa
--- /dev/null
+++ b/clang/test/SemaCXX/gh53815.cpp
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++20 %s
+// expected-no-diagnostics
+
+// Check that we don't crash due to forgetting to check for placeholders
+// in the RHS of '.*'.
+
+template <typename Fn>
+static bool has_explicitly_named_overload() {
+ return requires { Fn().*&Fn::operator(); };
+}
+
+int main() {
+ has_explicitly_named_overload<decltype([](auto){})>();
+}
+
+template <typename Fn>
+constexpr bool has_explicitly_named_overload_2() {
+ return requires { Fn().*&Fn::operator(); };
+}
+
+static_assert(!has_explicitly_named_overload_2<decltype([](auto){})>());
diff --git a/clang/test/SemaHLSL/VectorOverloadResolution.hlsl b/clang/test/SemaHLSL/VectorOverloadResolution.hlsl
index e07391f803f8..81fedc2de315 100644
--- a/clang/test/SemaHLSL/VectorOverloadResolution.hlsl
+++ b/clang/test/SemaHLSL/VectorOverloadResolution.hlsl
@@ -1,4 +1,5 @@
// RUN: %clang_cc1 -triple dxil-unknown-shadermodel6.6-library -S -fnative-half-type -finclude-default-header -o - -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s --check-prefixes=CHECKIR
void Fn(double2 D);
void Fn(half2 H);
@@ -28,3 +29,46 @@ void Fn2(int16_t2 S);
void Call2(int2 I) {
Fn2(I);
}
+
+void Fn3( int64_t2 p0);
+
+// CHECK: FunctionDecl {{.*}} Call3 'void (half2)'
+// CHECK: CallExpr {{.*}} 'void'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(int64_t2)' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'void (int64_t2)' lvalue Function {{.*}} 'Fn3' 'void (int64_t2)'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int64_t2':'long __attribute__((ext_vector_type(2)))' <FloatingToIntegral>
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'half2':'half __attribute__((ext_vector_type(2)))' <LValueToRValue>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'half2':'half __attribute__((ext_vector_type(2)))' lvalue ParmVar {{.*}} 'p0' 'half2':'half __attribute__((ext_vector_type(2)))'
+// CHECKIR-LABEL: Call3
+// CHECKIR: %conv = fptosi <2 x half> {{.*}} to <2 x i64>
+void Call3(half2 p0) {
+ Fn3(p0);
+}
+
+// CHECK: FunctionDecl {{.*}} Call4 'void (float2)'
+// CHECK: CallExpr {{.*}} 'void'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(int64_t2)' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'void (int64_t2)' lvalue Function {{.*}} 'Fn3' 'void (int64_t2)'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int64_t2':'long __attribute__((ext_vector_type(2)))' <FloatingToIntegral>
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float2':'float __attribute__((ext_vector_type(2)))' <LValueToRValue>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'float2':'float __attribute__((ext_vector_type(2)))' lvalue ParmVar {{.*}} 'p0' 'float2':'float __attribute__((ext_vector_type(2)))'
+// CHECKIR-LABEL: Call4
+// CHECKIR: {{.*}} = fptosi <2 x float> {{.*}} to <2 x i64>
+void Call4(float2 p0) {
+ Fn3(p0);
+}
+
+void Fn4( float2 p0);
+
+// CHECK: FunctionDecl {{.*}} Call5 'void (int64_t2)'
+// CHECK: CallExpr {{.*}} 'void'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float2)' <FunctionToPointerDecay>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'void (float2)' lvalue Function {{.*}} 'Fn4' 'void (float2)'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float2':'float __attribute__((ext_vector_type(2)))' <IntegralToFloating>
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int64_t2':'long __attribute__((ext_vector_type(2)))' <LValueToRValue>
+// CHECK-NEXT: DeclRefExpr {{.*}} 'int64_t2':'long __attribute__((ext_vector_type(2)))' lvalue ParmVar {{.*}} 'p0' 'int64_t2':'long __attribute__((ext_vector_type(2)))'
+// CHECKIR-LABEL: Call5
+// CHECKIR: {{.*}} = sitofp <2 x i64> {{.*}} to <2 x float>
+void Call5(int64_t2 p0) {
+ Fn4(p0);
+}
diff --git a/clang/test/SemaTemplate/ms-lookup-template-base-classes.cpp b/clang/test/SemaTemplate/ms-lookup-template-base-classes.cpp
index 7856a0a16307..534a5dc9ddc1 100644
--- a/clang/test/SemaTemplate/ms-lookup-template-base-classes.cpp
+++ b/clang/test/SemaTemplate/ms-lookup-template-base-classes.cpp
@@ -71,6 +71,13 @@ public:
template class B<int>;
+template<typename T> struct C;
+
+// Test lookup with incomplete lookup context
+template<typename T>
+auto C<T>::f() -> decltype(x) { } // expected-error {{use of undeclared identifier 'x'}}
+ // expected-error@-1 {{out-of-line definition of 'f' from class 'C<T>' without definition}}
+
}
diff --git a/clang/tools/clang-installapi/CMakeLists.txt b/clang/tools/clang-installapi/CMakeLists.txt
index b8384c92c104..e05f4eac3ad1 100644
--- a/clang/tools/clang-installapi/CMakeLists.txt
+++ b/clang/tools/clang-installapi/CMakeLists.txt
@@ -14,6 +14,7 @@ add_clang_tool(clang-installapi
clang_target_link_libraries(clang-installapi
PRIVATE
+ clangAST
clangInstallAPI
clangBasic
clangDriver
diff --git a/clang/tools/clang-installapi/ClangInstallAPI.cpp b/clang/tools/clang-installapi/ClangInstallAPI.cpp
index fc23ffd7ae6b..43c9fca0a82e 100644
--- a/clang/tools/clang-installapi/ClangInstallAPI.cpp
+++ b/clang/tools/clang-installapi/ClangInstallAPI.cpp
@@ -12,12 +12,14 @@
//===----------------------------------------------------------------------===//
#include "Options.h"
-#include "clang/Basic/DiagnosticIDs.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticFrontend.h"
#include "clang/Driver/Driver.h"
#include "clang/Driver/DriverDiagnostic.h"
-#include "clang/Frontend/CompilerInstance.h"
+#include "clang/Driver/Tool.h"
#include "clang/Frontend/TextDiagnosticPrinter.h"
-#include "clang/InstallAPI/Context.h"
+#include "clang/InstallAPI/Frontend.h"
+#include "clang/Tooling/Tooling.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Option/Option.h"
#include "llvm/Support/CommandLine.h"
@@ -27,7 +29,9 @@
#include "llvm/Support/Process.h"
#include "llvm/Support/Signals.h"
#include "llvm/TargetParser/Host.h"
+#include "llvm/TextAPI/RecordVisitor.h"
#include "llvm/TextAPI/TextAPIWriter.h"
+#include <memory>
using namespace clang;
using namespace clang::installapi;
@@ -35,6 +39,36 @@ using namespace clang::driver::options;
using namespace llvm::opt;
using namespace llvm::MachO;
+static bool runFrontend(StringRef ProgName, bool Verbose,
+ const InstallAPIContext &Ctx,
+ llvm::vfs::InMemoryFileSystem *FS,
+ const ArrayRef<std::string> InitialArgs) {
+
+ std::unique_ptr<llvm::MemoryBuffer> ProcessedInput = createInputBuffer(Ctx);
+ // Skip invoking cc1 when there are no header inputs.
+ if (!ProcessedInput)
+ return true;
+
+ if (Verbose)
+ llvm::errs() << getName(Ctx.Type) << " Headers:\n"
+ << ProcessedInput->getBuffer() << "\n\n";
+
+ std::string InputFile = ProcessedInput->getBufferIdentifier().str();
+ FS->addFile(InputFile, /*ModTime=*/0, std::move(ProcessedInput));
+ // Reconstruct arguments with unique values like target triple or input
+ // headers.
+ std::vector<std::string> Args = {ProgName.data(), "-target",
+ Ctx.Slice->getTriple().str().c_str()};
+ llvm::copy(InitialArgs, std::back_inserter(Args));
+ Args.push_back(InputFile);
+
+ // Create & run invocation.
+ clang::tooling::ToolInvocation Invocation(
+ std::move(Args), std::make_unique<InstallAPIAction>(*Ctx.Slice), Ctx.FM);
+
+ return Invocation.run();
+}
+
static bool run(ArrayRef<const char *> Args, const char *ProgName) {
// Setup Diagnostics engine.
IntrusiveRefCntPtr<DiagnosticOptions> DiagOpts = new DiagnosticOptions();
@@ -48,9 +82,15 @@ static bool run(ArrayRef<const char *> Args, const char *ProgName) {
new clang::DiagnosticIDs(), DiagOpts.get(),
new clang::TextDiagnosticPrinter(llvm::errs(), DiagOpts.get()));
- // Create file manager for all file operations.
+ // Create file manager for all file operations and holding in-memory generated
+ // inputs.
+ llvm::IntrusiveRefCntPtr<llvm::vfs::OverlayFileSystem> OverlayFileSystem(
+ new llvm::vfs::OverlayFileSystem(llvm::vfs::getRealFileSystem()));
+ llvm::IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> InMemoryFileSystem(
+ new llvm::vfs::InMemoryFileSystem);
+ OverlayFileSystem->pushOverlay(InMemoryFileSystem);
IntrusiveRefCntPtr<clang::FileManager> FM(
- new FileManager(clang::FileSystemOptions()));
+ new FileManager(clang::FileSystemOptions(), OverlayFileSystem));
// Set up driver to parse input arguments.
auto DriverArgs = llvm::ArrayRef(Args).slice(1);
@@ -71,7 +111,10 @@ static bool run(ArrayRef<const char *> Args, const char *ProgName) {
Options Opts(*Diag, FM.get(), ArgList);
if (Diag->hasErrorOccurred())
return EXIT_FAILURE;
+
InstallAPIContext Ctx = Opts.createContext();
+ if (Diag->hasErrorOccurred())
+ return EXIT_FAILURE;
// Set up compilation.
std::unique_ptr<CompilerInstance> CI(new CompilerInstance());
@@ -80,6 +123,21 @@ static bool run(ArrayRef<const char *> Args, const char *ProgName) {
if (!CI->hasDiagnostics())
return EXIT_FAILURE;
+ // Execute and gather AST results.
+ llvm::MachO::Records FrontendResults;
+ for (const auto &[Targ, Trip] : Opts.DriverOpts.Targets) {
+ for (const HeaderType Type :
+ {HeaderType::Public, HeaderType::Private, HeaderType::Project}) {
+ Ctx.Slice = std::make_shared<RecordsSlice>(Trip);
+ Ctx.Type = Type;
+ if (!runFrontend(ProgName, Opts.DriverOpts.Verbose, Ctx,
+ InMemoryFileSystem.get(), Opts.getClangFrontendArgs()))
+ return EXIT_FAILURE;
+ FrontendResults.emplace_back(std::move(Ctx.Slice));
+ }
+ }
+
+ // After symbols have been collected, prepare to write output.
auto Out = CI->createOutputFile(Ctx.OutputLoc, /*Binary=*/false,
/*RemoveFileOnSignal=*/false,
/*UseTemporary=*/false,
@@ -88,7 +146,13 @@ static bool run(ArrayRef<const char *> Args, const char *ProgName) {
return EXIT_FAILURE;
// Assign attributes for serialization.
- InterfaceFile IF;
+ auto Symbols = std::make_unique<SymbolSet>();
+ for (const auto &FR : FrontendResults) {
+ SymbolConverter Converter(Symbols.get(), FR->getTarget());
+ FR->visit(Converter);
+ }
+
+ InterfaceFile IF(std::move(Symbols));
for (const auto &TargetInfo : Opts.DriverOpts.Targets) {
IF.addTarget(TargetInfo.first);
IF.setFromBinaryAttrs(Ctx.BA, TargetInfo.first);
diff --git a/clang/tools/clang-installapi/Options.cpp b/clang/tools/clang-installapi/Options.cpp
index 562a643edfcf..7d45e999448d 100644
--- a/clang/tools/clang-installapi/Options.cpp
+++ b/clang/tools/clang-installapi/Options.cpp
@@ -9,6 +9,7 @@
#include "Options.h"
#include "clang/Driver/Driver.h"
#include "clang/Frontend/FrontendDiagnostic.h"
+#include "clang/InstallAPI/FileList.h"
#include "llvm/Support/Program.h"
#include "llvm/TargetParser/Host.h"
@@ -68,6 +69,8 @@ bool Options::processDriverOptions(InputArgList &Args) {
}
}
+ DriverOpts.Verbose = Args.hasArgNoClaim(OPT_v);
+
return true;
}
@@ -104,10 +107,21 @@ Options::Options(DiagnosticsEngine &Diag, FileManager *FM,
if (!processLinkerOptions(ArgList))
return;
+
+ /// Any remaining arguments should be handled by invoking the clang frontend.
+ for (const Arg *A : ArgList) {
+ if (A->isClaimed())
+ continue;
+ FrontendArgs.emplace_back(A->getAsString(ArgList));
+ }
+ FrontendArgs.push_back("-fsyntax-only");
}
InstallAPIContext Options::createContext() {
InstallAPIContext Ctx;
+ Ctx.FM = FM;
+ Ctx.Diags = Diags;
+
// InstallAPI requires two level namespacing.
Ctx.BA.TwoLevelNamespace = true;
@@ -116,6 +130,21 @@ InstallAPIContext Options::createContext() {
Ctx.BA.AppExtensionSafe = LinkerOpts.AppExtensionSafe;
Ctx.FT = DriverOpts.OutFT;
Ctx.OutputLoc = DriverOpts.OutputPath;
+
+ // Process inputs.
+ for (const std::string &ListPath : DriverOpts.FileLists) {
+ auto Buffer = FM->getBufferForFile(ListPath);
+ if (auto Err = Buffer.getError()) {
+ Diags->Report(diag::err_cannot_open_file) << ListPath;
+ return Ctx;
+ }
+ if (auto Err = FileListReader::loadHeaders(std::move(Buffer.get()),
+ Ctx.InputHeaders)) {
+ Diags->Report(diag::err_cannot_open_file) << ListPath;
+ return Ctx;
+ }
+ }
+
return Ctx;
}
diff --git a/clang/tools/clang-installapi/Options.h b/clang/tools/clang-installapi/Options.h
index 4a84166a6c91..f68addf19728 100644
--- a/clang/tools/clang-installapi/Options.h
+++ b/clang/tools/clang-installapi/Options.h
@@ -43,6 +43,9 @@ struct DriverOptions {
/// \brief File encoding to print.
llvm::MachO::FileType OutFT = llvm::MachO::FileType::TBD_V5;
+
+ /// \brief Print verbose output.
+ bool Verbose = false;
};
struct LinkerOptions {
@@ -78,9 +81,14 @@ public:
Options(clang::DiagnosticsEngine &Diag, FileManager *FM,
llvm::opt::InputArgList &Args);
+ /// \brief Get CC1 arguments after extracting out the irrelevant
+ /// ones.
+ std::vector<std::string> &getClangFrontendArgs() { return FrontendArgs; }
+
private:
DiagnosticsEngine *Diags;
FileManager *FM;
+ std::vector<std::string> FrontendArgs;
};
} // namespace installapi
diff --git a/clang/tools/clang-offload-packager/ClangOffloadPackager.cpp b/clang/tools/clang-offload-packager/ClangOffloadPackager.cpp
index c36a5aa58cee..c6d5b31ab512 100644
--- a/clang/tools/clang-offload-packager/ClangOffloadPackager.cpp
+++ b/clang/tools/clang-offload-packager/ClangOffloadPackager.cpp
@@ -197,7 +197,7 @@ static Error unbundleImages() {
if (Error E = writeArchive(
Args["file"], Members, SymtabWritingMode::NormalSymtab,
- Archive::getDefaultKindForHost(), true, false, nullptr))
+ Archive::getDefaultKind(), true, false, nullptr))
return E;
} else if (Args.count("file")) {
if (Extracted.size() > 1)
diff --git a/clang/unittests/Format/FormatTestTableGen.cpp b/clang/unittests/Format/FormatTestTableGen.cpp
index 6c110beabca4..76b871e2e1a5 100644
--- a/clang/unittests/Format/FormatTestTableGen.cpp
+++ b/clang/unittests/Format/FormatTestTableGen.cpp
@@ -346,5 +346,19 @@ TEST_F(FormatTestTableGen, CondOperatorAlignment) {
Style);
}
+TEST_F(FormatTestTableGen, DefAlignment) {
+ FormatStyle Style = getGoogleStyle(FormatStyle::LK_TableGen);
+ Style.ColumnLimit = 60;
+ verifyFormat("def Def : Parent {}\n"
+ "def DefDef : Parent {}\n"
+ "def DefDefDef : Parent {}\n",
+ Style);
+ Style.AlignConsecutiveTableGenDefinitionColons.Enabled = true;
+ verifyFormat("def Def : Parent {}\n"
+ "def DefDef : Parent {}\n"
+ "def DefDefDef : Parent {}\n",
+ Style);
+}
+
} // namespace format
} // end namespace clang
diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt
index bbb4e8d7c333..8a2b138d8d70 100644
--- a/compiler-rt/CMakeLists.txt
+++ b/compiler-rt/CMakeLists.txt
@@ -771,8 +771,6 @@ mark_as_advanced(COMPILER_RT_ENABLE_INTERNAL_SYMBOLIZER)
add_subdirectory(lib)
if(COMPILER_RT_INCLUDE_TESTS)
- add_subdirectory(unittests)
- add_subdirectory(test)
# Don't build llvm-lit for runtimes-build, it will clean up map_config.
if (COMPILER_RT_STANDALONE_BUILD AND NOT LLVM_RUNTIMES_BUILD)
# If we have a valid source tree, generate llvm-lit into the bin directory.
@@ -782,11 +780,17 @@ if(COMPILER_RT_INCLUDE_TESTS)
# Needed for lit support in standalone builds.
include(AddLLVM)
add_subdirectory(${LLVM_MAIN_SRC_DIR}/utils/llvm-lit ${CMAKE_CURRENT_BINARY_DIR}/llvm-lit)
+ # Ensure that the testsuite uses the local lit rather than
+ # LLVM_INSTALL_DIR/bin/llvm-lit (which probably does not exist).
+ get_llvm_lit_path(_base_dir _file_name)
+ set(LLVM_EXTERNAL_LIT "${_base_dir}/${_file_name}" CACHE STRING "Command used to spawn lit" FORCE)
elseif(NOT EXISTS ${LLVM_EXTERNAL_LIT})
message(WARNING "Could not find LLVM source directory and LLVM_EXTERNAL_LIT does not"
"point to a valid file. You will not be able to run tests.")
endif()
endif()
+ add_subdirectory(unittests)
+ add_subdirectory(test)
endif()
add_subdirectory(tools)
diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc
index fce407f547f3..e9866d94b762 100644
--- a/compiler-rt/include/profile/InstrProfData.inc
+++ b/compiler-rt/include/profile/InstrProfData.inc
@@ -96,6 +96,25 @@ INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes, \
#undef INSTR_PROF_DATA
/* INSTR_PROF_DATA end. */
+/* For a virtual table object, record the name hash to associate profiled
+ * addresses with global variables, and record {starting address, size in bytes}
+ * to map the profiled virtual table (which usually has an offset from the
+ * starting address) back to a virtual table object. */
+#ifndef INSTR_PROF_VTABLE_DATA
+#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Initializer)
+#else
+#define INSTR_PROF_VTABLE_DATA_DEFINED
+#endif
+INSTR_PROF_VTABLE_DATA(const uint64_t, llvm::Type::getInt64Ty(Ctx), \
+ VTableNameHash, ConstantInt::get(llvm::Type::getInt64Ty(Ctx), \
+ IndexedInstrProf::ComputeHash(PGOVTableName)))
+INSTR_PROF_VTABLE_DATA(const IntPtrT, llvm::PointerType::getUnqual(Ctx), \
+ VTablePointer, VTableAddr)
+INSTR_PROF_VTABLE_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), VTableSize, \
+ ConstantInt::get(llvm::Type::getInt32Ty(Ctx), \
+ VTableSizeVal))
+#undef INSTR_PROF_VTABLE_DATA
+/* INSTR_PROF_VTABLE_DATA end. */
/* This is an internal data structure used by value profiler. It
* is defined here to allow serialization code sharing by LLVM
@@ -147,6 +166,8 @@ INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta,
INSTR_PROF_RAW_HEADER(uint64_t, BitmapDelta,
(uintptr_t)BitmapBegin - (uintptr_t)DataBegin)
INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin)
+INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables)
+INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize)
INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last)
#undef INSTR_PROF_RAW_HEADER
/* INSTR_PROF_RAW_HEADER end */
@@ -188,13 +209,26 @@ VALUE_PROF_FUNC_PARAM(uint32_t, CounterIndex, Type::getInt32Ty(Ctx))
VALUE_PROF_KIND(IPVK_IndirectCallTarget, 0, "indirect call target")
/* For memory intrinsic functions size profiling. */
VALUE_PROF_KIND(IPVK_MemOPSize, 1, "memory intrinsic functions size")
+/* For virtual table address profiling, the address point of the virtual table
+ * (i.e., the address contained in objects pointing to a virtual table) is
+ * profiled. Note this may not be the address of the per C++ class virtual table
+ * object (e.g., there might be an offset).
+ *
+ * The profiled addresses are stored in raw profile, together with the following
+ * two types of information.
+ * 1. The (starting and ending) addresses of per C++ class virtual table objects.
+ * 2. The (compressed) virtual table object names.
+ * RawInstrProfReader converts profiled virtual table addresses to virtual table
+ * objects' MD5 hash.
+ */
+VALUE_PROF_KIND(IPVK_VTableTarget, 2, "The profiled address point of the vtable")
/* These two kinds must be the last to be
* declared. This is to make sure the string
* array created with the template can be
* indexed with the kind value.
*/
VALUE_PROF_KIND(IPVK_First, IPVK_IndirectCallTarget, "first")
-VALUE_PROF_KIND(IPVK_Last, IPVK_MemOPSize, "last")
+VALUE_PROF_KIND(IPVK_Last, IPVK_VTableTarget, "last")
#undef VALUE_PROF_KIND
/* VALUE_PROF_KIND end */
@@ -284,12 +318,18 @@ INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \
INSTR_PROF_SECT_ENTRY(IPSK_name, \
INSTR_PROF_QUOTE(INSTR_PROF_NAME_COMMON), \
INSTR_PROF_NAME_COFF, "__DATA,")
+INSTR_PROF_SECT_ENTRY(IPSK_vname, \
+ INSTR_PROF_QUOTE(INSTR_PROF_VNAME_COMMON), \
+ INSTR_PROF_VNAME_COFF, "__DATA,")
INSTR_PROF_SECT_ENTRY(IPSK_vals, \
INSTR_PROF_QUOTE(INSTR_PROF_VALS_COMMON), \
INSTR_PROF_VALS_COFF, "__DATA,")
INSTR_PROF_SECT_ENTRY(IPSK_vnodes, \
INSTR_PROF_QUOTE(INSTR_PROF_VNODES_COMMON), \
INSTR_PROF_VNODES_COFF, "__DATA,")
+INSTR_PROF_SECT_ENTRY(IPSK_vtab, \
+ INSTR_PROF_QUOTE(INSTR_PROF_VTAB_COMMON), \
+ INSTR_PROF_VTAB_COFF, "__DATA,")
INSTR_PROF_SECT_ENTRY(IPSK_covmap, \
INSTR_PROF_QUOTE(INSTR_PROF_COVMAP_COMMON), \
INSTR_PROF_COVMAP_COFF, "__LLVM_COV,")
@@ -668,9 +708,9 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
(uint64_t)'f' << 16 | (uint64_t)'R' << 8 | (uint64_t)129
/* Raw profile format version (start from 1). */
-#define INSTR_PROF_RAW_VERSION 9
+#define INSTR_PROF_RAW_VERSION 10
/* Indexed profile format version (start from 1). */
-#define INSTR_PROF_INDEX_VERSION 11
+#define INSTR_PROF_INDEX_VERSION 12
/* Coverage mapping format version (start from 0). */
#define INSTR_PROF_COVMAP_VERSION 6
@@ -708,10 +748,12 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
than WIN32 */
#define INSTR_PROF_DATA_COMMON __llvm_prf_data
#define INSTR_PROF_NAME_COMMON __llvm_prf_names
+#define INSTR_PROF_VNAME_COMMON __llvm_prf_vns
#define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts
#define INSTR_PROF_BITS_COMMON __llvm_prf_bits
#define INSTR_PROF_VALS_COMMON __llvm_prf_vals
#define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds
+#define INSTR_PROF_VTAB_COMMON __llvm_prf_vtab
#define INSTR_PROF_COVMAP_COMMON __llvm_covmap
#define INSTR_PROF_COVFUN_COMMON __llvm_covfun
#define INSTR_PROF_COVDATA_COMMON __llvm_covdata
@@ -722,10 +764,12 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
*/
#define INSTR_PROF_DATA_COFF ".lprfd$M"
#define INSTR_PROF_NAME_COFF ".lprfn$M"
+#define INSTR_PROF_VNAME_COFF ".lprfvn$M"
#define INSTR_PROF_CNTS_COFF ".lprfc$M"
#define INSTR_PROF_BITS_COFF ".lprfb$M"
#define INSTR_PROF_VALS_COFF ".lprfv$M"
#define INSTR_PROF_VNODES_COFF ".lprfnd$M"
+#define INSTR_PROF_VTAB_COFF ".lprfvt$M"
#define INSTR_PROF_COVMAP_COFF ".lcovmap$M"
#define INSTR_PROF_COVFUN_COFF ".lcovfun$M"
/* Since cov data and cov names sections are not allocated, we don't need to
@@ -741,6 +785,8 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
#define INSTR_PROF_NAME_SECT_NAME INSTR_PROF_NAME_COFF
#define INSTR_PROF_CNTS_SECT_NAME INSTR_PROF_CNTS_COFF
#define INSTR_PROF_BITS_SECT_NAME INSTR_PROF_BITS_COFF
+#define INSTR_PROF_VTAB_SECT_NAME INSTR_PROF_VTAB_COFF
+#define INSTR_PROF_VNAME_SECT_NAME INSTR_PROF_VNAME_COFF
/* Array of pointers. Each pointer points to a list
* of value nodes associated with one value site.
*/
@@ -758,6 +804,8 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
#define INSTR_PROF_NAME_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_NAME_COMMON)
#define INSTR_PROF_CNTS_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_CNTS_COMMON)
#define INSTR_PROF_BITS_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_BITS_COMMON)
+#define INSTR_PROF_VTAB_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_VTAB_COMMON)
+#define INSTR_PROF_VNAME_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_VNAME_COMMON)
/* Array of pointers. Each pointer points to a list
* of value nodes associated with one value site.
*/
diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index 28ded8766f25..83f7697a4a2b 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -916,7 +916,7 @@ cmake_dependent_option(COMPILER_RT_BUILD_CRT "Build crtbegin.o/crtend.o" ON "COM
if (COMPILER_RT_BUILD_CRT)
add_compiler_rt_component(crt)
- option(COMPILER_RT_CRT_USE_EH_FRAME_REGISTRY "Use eh_frame in crtbegin.o/crtend.o" ON)
+ option(COMPILER_RT_CRT_USE_EH_FRAME_REGISTRY "Use eh_frame in crtbegin.o/crtend.o" OFF)
include(CheckSectionExists)
check_section_exists(".init_array" COMPILER_RT_HAS_INITFINI_ARRAY
diff --git a/compiler-rt/lib/hwasan/hwasan_report.cpp b/compiler-rt/lib/hwasan/hwasan_report.cpp
index 016ec182dea0..9fbf38ae6a1f 100644
--- a/compiler-rt/lib/hwasan/hwasan_report.cpp
+++ b/compiler-rt/lib/hwasan/hwasan_report.cpp
@@ -27,6 +27,7 @@
#include "sanitizer_common/sanitizer_flags.h"
#include "sanitizer_common/sanitizer_internal_defs.h"
#include "sanitizer_common/sanitizer_mutex.h"
+#include "sanitizer_common/sanitizer_placement_new.h"
#include "sanitizer_common/sanitizer_report_decorator.h"
#include "sanitizer_common/sanitizer_stackdepot.h"
#include "sanitizer_common/sanitizer_stacktrace_printer.h"
diff --git a/compiler-rt/lib/hwasan/hwasan_thread_list.cpp b/compiler-rt/lib/hwasan/hwasan_thread_list.cpp
index 7df4dd3d7851..e56d19aad267 100644
--- a/compiler-rt/lib/hwasan/hwasan_thread_list.cpp
+++ b/compiler-rt/lib/hwasan/hwasan_thread_list.cpp
@@ -1,5 +1,6 @@
#include "hwasan_thread_list.h"
+#include "sanitizer_common/sanitizer_placement_new.h"
#include "sanitizer_common/sanitizer_thread_arg_retval.h"
namespace __hwasan {
diff --git a/compiler-rt/lib/hwasan/hwasan_thread_list.h b/compiler-rt/lib/hwasan/hwasan_thread_list.h
index 82f6c70a03f8..d0eebd1b373a 100644
--- a/compiler-rt/lib/hwasan/hwasan_thread_list.h
+++ b/compiler-rt/lib/hwasan/hwasan_thread_list.h
@@ -47,7 +47,6 @@
#include "hwasan_allocator.h"
#include "hwasan_flags.h"
#include "hwasan_thread.h"
-#include "sanitizer_common/sanitizer_placement_new.h"
#include "sanitizer_common/sanitizer_thread_arg_retval.h"
namespace __hwasan {
diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h
index 012390833691..d424a22c212c 100644
--- a/compiler-rt/lib/profile/InstrProfiling.h
+++ b/compiler-rt/lib/profile/InstrProfiling.h
@@ -49,6 +49,12 @@ typedef struct ValueProfNode {
#include "profile/InstrProfData.inc"
} ValueProfNode;
+typedef void *IntPtrT;
+typedef struct COMPILER_RT_ALIGNAS(INSTR_PROF_DATA_ALIGNMENT) VTableProfData {
+#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Initializer) Type Name;
+#include "profile/InstrProfData.inc"
+} VTableProfData;
+
/*!
* \brief Return 1 if profile counters are continuously synced to the raw
* profile via an mmap(). This is in contrast to the default mode, in which
@@ -103,12 +109,16 @@ const __llvm_profile_data *__llvm_profile_begin_data(void);
const __llvm_profile_data *__llvm_profile_end_data(void);
const char *__llvm_profile_begin_names(void);
const char *__llvm_profile_end_names(void);
+const char *__llvm_profile_begin_vtabnames(void);
+const char *__llvm_profile_end_vtabnames(void);
char *__llvm_profile_begin_counters(void);
char *__llvm_profile_end_counters(void);
char *__llvm_profile_begin_bitmap(void);
char *__llvm_profile_end_bitmap(void);
ValueProfNode *__llvm_profile_begin_vnodes();
ValueProfNode *__llvm_profile_end_vnodes();
+const VTableProfData *__llvm_profile_begin_vtables();
+const VTableProfData *__llvm_profile_end_vtables();
uint32_t *__llvm_profile_begin_orderfile();
/*!
@@ -252,20 +262,31 @@ uint64_t __llvm_profile_get_num_bitmap_bytes(const char *Begin,
/*! \brief Get the size of the profile name section in bytes. */
uint64_t __llvm_profile_get_name_size(const char *Begin, const char *End);
-/* ! \brief Given the sizes of the data and counter information, return the
- * number of padding bytes before and after the counters, and after the names,
- * in the raw profile.
+/*! \brief Get the number of virtual table profile data entries */
+uint64_t __llvm_profile_get_num_vtable(const VTableProfData *Begin,
+ const VTableProfData *End);
+
+/*! \brief Get the size of virtual table profile data in bytes. */
+uint64_t __llvm_profile_get_vtable_section_size(const VTableProfData *Begin,
+ const VTableProfData *End);
+
+/* ! \brief Given the sizes of the data and counter information, computes the
+ * number of padding bytes before and after the counter section, as well as the
+ * number of padding bytes after other sections in the raw profile.
+ * Returns -1 upon errors and 0 upon success. Output parameters should be used
+ * iff return value is 0.
*
* Note: When mmap() mode is disabled, no padding bytes before/after counters
* are needed. However, in mmap() mode, the counter section in the raw profile
* must be page-aligned: this API computes the number of padding bytes
* needed to achieve that.
*/
-void __llvm_profile_get_padding_sizes_for_counters(
+int __llvm_profile_get_padding_sizes_for_counters(
uint64_t DataSize, uint64_t CountersSize, uint64_t NumBitmapBytes,
- uint64_t NamesSize, uint64_t *PaddingBytesBeforeCounters,
- uint64_t *PaddingBytesAfterCounters, uint64_t *PaddingBytesAfterBitmap,
- uint64_t *PaddingBytesAfterNames);
+ uint64_t NamesSize, uint64_t VTableSize, uint64_t VNameSize,
+ uint64_t *PaddingBytesBeforeCounters, uint64_t *PaddingBytesAfterCounters,
+ uint64_t *PaddingBytesAfterBitmap, uint64_t *PaddingBytesAfterNames,
+ uint64_t *PaddingBytesAfterVTable, uint64_t *PaddingBytesAfterVNames);
/*!
* \brief Set the flag that profile data has been dumped to the file.
@@ -294,7 +315,8 @@ COMPILER_RT_VISIBILITY extern int INSTR_PROF_PROFILE_RUNTIME_VAR;
* variable is defined as weak so that compiler can emit an overriding
* definition depending on user option.
*/
-extern uint64_t INSTR_PROF_RAW_VERSION_VAR; /* __llvm_profile_raw_version */
+COMPILER_RT_VISIBILITY extern uint64_t
+ INSTR_PROF_RAW_VERSION_VAR; /* __llvm_profile_raw_version */
/*!
* This variable is a weak symbol defined in InstrProfiling.c. It allows
diff --git a/compiler-rt/lib/profile/InstrProfilingBuffer.c b/compiler-rt/lib/profile/InstrProfilingBuffer.c
index af52804b2b53..1c451d7ec756 100644
--- a/compiler-rt/lib/profile/InstrProfilingBuffer.c
+++ b/compiler-rt/lib/profile/InstrProfilingBuffer.c
@@ -51,12 +51,18 @@ uint64_t __llvm_profile_get_size_for_buffer(void) {
const char *BitmapEnd = __llvm_profile_end_bitmap();
const char *NamesBegin = __llvm_profile_begin_names();
const char *NamesEnd = __llvm_profile_end_names();
+ const VTableProfData *VTableBegin = __llvm_profile_begin_vtables();
+ const VTableProfData *VTableEnd = __llvm_profile_end_vtables();
+ const char *VNamesBegin = __llvm_profile_begin_vtabnames();
+ const char *VNamesEnd = __llvm_profile_end_vtabnames();
return __llvm_profile_get_size_for_buffer_internal(
DataBegin, DataEnd, CountersBegin, CountersEnd, BitmapBegin, BitmapEnd,
- NamesBegin, NamesEnd);
+ NamesBegin, NamesEnd, VTableBegin, VTableEnd, VNamesBegin, VNamesEnd);
}
+// NOTE: Caller should guarantee that `Begin` and `End` specifies a half-open
+// interval [Begin, End). Namely, `End` is one-byte past the end of the array.
COMPILER_RT_VISIBILITY
uint64_t __llvm_profile_get_num_data(const __llvm_profile_data *Begin,
const __llvm_profile_data *End) {
@@ -71,6 +77,26 @@ uint64_t __llvm_profile_get_data_size(const __llvm_profile_data *Begin,
return __llvm_profile_get_num_data(Begin, End) * sizeof(__llvm_profile_data);
}
+// Counts the number of `VTableProfData` elements within the range of [Begin,
+// End). Caller should guarantee that End points to one byte past the inclusive
+// range.
+// FIXME: Add a compiler-rt test to make sure the number of vtables in the
+// raw profile is the same as the number of vtable elements in the instrumented
+// binary.
+COMPILER_RT_VISIBILITY
+uint64_t __llvm_profile_get_num_vtable(const VTableProfData *Begin,
+ const VTableProfData *End) {
+ // Convert pointers to intptr_t to use integer arithmetic.
+ intptr_t EndI = (intptr_t)End, BeginI = (intptr_t)Begin;
+ return (EndI - BeginI) / sizeof(VTableProfData);
+}
+
+COMPILER_RT_VISIBILITY
+uint64_t __llvm_profile_get_vtable_section_size(const VTableProfData *Begin,
+ const VTableProfData *End) {
+ return (intptr_t)(End) - (intptr_t)(Begin);
+}
+
COMPILER_RT_VISIBILITY size_t __llvm_profile_counter_entry_size(void) {
if (__llvm_profile_get_version() & VARIANT_MASK_BYTE_COVERAGE)
return sizeof(uint8_t);
@@ -119,11 +145,13 @@ static int needsCounterPadding(void) {
}
COMPILER_RT_VISIBILITY
-void __llvm_profile_get_padding_sizes_for_counters(
+int __llvm_profile_get_padding_sizes_for_counters(
uint64_t DataSize, uint64_t CountersSize, uint64_t NumBitmapBytes,
- uint64_t NamesSize, uint64_t *PaddingBytesBeforeCounters,
- uint64_t *PaddingBytesAfterCounters, uint64_t *PaddingBytesAfterBitmapBytes,
- uint64_t *PaddingBytesAfterNames) {
+ uint64_t NamesSize, uint64_t VTableSize, uint64_t VNameSize,
+ uint64_t *PaddingBytesBeforeCounters, uint64_t *PaddingBytesAfterCounters,
+ uint64_t *PaddingBytesAfterBitmapBytes, uint64_t *PaddingBytesAfterNames,
+ uint64_t *PaddingBytesAfterVTable, uint64_t *PaddingBytesAfterVName) {
+ // Counter padding is needed only if continuous mode is enabled.
if (!needsCounterPadding()) {
*PaddingBytesBeforeCounters = 0;
*PaddingBytesAfterCounters =
@@ -131,9 +159,19 @@ void __llvm_profile_get_padding_sizes_for_counters(
*PaddingBytesAfterBitmapBytes =
__llvm_profile_get_num_padding_bytes(NumBitmapBytes);
*PaddingBytesAfterNames = __llvm_profile_get_num_padding_bytes(NamesSize);
- return;
+ if (PaddingBytesAfterVTable != NULL)
+ *PaddingBytesAfterVTable =
+ __llvm_profile_get_num_padding_bytes(VTableSize);
+ if (PaddingBytesAfterVName != NULL)
+ *PaddingBytesAfterVName = __llvm_profile_get_num_padding_bytes(VNameSize);
+ return 0;
}
+ // Value profiling not supported in continuous mode at profile-write time.
+ // Return -1 to signal the incompatibility.
+ if (VTableSize != 0 || VNameSize != 0)
+ return -1;
+
// In continuous mode, the file offsets for headers and for the start of
// counter sections need to be page-aligned.
*PaddingBytesBeforeCounters =
@@ -142,13 +180,22 @@ void __llvm_profile_get_padding_sizes_for_counters(
*PaddingBytesAfterBitmapBytes =
calculateBytesNeededToPageAlign(NumBitmapBytes);
*PaddingBytesAfterNames = calculateBytesNeededToPageAlign(NamesSize);
+ // Set these two variables to zero to avoid uninitialized variables
+ // even if VTableSize and VNameSize are known to be zero.
+ if (PaddingBytesAfterVTable != NULL)
+ *PaddingBytesAfterVTable = 0;
+ if (PaddingBytesAfterVName != NULL)
+ *PaddingBytesAfterVName = 0;
+ return 0;
}
COMPILER_RT_VISIBILITY
uint64_t __llvm_profile_get_size_for_buffer_internal(
const __llvm_profile_data *DataBegin, const __llvm_profile_data *DataEnd,
const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin,
- const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd) {
+ const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd,
+ const VTableProfData *VTableBegin, const VTableProfData *VTableEnd,
+ const char *VNamesBegin, const char *VNamesEnd) {
/* Match logic in __llvm_profile_write_buffer(). */
const uint64_t NamesSize = (NamesEnd - NamesBegin) * sizeof(char);
uint64_t DataSize = __llvm_profile_get_data_size(DataBegin, DataEnd);
@@ -156,20 +203,29 @@ uint64_t __llvm_profile_get_size_for_buffer_internal(
__llvm_profile_get_counters_size(CountersBegin, CountersEnd);
const uint64_t NumBitmapBytes =
__llvm_profile_get_num_bitmap_bytes(BitmapBegin, BitmapEnd);
+ const uint64_t VTableSize =
+ __llvm_profile_get_vtable_section_size(VTableBegin, VTableEnd);
+ const uint64_t VNameSize =
+ __llvm_profile_get_name_size(VNamesBegin, VNamesEnd);
/* Determine how much padding is needed before/after the counters and after
* the names. */
uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters,
- PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes;
+ PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes,
+ PaddingBytesAfterVTable, PaddingBytesAfterVNames;
__llvm_profile_get_padding_sizes_for_counters(
- DataSize, CountersSize, NumBitmapBytes, NamesSize,
- &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters,
- &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames);
+ DataSize, CountersSize, NumBitmapBytes, NamesSize, 0 /* VTableSize */,
+ 0 /* VNameSize */, &PaddingBytesBeforeCounters,
+ &PaddingBytesAfterCounters, &PaddingBytesAfterBitmapBytes,
+ &PaddingBytesAfterNames, &PaddingBytesAfterVTable,
+ &PaddingBytesAfterVNames);
return sizeof(__llvm_profile_header) + __llvm_write_binary_ids(NULL) +
DataSize + PaddingBytesBeforeCounters + CountersSize +
PaddingBytesAfterCounters + NumBitmapBytes +
- PaddingBytesAfterBitmapBytes + NamesSize + PaddingBytesAfterNames;
+ PaddingBytesAfterBitmapBytes + NamesSize + PaddingBytesAfterNames +
+ VTableSize + PaddingBytesAfterVTable + VNameSize +
+ PaddingBytesAfterVNames;
}
COMPILER_RT_VISIBILITY
@@ -191,7 +247,10 @@ COMPILER_RT_VISIBILITY int __llvm_profile_write_buffer_internal(
const char *NamesBegin, const char *NamesEnd) {
ProfDataWriter BufferWriter;
initBufferWriter(&BufferWriter, Buffer);
- return lprofWriteDataImpl(&BufferWriter, DataBegin, DataEnd, CountersBegin,
- CountersEnd, BitmapBegin, BitmapEnd, 0, NamesBegin,
- NamesEnd, 0);
+ // Set virtual table arguments to NULL since they are not supported yet.
+ return lprofWriteDataImpl(
+ &BufferWriter, DataBegin, DataEnd, CountersBegin, CountersEnd,
+ BitmapBegin, BitmapEnd, /*VPDataReader=*/0, NamesBegin, NamesEnd,
+ /*VTableBegin=*/NULL, /*VTableEnd=*/NULL, /*VNamesBegin=*/NULL,
+ /*VNamesEnd=*/NULL, /*SkipNameDataWrite=*/0);
}
diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c
index f3b457d786e6..e4d99ef4872b 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -137,15 +137,18 @@ static int mmapForContinuousMode(uint64_t CurrentFileOffset, FILE *File) {
DataBegin, PageSize);
return 1;
}
+
int Fileno = fileno(File);
/* Determine how much padding is needed before/after the counters and
* after the names. */
uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters,
- PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes;
+ PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes,
+ PaddingBytesAfterVTable, PaddingBytesAfterVNames;
__llvm_profile_get_padding_sizes_for_counters(
- DataSize, CountersSize, NumBitmapBytes, NamesSize,
- &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters,
- &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames);
+ DataSize, CountersSize, NumBitmapBytes, NamesSize, /*VTableSize=*/0,
+ /*VNameSize=*/0, &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters,
+ &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames,
+ &PaddingBytesAfterVTable, &PaddingBytesAfterVNames);
uint64_t PageAlignedCountersLength = CountersSize + PaddingBytesAfterCounters;
uint64_t FileOffsetToCounters = CurrentFileOffset +
diff --git a/compiler-rt/lib/profile/InstrProfilingInternal.h b/compiler-rt/lib/profile/InstrProfilingInternal.h
index 03ed67fcfa76..d5bd0e41fb12 100644
--- a/compiler-rt/lib/profile/InstrProfilingInternal.h
+++ b/compiler-rt/lib/profile/InstrProfilingInternal.h
@@ -22,7 +22,9 @@
uint64_t __llvm_profile_get_size_for_buffer_internal(
const __llvm_profile_data *DataBegin, const __llvm_profile_data *DataEnd,
const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin,
- const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd);
+ const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd,
+ const VTableProfData *VTableBegin, const VTableProfData *VTableEnd,
+ const char *VNamesBegin, const char *VNamesEnd);
/*!
* \brief Write instrumentation data to the given buffer, given explicit
@@ -156,7 +158,9 @@ int lprofWriteDataImpl(ProfDataWriter *Writer,
const char *CountersBegin, const char *CountersEnd,
const char *BitmapBegin, const char *BitmapEnd,
VPDataReaderType *VPDataReader, const char *NamesBegin,
- const char *NamesEnd, int SkipNameDataWrite);
+ const char *NamesEnd, const VTableProfData *VTableBegin,
+ const VTableProfData *VTableEnd, const char *VNamesBegin,
+ const char *VNamesEnd, int SkipNameDataWrite);
/* Merge value profile data pointed to by SrcValueProfData into
* in-memory profile counters pointed by to DstData. */
diff --git a/compiler-rt/lib/profile/InstrProfilingMerge.c b/compiler-rt/lib/profile/InstrProfilingMerge.c
index b5850e99ee37..c0706b73e166 100644
--- a/compiler-rt/lib/profile/InstrProfilingMerge.c
+++ b/compiler-rt/lib/profile/InstrProfilingMerge.c
@@ -107,6 +107,26 @@ static uintptr_t signextIfWin64(void *V) {
#endif
}
+// Skip names section, vtable profile data section and vtable names section
+// for runtime profile merge. To merge runtime addresses from multiple
+// profiles collected from the same instrumented binary, the binary should be
+// loaded at fixed base address (e.g., build with -no-pie, or run with ASLR
+// disabled). In this set-up these three sections remain unchanged.
+static uint64_t
+getDistanceFromCounterToValueProf(const __llvm_profile_header *const Header) {
+ const uint64_t VTableSectionSize =
+ Header->NumVTables * sizeof(VTableProfData);
+ const uint64_t PaddingBytesAfterVTableSection =
+ __llvm_profile_get_num_padding_bytes(VTableSectionSize);
+ const uint64_t VNamesSize = Header->VNamesSize;
+ const uint64_t PaddingBytesAfterVNamesSize =
+ __llvm_profile_get_num_padding_bytes(VNamesSize);
+ return Header->NamesSize +
+ __llvm_profile_get_num_padding_bytes(Header->NamesSize) +
+ VTableSectionSize + PaddingBytesAfterVTableSection + VNamesSize +
+ PaddingBytesAfterVNamesSize;
+}
+
COMPILER_RT_VISIBILITY
int __llvm_profile_merge_from_buffer(const char *ProfileData,
uint64_t ProfileSize) {
@@ -137,8 +157,7 @@ int __llvm_profile_merge_from_buffer(const char *ProfileData,
SrcBitmapStart = SrcCountersEnd;
SrcNameStart = SrcBitmapStart + Header->NumBitmapBytes;
SrcValueProfDataStart =
- SrcNameStart + Header->NamesSize +
- __llvm_profile_get_num_padding_bytes(Header->NamesSize);
+ SrcNameStart + getDistanceFromCounterToValueProf(Header);
if (SrcNameStart < SrcCountersStart || SrcNameStart < SrcBitmapStart)
return 1;
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c b/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c
index 002bec164d7e..b9d51b698b41 100644
--- a/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c
@@ -175,7 +175,8 @@ void __llvm_profile_register_names_function(void *NamesStart,
uint64_t NamesSize) {}
// The __start_SECNAME and __stop_SECNAME symbols (for SECNAME \in
-// {"__llvm_prf_cnts", "__llvm_prf_data", "__llvm_prf_name", "__llvm_prf_vnds"})
+// {"__llvm_prf_cnts", "__llvm_prf_data", "__llvm_prf_name", "__llvm_prf_vnds",
+// "__llvm_prf_vns", "__llvm_prf_vtab"})
// are always live when linking on AIX, regardless if the .o's being linked
// reference symbols from the profile library (for example when no files were
// compiled with -fprofile-generate). That's because these symbols are kept
@@ -197,6 +198,10 @@ static int dummy_vnds[0] COMPILER_RT_SECTION(
COMPILER_RT_SEG INSTR_PROF_VNODES_SECT_NAME);
static int dummy_orderfile[0] COMPILER_RT_SECTION(
COMPILER_RT_SEG INSTR_PROF_ORDERFILE_SECT_NAME);
+static int dummy_vname[0] COMPILER_RT_SECTION(
+ COMPILER_RT_SEG INSTR_PROF_VNAME_SECT_NAME);
+static int dummy_vtab[0] COMPILER_RT_SECTION(
+ COMPILER_RT_SEG INSTR_PROF_VTAB_SECT_NAME);
// To avoid GC'ing of the dummy variables by the linker, reference them in an
// array and reference the array in the runtime registration code
@@ -206,9 +211,10 @@ static int dummy_orderfile[0] COMPILER_RT_SECTION(
#pragma GCC diagnostic ignored "-Wcast-qual"
#endif
COMPILER_RT_VISIBILITY
-void *__llvm_profile_keep[] = {(void *)&dummy_cnts, (void *)&dummy_bits,
- (void *)&dummy_data, (void *)&dummy_name,
- (void *)&dummy_vnds, (void *)&dummy_orderfile};
+void *__llvm_profile_keep[] = {(void *)&dummy_cnts, (void *)&dummy_bits,
+ (void *)&dummy_data, (void *)&dummy_name,
+ (void *)&dummy_vnds, (void *)&dummy_orderfile,
+ (void *)&dummy_vname, (void *)&dummy_vtab};
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformDarwin.c b/compiler-rt/lib/profile/InstrProfilingPlatformDarwin.c
index 2154d242a817..6adc7f328cbf 100644
--- a/compiler-rt/lib/profile/InstrProfilingPlatformDarwin.c
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformDarwin.c
@@ -36,6 +36,17 @@ extern char
COMPILER_RT_VISIBILITY
extern char BitmapEnd __asm("section$end$__DATA$" INSTR_PROF_BITS_SECT_NAME);
COMPILER_RT_VISIBILITY
+extern VTableProfData
+ VTableProfStart __asm("section$start$__DATA$" INSTR_PROF_VTAB_SECT_NAME);
+COMPILER_RT_VISIBILITY
+extern VTableProfData
+ VTableProfEnd __asm("section$end$__DATA$" INSTR_PROF_VTAB_SECT_NAME);
+COMPILER_RT_VISIBILITY
+extern char
+ VNameStart __asm("section$start$__DATA$" INSTR_PROF_VNAME_SECT_NAME);
+COMPILER_RT_VISIBILITY
+extern char VNameEnd __asm("section$end$__DATA$" INSTR_PROF_VNAME_SECT_NAME);
+COMPILER_RT_VISIBILITY
extern uint32_t
OrderFileStart __asm("section$start$__DATA$" INSTR_PROF_ORDERFILE_SECT_NAME);
@@ -65,6 +76,18 @@ char *__llvm_profile_begin_bitmap(void) { return &BitmapStart; }
COMPILER_RT_VISIBILITY
char *__llvm_profile_end_bitmap(void) { return &BitmapEnd; }
COMPILER_RT_VISIBILITY
+const VTableProfData *__llvm_profile_begin_vtables(void) {
+ return &VTableProfStart;
+}
+COMPILER_RT_VISIBILITY
+const VTableProfData *__llvm_profile_end_vtables(void) {
+ return &VTableProfEnd;
+}
+COMPILER_RT_VISIBILITY
+const char *__llvm_profile_begin_vtabnames(void) { return &VNameStart; }
+COMPILER_RT_VISIBILITY
+const char *__llvm_profile_end_vtabnames(void) { return &VNameEnd; }
+COMPILER_RT_VISIBILITY
uint32_t *__llvm_profile_begin_orderfile(void) { return &OrderFileStart; }
COMPILER_RT_VISIBILITY
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
index 19266ab6c6fb..b766436497b7 100644
--- a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
@@ -24,8 +24,12 @@
#define PROF_DATA_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_DATA_COMMON)
#define PROF_NAME_START INSTR_PROF_SECT_START(INSTR_PROF_NAME_COMMON)
#define PROF_NAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_NAME_COMMON)
+#define PROF_VNAME_START INSTR_PROF_SECT_START(INSTR_PROF_VNAME_COMMON)
+#define PROF_VNAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_VNAME_COMMON)
#define PROF_CNTS_START INSTR_PROF_SECT_START(INSTR_PROF_CNTS_COMMON)
#define PROF_CNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_CNTS_COMMON)
+#define PROF_VTABLE_START INSTR_PROF_SECT_START(INSTR_PROF_VTAB_COMMON)
+#define PROF_VTABLE_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_VTAB_COMMON)
#define PROF_BITS_START INSTR_PROF_SECT_START(INSTR_PROF_BITS_COMMON)
#define PROF_BITS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_BITS_COMMON)
#define PROF_ORDERFILE_START INSTR_PROF_SECT_START(INSTR_PROF_ORDERFILE_COMMON)
@@ -41,6 +45,10 @@ extern __llvm_profile_data PROF_DATA_STOP COMPILER_RT_VISIBILITY
COMPILER_RT_WEAK;
extern char PROF_CNTS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern char PROF_CNTS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+extern VTableProfData PROF_VTABLE_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+extern VTableProfData PROF_VTABLE_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+extern char PROF_VNAME_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+extern char PROF_VNAME_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern char PROF_BITS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern char PROF_BITS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern uint32_t PROF_ORDERFILE_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
@@ -63,6 +71,19 @@ COMPILER_RT_VISIBILITY const char *__llvm_profile_begin_names(void) {
COMPILER_RT_VISIBILITY const char *__llvm_profile_end_names(void) {
return &PROF_NAME_STOP;
}
+COMPILER_RT_VISIBILITY const char *__llvm_profile_begin_vtabnames(void) {
+ return &PROF_VNAME_START;
+}
+COMPILER_RT_VISIBILITY const char *__llvm_profile_end_vtabnames(void) {
+ return &PROF_VNAME_STOP;
+}
+COMPILER_RT_VISIBILITY const VTableProfData *
+__llvm_profile_begin_vtables(void) {
+ return &PROF_VTABLE_START;
+}
+COMPILER_RT_VISIBILITY const VTableProfData *__llvm_profile_end_vtables(void) {
+ return &PROF_VTABLE_STOP;
+}
COMPILER_RT_VISIBILITY char *__llvm_profile_begin_counters(void) {
return &PROF_CNTS_START;
}
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformOther.c b/compiler-rt/lib/profile/InstrProfilingPlatformOther.c
index 5319ca813b43..aa79a5641cec 100644
--- a/compiler-rt/lib/profile/InstrProfilingPlatformOther.c
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformOther.c
@@ -18,8 +18,12 @@
static const __llvm_profile_data *DataFirst = NULL;
static const __llvm_profile_data *DataLast = NULL;
+static const VTableProfData *VTableProfDataFirst = NULL;
+static const VTableProfData *VTableProfDataLast = NULL;
static const char *NamesFirst = NULL;
static const char *NamesLast = NULL;
+static const char *VNamesFirst = NULL;
+static const char *VNamesLast = NULL;
static char *CountersFirst = NULL;
static char *CountersLast = NULL;
static uint32_t *OrderFileFirst = NULL;
@@ -80,11 +84,22 @@ COMPILER_RT_VISIBILITY
const __llvm_profile_data *__llvm_profile_begin_data(void) { return DataFirst; }
COMPILER_RT_VISIBILITY
const __llvm_profile_data *__llvm_profile_end_data(void) { return DataLast; }
+COMPILER_RT_VISIBILITY const VTableProfData *
+__llvm_profile_begin_vtables(void) {
+ return VTableProfDataFirst;
+}
+COMPILER_RT_VISIBILITY const VTableProfData *__llvm_profile_end_vtables(void) {
+ return VTableProfDataLast;
+}
COMPILER_RT_VISIBILITY
const char *__llvm_profile_begin_names(void) { return NamesFirst; }
COMPILER_RT_VISIBILITY
const char *__llvm_profile_end_names(void) { return NamesLast; }
COMPILER_RT_VISIBILITY
+const char *__llvm_profile_begin_vtabnames(void) { return VNamesFirst; }
+COMPILER_RT_VISIBILITY
+const char *__llvm_profile_end_vtabnames(void) { return VNamesLast; }
+COMPILER_RT_VISIBILITY
char *__llvm_profile_begin_counters(void) { return CountersFirst; }
COMPILER_RT_VISIBILITY
char *__llvm_profile_end_counters(void) { return CountersLast; }
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c b/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c
index 0751b28f81d0..b9642ca7f681 100644
--- a/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c
@@ -6,6 +6,8 @@
|*
\*===----------------------------------------------------------------------===*/
+#include <stddef.h>
+
#include "InstrProfiling.h"
#include "InstrProfilingInternal.h"
@@ -59,9 +61,26 @@ const __llvm_profile_data *__llvm_profile_begin_data(void) {
}
const __llvm_profile_data *__llvm_profile_end_data(void) { return &DataEnd; }
+// Type profiling isn't implemented under MSVC ABI, so return NULL (rather than
+// implementing linker magic on Windows) to make it more explicit. To elaborate,
+// the current type profiling implementation maps a profiled vtable address to a
+// vtable variable through vtables mangled name. Under MSVC ABI, the variable
+// name for vtables might not be the mangled name (see
+// MicrosoftCXXABI::getAddrOfVTable in MicrosoftCXXABI.cpp for more details on
+// how a vtable name is computed). Note the mangled name is still in the vtable
+// IR (just not variable name) for mapping purpose, but more implementation work
+// is required.
+const VTableProfData *__llvm_profile_begin_vtables(void) { return NULL; }
+const VTableProfData *__llvm_profile_end_vtables(void) { return NULL; }
+
const char *__llvm_profile_begin_names(void) { return &NamesStart + 1; }
const char *__llvm_profile_end_names(void) { return &NamesEnd; }
+// Type profiling isn't supported on Windows, so return NULL to make it more
+// explicit.
+const char *__llvm_profile_begin_vtabnames(void) { return NULL; }
+const char *__llvm_profile_end_vtabnames(void) { return NULL; }
+
char *__llvm_profile_begin_counters(void) { return &CountersStart + 1; }
char *__llvm_profile_end_counters(void) { return &CountersEnd; }
char *__llvm_profile_begin_bitmap(void) { return &BitmapStart + 1; }
diff --git a/compiler-rt/lib/profile/InstrProfilingWriter.c b/compiler-rt/lib/profile/InstrProfilingWriter.c
index 4d767d138514..8816a7115551 100644
--- a/compiler-rt/lib/profile/InstrProfilingWriter.c
+++ b/compiler-rt/lib/profile/InstrProfilingWriter.c
@@ -250,9 +250,14 @@ COMPILER_RT_VISIBILITY int lprofWriteData(ProfDataWriter *Writer,
const char *BitmapEnd = __llvm_profile_end_bitmap();
const char *NamesBegin = __llvm_profile_begin_names();
const char *NamesEnd = __llvm_profile_end_names();
+ const VTableProfData *VTableBegin = __llvm_profile_begin_vtables();
+ const VTableProfData *VTableEnd = __llvm_profile_end_vtables();
+ const char *VNamesBegin = __llvm_profile_begin_vtabnames();
+ const char *VNamesEnd = __llvm_profile_end_vtabnames();
return lprofWriteDataImpl(Writer, DataBegin, DataEnd, CountersBegin,
CountersEnd, BitmapBegin, BitmapEnd, VPDataReader,
- NamesBegin, NamesEnd, SkipNameDataWrite);
+ NamesBegin, NamesEnd, VTableBegin, VTableEnd,
+ VNamesBegin, VNamesEnd, SkipNameDataWrite);
}
COMPILER_RT_VISIBILITY int
@@ -261,7 +266,9 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin,
const char *CountersBegin, const char *CountersEnd,
const char *BitmapBegin, const char *BitmapEnd,
VPDataReaderType *VPDataReader, const char *NamesBegin,
- const char *NamesEnd, int SkipNameDataWrite) {
+ const char *NamesEnd, const VTableProfData *VTableBegin,
+ const VTableProfData *VTableEnd, const char *VNamesBegin,
+ const char *VNamesEnd, int SkipNameDataWrite) {
/* Calculate size of sections. */
const uint64_t DataSectionSize =
__llvm_profile_get_data_size(DataBegin, DataEnd);
@@ -273,6 +280,12 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin,
const uint64_t NumBitmapBytes =
__llvm_profile_get_num_bitmap_bytes(BitmapBegin, BitmapEnd);
const uint64_t NamesSize = __llvm_profile_get_name_size(NamesBegin, NamesEnd);
+ const uint64_t NumVTables =
+ __llvm_profile_get_num_vtable(VTableBegin, VTableEnd);
+ const uint64_t VTableSectionSize =
+ __llvm_profile_get_vtable_section_size(VTableBegin, VTableEnd);
+ const uint64_t VNamesSize =
+ __llvm_profile_get_name_size(VNamesBegin, VNamesEnd);
/* Create the header. */
__llvm_profile_header Header;
@@ -280,11 +293,15 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin,
/* Determine how much padding is needed before/after the counters and after
* the names. */
uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters,
- PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes;
- __llvm_profile_get_padding_sizes_for_counters(
- DataSectionSize, CountersSectionSize, NumBitmapBytes, NamesSize,
- &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters,
- &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames);
+ PaddingBytesAfterBitmapBytes, PaddingBytesAfterNames,
+ PaddingBytesAfterVTable, PaddingBytesAfterVNames;
+ if (__llvm_profile_get_padding_sizes_for_counters(
+ DataSectionSize, CountersSectionSize, NumBitmapBytes, NamesSize,
+ VTableSectionSize, VNamesSize, &PaddingBytesBeforeCounters,
+ &PaddingBytesAfterCounters, &PaddingBytesAfterBitmapBytes,
+ &PaddingBytesAfterNames, &PaddingBytesAfterVTable,
+ &PaddingBytesAfterVNames) == -1)
+ return -1;
{
/* Initialize header structure. */
@@ -323,7 +340,11 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin,
{BitmapBegin, sizeof(uint8_t), NumBitmapBytes, 0},
{NULL, sizeof(uint8_t), PaddingBytesAfterBitmapBytes, 1},
{SkipNameDataWrite ? NULL : NamesBegin, sizeof(uint8_t), NamesSize, 0},
- {NULL, sizeof(uint8_t), PaddingBytesAfterNames, 1}};
+ {NULL, sizeof(uint8_t), PaddingBytesAfterNames, 1},
+ {VTableBegin, sizeof(uint8_t), VTableSectionSize, 0},
+ {NULL, sizeof(uint8_t), PaddingBytesAfterVTable, 1},
+ {SkipNameDataWrite ? NULL : VNamesBegin, sizeof(uint8_t), VNamesSize, 0},
+ {NULL, sizeof(uint8_t), PaddingBytesAfterVNames, 1}};
if (Writer->Write(Writer, IOVecData, sizeof(IOVecData) / sizeof(*IOVecData)))
return -1;
diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp
index 479e4a0c184f..e810122a824f 100644
--- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp
+++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "sanitizer_common/sanitizer_stackdepot.h"
+#include <algorithm>
#include <atomic>
#include <numeric>
#include <regex>
diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
index cd5a07be1576..fa6077384d98 100644
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -1610,8 +1610,12 @@ private:
// is very important.
RB->RawStackDepotMap.unmap(RB->RawStackDepotMap.getBase(),
RB->RawStackDepotMap.getCapacity());
- RB->RawRingBufferMap.unmap(RB->RawRingBufferMap.getBase(),
- RB->RawRingBufferMap.getCapacity());
+ // Note that the `RB->RawRingBufferMap` is stored on the pages managed by
+ // itself. Take over the ownership before calling unmap() so that any
+ // operation along with unmap() won't touch inaccessible pages.
+ MemMapT RawRingBufferMap = RB->RawRingBufferMap;
+ RawRingBufferMap.unmap(RawRingBufferMap.getBase(),
+ RawRingBufferMap.getCapacity());
atomic_store(&RingBufferAddress, 0, memory_order_release);
}
diff --git a/compiler-rt/test/builtins/Unit/ctor_dtor.c b/compiler-rt/test/builtins/Unit/ctor_dtor.c
index 47560722a9f7..3d5f895a0a1c 100644
--- a/compiler-rt/test/builtins/Unit/ctor_dtor.c
+++ b/compiler-rt/test/builtins/Unit/ctor_dtor.c
@@ -9,23 +9,13 @@
// Ensure the various startup functions are called in the proper order.
-// CHECK: __register_frame_info()
/// ctor() is here if ld.so/libc supports DT_INIT/DT_FINI
// CHECK: main()
/// dtor() is here if ld.so/libc supports DT_INIT/DT_FINI
-// CHECK: __deregister_frame_info()
struct object;
static int counter;
-void __register_frame_info(const void *fi, struct object *obj) {
- printf("__register_frame_info()\n");
-}
-
-void __deregister_frame_info(const void *fi) {
- printf("__deregister_frame_info()\n");
-}
-
void __attribute__((constructor)) ctor() {
printf("ctor()\n");
++counter;
diff --git a/compiler-rt/test/profile/instrprof-basic.c b/compiler-rt/test/profile/instrprof-basic.c
index de66e1b27468..702f521ba4ed 100644
--- a/compiler-rt/test/profile/instrprof-basic.c
+++ b/compiler-rt/test/profile/instrprof-basic.c
@@ -1,6 +1,7 @@
// RUN: %clang_profgen -o %t -O3 %s
// RUN: env LLVM_PROFILE_FILE=%t.profraw %run %t
// RUN: llvm-profdata merge -o %t.profdata %t.profraw
+// RUN: llvm-profdata show --all-functions %t.profdata | FileCheck %s --check-prefix=PROFCNT
// RUN: %clang_profuse=%t.profdata -o - -S -emit-llvm %s | FileCheck %s --check-prefix=COMMON --check-prefix=ORIG
//
// RUN: rm -fr %t.dir1
@@ -8,6 +9,7 @@
// RUN: env LLVM_PROFILE_FILE=%t.dir1/profraw_e_%1m %run %t
// RUN: env LLVM_PROFILE_FILE=%t.dir1/profraw_e_%1m %run %t
// RUN: llvm-profdata merge -o %t.em.profdata %t.dir1
+// RUN: llvm-profdata show --all-functions %t.em.profdata | FileCheck %s --check-prefix=PROFCNT
// RUN: %clang_profuse=%t.em.profdata -o - -S -emit-llvm %s | FileCheck %s --check-prefix=COMMON --check-prefix=MERGE
//
// RUN: rm -fr %t.dir2
@@ -16,6 +18,7 @@
// RUN: %run %t.merge
// RUN: %run %t.merge
// RUN: llvm-profdata merge -o %t.m.profdata %t.dir2/
+// RUN: llvm-profdata show --all-functions %t.m.profdata | FileCheck %s --check-prefix=PROFCNT
// RUN: %clang_profuse=%t.m.profdata -o - -S -emit-llvm %s | FileCheck %s --check-prefix=COMMON --check-prefix=MERGE
//
// Test that merging is enabled by default with -fprofile-generate=
@@ -27,6 +30,7 @@
// RUN: %run %t.merge3
// RUN: %run %t.merge3
// RUN: llvm-profdata merge -o %t.m3.profdata %t.dir3/
+// RUN: llvm-profdata show --all-functions %t.m3.profdata | FileCheck %s --check-prefix=PROFCNT
// RUN: %clang_profuse=%t.m3.profdata -O0 -o - -S -emit-llvm %s | FileCheck %s --check-prefix=COMMON --check-prefix=PGOMERGE
//
// Test that merging is enabled by default with -fprofile-generate
@@ -40,6 +44,7 @@
// RUN: %run %t.dir4/merge4
// RUN: rm -f %t.dir4/merge4*
// RUN: llvm-profdata merge -o %t.m4.profdata ./
+// RUN: llvm-profdata show --all-functions %t.m4.profdata | FileCheck %s --check-prefix=PROFCNT
// RUN: %clang_profuse=%t.m4.profdata -O0 -o - -S -emit-llvm %s | FileCheck %s --check-prefix=COMMON --check-prefix=PGOMERGE
/// Test that the merge pool size can be larger than 10.
@@ -49,6 +54,13 @@
// RUN: not ls %t.dir5/e_%20m.profraw
// RUN: ls %t.dir5/e_*.profraw | count 1
+// Test that all three functions have counters in the profile.
+// PROFCNT-DAG: begin
+// PROFCNT-DAG: end
+// PROFCNT-DAG: main
+// PROFCNT: Functions shown: 3
+// PROFCNT: Total functions: 3
+
int begin(int i) {
// COMMON: br i1 %{{.*}}, label %{{.*}}, label %{{.*}}, !prof ![[PD1:[0-9]+]]
if (i)
diff --git a/compiler-rt/test/profile/instrprof-write-buffer-internal.c b/compiler-rt/test/profile/instrprof-write-buffer-internal.c
index d9670f739ca9..2c1c29ac0c58 100644
--- a/compiler-rt/test/profile/instrprof-write-buffer-internal.c
+++ b/compiler-rt/test/profile/instrprof-write-buffer-internal.c
@@ -31,7 +31,8 @@ char *__llvm_profile_end_bitmap(void);
uint64_t __llvm_profile_get_size_for_buffer_internal(
const void *DataBegin, const void *DataEnd, const char *CountersBegin,
const char *CountersEnd, const char *BitmapBegin, const char *BitmapEnd,
- const char *NamesBegin, const char *NamesEnd);
+ const char *NamesBegin, const char *NamesEnd, const void *VTableBegin,
+ const void *VTableEnd, const char *VNamesBegin, const char *VNamesEnd);
int __llvm_profile_write_buffer_internal(
char *Buffer, const void *DataBegin, const void *DataEnd,
@@ -45,7 +46,8 @@ int main(int argc, const char *argv[]) {
__llvm_profile_begin_data(), __llvm_profile_end_data(),
__llvm_profile_begin_counters(), __llvm_profile_end_counters(),
__llvm_profile_begin_bitmap(), __llvm_profile_end_bitmap(),
- __llvm_profile_begin_names(), __llvm_profile_end_names());
+ __llvm_profile_begin_names(), __llvm_profile_end_names(), NULL, NULL,
+ NULL, NULL);
char *buf = malloc(bufsize);
int ret = __llvm_profile_write_buffer_internal(
diff --git a/flang/include/flang/Common/float128.h b/flang/include/flang/Common/float128.h
index 3443aa06437b..2e76bc0a162e 100644
--- a/flang/include/flang/Common/float128.h
+++ b/flang/include/flang/Common/float128.h
@@ -20,6 +20,8 @@
#ifndef FORTRAN_COMMON_FLOAT128_H_
#define FORTRAN_COMMON_FLOAT128_H_
+#include <float.h>
+
#ifdef __cplusplus
/*
* libc++ does not fully support __float128 right now, e.g.
@@ -49,4 +51,25 @@
#endif /* (defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__)) && \
!defined(_LIBCPP_VERSION) && !defined(__CUDA_ARCH__) */
+/* Define pure C CFloat128Type and CFloat128ComplexType. */
+#if LDBL_MANT_DIG == 113
+typedef long double CFloat128Type;
+#ifndef __cplusplus
+typedef long double _Complex CFloat128ComplexType;
+#endif
+#elif HAS_FLOAT128
+typedef __float128 CFloat128Type;
+
+#ifndef __cplusplus
+/*
+ * Use mode() attribute supported by GCC and Clang.
+ * Adjust it for other compilers as needed.
+ */
+#if !defined(_ARCH_PPC) || defined(__LONG_DOUBLE_IEEE128__)
+typedef _Complex float __attribute__((mode(TC))) CFloat128ComplexType;
+#else
+typedef _Complex float __attribute__((mode(KC))) CFloat128ComplexType;
+#endif
+#endif // __cplusplus
+#endif
#endif /* FORTRAN_COMMON_FLOAT128_H_ */
diff --git a/flang/include/flang/ISO_Fortran_binding.h b/flang/include/flang/ISO_Fortran_binding.h
index 4a28d3322a38..3f74a7e56f17 100644
--- a/flang/include/flang/ISO_Fortran_binding.h
+++ b/flang/include/flang/ISO_Fortran_binding.h
@@ -125,7 +125,7 @@ namespace cfi_internal {
// The below structure emulates a flexible array. This structure does not take
// care of getting the memory storage. Note that it already contains one element
// because a struct cannot be empty.
-template <typename T> struct FlexibleArray : T {
+extern "C++" template <typename T> struct FlexibleArray : T {
RT_API_ATTRS T &operator[](int index) { return *(this + index); }
const RT_API_ATTRS T &operator[](int index) const { return *(this + index); }
RT_API_ATTRS operator T *() { return this; }
@@ -163,12 +163,12 @@ typedef struct CFI_cdesc_t {
// needed, for C++'s CFI_cdesc_t's emulated flexible
// dim[] array.
namespace cfi_internal {
-template <int r> struct CdescStorage : public CFI_cdesc_t {
+extern "C++" template <int r> struct CdescStorage : public CFI_cdesc_t {
static_assert((r > 1 && r <= CFI_MAX_RANK), "CFI_INVALID_RANK");
CFI_dim_t dim[r - 1];
};
-template <> struct CdescStorage<1> : public CFI_cdesc_t {};
-template <> struct CdescStorage<0> : public CFI_cdesc_t {};
+extern "C++" template <> struct CdescStorage<1> : public CFI_cdesc_t {};
+extern "C++" template <> struct CdescStorage<0> : public CFI_cdesc_t {};
} // namespace cfi_internal
#define CFI_CDESC_T(rank) \
FORTRAN_ISO_NAMESPACE_::cfi_internal::CdescStorage<rank>
diff --git a/flang/include/flang/Lower/AbstractConverter.h b/flang/include/flang/Lower/AbstractConverter.h
index e2af59e0aaa1..32e7a5e2b040 100644
--- a/flang/include/flang/Lower/AbstractConverter.h
+++ b/flang/include/flang/Lower/AbstractConverter.h
@@ -53,6 +53,7 @@ class DerivedTypeSpec;
namespace lower {
class SymMap;
+struct SymbolBox;
namespace pft {
struct Variable;
}
@@ -299,6 +300,11 @@ public:
return loweringOptions;
}
+ /// Find the symbol in one level up of symbol map such as for host-association
+ /// in OpenMP code or return null.
+ virtual Fortran::lower::SymbolBox
+ lookupOneLevelUpSymbol(const Fortran::semantics::Symbol &sym) = 0;
+
private:
/// Options controlling lowering behavior.
const Fortran::lower::LoweringOptions &loweringOptions;
diff --git a/flang/include/flang/Lower/PFTBuilder.h b/flang/include/flang/Lower/PFTBuilder.h
index c2b0fdbf357c..9913f584133f 100644
--- a/flang/include/flang/Lower/PFTBuilder.h
+++ b/flang/include/flang/Lower/PFTBuilder.h
@@ -138,7 +138,8 @@ using Directives =
std::tuple<parser::CompilerDirective, parser::OpenACCConstruct,
parser::OpenACCRoutineConstruct,
parser::OpenACCDeclarativeConstruct, parser::OpenMPConstruct,
- parser::OpenMPDeclarativeConstruct, parser::OmpEndLoopDirective>;
+ parser::OpenMPDeclarativeConstruct, parser::OmpEndLoopDirective,
+ parser::CUFKernelDoConstruct>;
using DeclConstructs = std::tuple<parser::OpenMPDeclarativeConstruct,
parser::OpenACCDeclarativeConstruct>;
@@ -178,7 +179,7 @@ static constexpr bool isNopConstructStmt{common::HasMember<
template <typename A>
static constexpr bool isExecutableDirective{common::HasMember<
A, std::tuple<parser::CompilerDirective, parser::OpenACCConstruct,
- parser::OpenMPConstruct>>};
+ parser::OpenMPConstruct, parser::CUFKernelDoConstruct>>};
template <typename A>
static constexpr bool isFunctionLike{common::HasMember<
diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
index 39821f1036c6..bd9b67b14b96 100644
--- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
@@ -688,6 +688,9 @@ fir::BoxValue createBoxValue(fir::FirOpBuilder &builder, mlir::Location loc,
/// Generate Null BoxProc for procedure pointer null initialization.
mlir::Value createNullBoxProc(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Type boxType);
+
+/// Set internal linkage attribute on a function.
+void setInternalLinkage(mlir::func::FuncOp);
} // namespace fir::factory
#endif // FORTRAN_OPTIMIZER_BUILDER_FIRBUILDER_H
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index 08239230f793..db5e5f4bc682 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -3127,4 +3127,31 @@ def fir_BoxOffsetOp : fir_Op<"box_offset", [NoMemoryEffect]> {
];
}
+def fir_CUDAKernelOp : fir_Op<"cuda_kernel", [AttrSizedOperandSegments,
+ DeclareOpInterfaceMethods<LoopLikeOpInterface>]> {
+
+ let arguments = (ins
+ Variadic<I32>:$grid, // empty means `*`
+ Variadic<I32>:$block, // empty means `*`
+ Optional<I32>:$stream,
+ Variadic<Index>:$lowerbound,
+ Variadic<Index>:$upperbound,
+ Variadic<Index>:$step,
+ OptionalAttr<I64Attr>:$n
+ );
+
+ let regions = (region AnyRegion:$region);
+
+ let assemblyFormat = [{
+ `<` `<` `<` custom<CUFKernelValues>($grid, type($grid)) `,`
+ custom<CUFKernelValues>($block, type($block))
+ ( `,` `stream` `=` $stream^ )? `>` `>` `>`
+ custom<CUFKernelLoopControl>($region, $lowerbound, type($lowerbound),
+ $upperbound, type($upperbound), $step, type($step))
+ attr-dict
+ }];
+
+ let hasVerifier = 1;
+}
+
#endif
diff --git a/flang/include/flang/Runtime/reduction.h b/flang/include/flang/Runtime/reduction.h
index b91fec0cd26b..5b6077658575 100644
--- a/flang/include/flang/Runtime/reduction.h
+++ b/flang/include/flang/Runtime/reduction.h
@@ -92,9 +92,11 @@ void RTDECL(CppSumComplex8)(std::complex<double> &, const Descriptor &,
void RTDECL(CppSumComplex10)(std::complex<long double> &, const Descriptor &,
const char *source, int line, int dim = 0,
const Descriptor *mask = nullptr);
-void RTDECL(CppSumComplex16)(std::complex<long double> &, const Descriptor &,
- const char *source, int line, int dim = 0,
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+void RTDECL(CppSumComplex16)(std::complex<CppFloat128Type> &,
+ const Descriptor &, const char *source, int line, int dim = 0,
const Descriptor *mask = nullptr);
+#endif
void RTDECL(SumDim)(Descriptor &result, const Descriptor &array, int dim,
const char *source, int line, const Descriptor *mask = nullptr);
@@ -145,12 +147,16 @@ void RTDECL(CppProductComplex4)(std::complex<float> &, const Descriptor &,
void RTDECL(CppProductComplex8)(std::complex<double> &, const Descriptor &,
const char *source, int line, int dim = 0,
const Descriptor *mask = nullptr);
+#if LDBL_MANT_DIG == 64
void RTDECL(CppProductComplex10)(std::complex<long double> &,
const Descriptor &, const char *source, int line, int dim = 0,
const Descriptor *mask = nullptr);
-void RTDECL(CppProductComplex16)(std::complex<long double> &,
+#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+void RTDECL(CppProductComplex16)(std::complex<CppFloat128Type> &,
const Descriptor &, const char *source, int line, int dim = 0,
const Descriptor *mask = nullptr);
+#endif
void RTDECL(ProductDim)(Descriptor &result, const Descriptor &array, int dim,
const char *source, int line, const Descriptor *mask = nullptr);
@@ -358,9 +364,12 @@ double RTDECL(Norm2_8)(
#if LDBL_MANT_DIG == 64
long double RTDECL(Norm2_10)(
const Descriptor &, const char *source, int line, int dim = 0);
-#elif LDBL_MANT_DIG == 113
+#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
long double RTDECL(Norm2_16)(
const Descriptor &, const char *source, int line, int dim = 0);
+void RTDECL(Norm2DimReal16)(
+ Descriptor &, const Descriptor &, int dim, const char *source, int line);
#endif
void RTDECL(Norm2Dim)(
Descriptor &, const Descriptor &, int dim, const char *source, int line);
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 83555e7cd82e..153ce0623ab3 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -1000,6 +1000,17 @@ private:
if (sym.detailsIf<Fortran::semantics::CommonBlockDetails>())
return symMap->lookupSymbol(sym);
+ // For symbols to be privatized in OMP, the symbol is mapped to an
+ // instance of `SymbolBox::Intrinsic` (i.e. a direct mapping to an MLIR
+ // SSA value). This MLIR SSA value is the block argument to the
+ // `omp.private`'s `alloc` block. If this is the case, we return this
+ // `SymbolBox::Intrinsic` value.
+ if (Fortran::lower::SymbolBox v = symMap->lookupSymbol(sym))
+ return v.match(
+ [&](const Fortran::lower::SymbolBox::Intrinsic &)
+ -> Fortran::lower::SymbolBox { return v; },
+ [](const auto &) -> Fortran::lower::SymbolBox { return {}; });
+
return {};
}
if (Fortran::lower::SymbolBox v = symMap->lookupSymbol(sym))
@@ -1018,7 +1029,7 @@ private:
/// Find the symbol in one level up of symbol map such as for host-association
/// in OpenMP code or return null.
Fortran::lower::SymbolBox
- lookupOneLevelUpSymbol(const Fortran::semantics::Symbol &sym) {
+ lookupOneLevelUpSymbol(const Fortran::semantics::Symbol &sym) override {
if (Fortran::lower::SymbolBox v = localSymbols.lookupOneLevelUpSymbol(sym))
return v;
return {};
@@ -2474,6 +2485,135 @@ private:
// Handled by genFIR(const Fortran::parser::OpenACCDeclarativeConstruct &)
}
+ void genFIR(const Fortran::parser::CUFKernelDoConstruct &kernel) {
+ localSymbols.pushScope();
+ const Fortran::parser::CUFKernelDoConstruct::Directive &dir =
+ std::get<Fortran::parser::CUFKernelDoConstruct::Directive>(kernel.t);
+
+ mlir::Location loc = genLocation(dir.source);
+
+ Fortran::lower::StatementContext stmtCtx;
+
+ unsigned nestedLoops = 1;
+
+ const auto &nLoops =
+ std::get<std::optional<Fortran::parser::ScalarIntConstantExpr>>(dir.t);
+ if (nLoops)
+ nestedLoops = *Fortran::semantics::GetIntValue(*nLoops);
+
+ mlir::IntegerAttr n;
+ if (nestedLoops > 1)
+ n = builder->getIntegerAttr(builder->getI64Type(), nestedLoops);
+
+ const std::list<Fortran::parser::ScalarIntExpr> &grid = std::get<1>(dir.t);
+ const std::list<Fortran::parser::ScalarIntExpr> &block = std::get<2>(dir.t);
+ const std::optional<Fortran::parser::ScalarIntExpr> &stream =
+ std::get<3>(dir.t);
+
+ llvm::SmallVector<mlir::Value> gridValues;
+ for (const Fortran::parser::ScalarIntExpr &expr : grid)
+ gridValues.push_back(fir::getBase(
+ genExprValue(*Fortran::semantics::GetExpr(expr), stmtCtx)));
+ llvm::SmallVector<mlir::Value> blockValues;
+ for (const Fortran::parser::ScalarIntExpr &expr : block)
+ blockValues.push_back(fir::getBase(
+ genExprValue(*Fortran::semantics::GetExpr(expr), stmtCtx)));
+ mlir::Value streamValue;
+ if (stream)
+ streamValue = fir::getBase(
+ genExprValue(*Fortran::semantics::GetExpr(*stream), stmtCtx));
+
+ const auto &outerDoConstruct =
+ std::get<std::optional<Fortran::parser::DoConstruct>>(kernel.t);
+
+ llvm::SmallVector<mlir::Location> locs;
+ locs.push_back(loc);
+ llvm::SmallVector<mlir::Value> lbs, ubs, steps;
+
+ mlir::Type idxTy = builder->getIndexType();
+
+ llvm::SmallVector<mlir::Type> ivTypes;
+ llvm::SmallVector<mlir::Location> ivLocs;
+ llvm::SmallVector<mlir::Value> ivValues;
+ for (unsigned i = 0; i < nestedLoops; ++i) {
+ const Fortran::parser::LoopControl *loopControl;
+ Fortran::lower::pft::Evaluation *loopEval =
+ &getEval().getFirstNestedEvaluation();
+
+ mlir::Location crtLoc = loc;
+ if (i == 0) {
+ loopControl = &*outerDoConstruct->GetLoopControl();
+ crtLoc =
+ genLocation(Fortran::parser::FindSourceLocation(outerDoConstruct));
+ } else {
+ auto *doCons = loopEval->getIf<Fortran::parser::DoConstruct>();
+ assert(doCons && "expect do construct");
+ loopControl = &*doCons->GetLoopControl();
+ crtLoc = genLocation(Fortran::parser::FindSourceLocation(*doCons));
+ }
+
+ locs.push_back(crtLoc);
+
+ const Fortran::parser::LoopControl::Bounds *bounds =
+ std::get_if<Fortran::parser::LoopControl::Bounds>(&loopControl->u);
+ assert(bounds && "Expected bounds on the loop construct");
+
+ Fortran::semantics::Symbol &ivSym =
+ bounds->name.thing.symbol->GetUltimate();
+ ivValues.push_back(getSymbolAddress(ivSym));
+
+ lbs.push_back(builder->createConvert(
+ crtLoc, idxTy,
+ fir::getBase(genExprValue(*Fortran::semantics::GetExpr(bounds->lower),
+ stmtCtx))));
+ ubs.push_back(builder->createConvert(
+ crtLoc, idxTy,
+ fir::getBase(genExprValue(*Fortran::semantics::GetExpr(bounds->upper),
+ stmtCtx))));
+ if (bounds->step)
+ steps.push_back(fir::getBase(
+ genExprValue(*Fortran::semantics::GetExpr(bounds->step), stmtCtx)));
+ else // If `step` is not present, assume it is `1`.
+ steps.push_back(builder->createIntegerConstant(loc, idxTy, 1));
+
+ ivTypes.push_back(idxTy);
+ ivLocs.push_back(crtLoc);
+ if (i < nestedLoops - 1)
+ loopEval = &*std::next(loopEval->getNestedEvaluations().begin());
+ }
+
+ auto op = builder->create<fir::CUDAKernelOp>(
+ loc, gridValues, blockValues, streamValue, lbs, ubs, steps, n);
+ builder->createBlock(&op.getRegion(), op.getRegion().end(), ivTypes,
+ ivLocs);
+ mlir::Block &b = op.getRegion().back();
+ builder->setInsertionPointToStart(&b);
+
+ for (auto [arg, value] : llvm::zip(
+ op.getLoopRegions().front()->front().getArguments(), ivValues)) {
+ mlir::Value convArg =
+ builder->createConvert(loc, fir::unwrapRefType(value.getType()), arg);
+ builder->create<fir::StoreOp>(loc, convArg, value);
+ }
+
+ builder->create<fir::FirEndOp>(loc);
+ builder->setInsertionPointToStart(&b);
+
+ Fortran::lower::pft::Evaluation *crtEval = &getEval();
+ if (crtEval->lowerAsStructured()) {
+ crtEval = &crtEval->getFirstNestedEvaluation();
+ for (int64_t i = 1; i < nestedLoops; i++)
+ crtEval = &*std::next(crtEval->getNestedEvaluations().begin());
+ }
+
+ // Generate loop body
+ for (Fortran::lower::pft::Evaluation &e : crtEval->getNestedEvaluations())
+ genFIR(e);
+
+ builder->setInsertionPointAfter(op);
+ localSymbols.popScope();
+ }
+
void genFIR(const Fortran::parser::OpenMPConstruct &omp) {
mlir::OpBuilder::InsertPoint insertPt = builder->saveInsertionPoint();
genOpenMPConstruct(*this, localSymbols, bridge.getSemanticsContext(),
@@ -4348,7 +4488,16 @@ private:
assert(builder && "FirOpBuilder did not instantiate");
builder->setFastMathFlags(bridge.getLoweringOptions().getMathOptions());
builder->setInsertionPointToStart(&func.front());
- func.setVisibility(mlir::SymbolTable::Visibility::Public);
+ if (funit.parent.isA<Fortran::lower::pft::FunctionLikeUnit>()) {
+ // Give internal linkage to internal functions. There are no name clash
+ // risks, but giving global linkage to internal procedure will break the
+ // static link register in shared libraries because of the system calls.
+ // Also, it should be possible to eliminate the procedure code if all the
+ // uses have been inlined.
+ fir::factory::setInternalLinkage(func);
+ } else {
+ func.setVisibility(mlir::SymbolTable::Visibility::Public);
+ }
assert(blockId == 0 && "invalid blockId");
assert(activeConstructStack.empty() && "invalid construct stack state");
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index b2279a319fe9..a673a18cd20d 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -172,9 +172,12 @@ static fir::GlobalOp declareGlobal(Fortran::lower::AbstractConverter &converter,
!Fortran::semantics::IsProcedurePointer(ultimate))
mlir::emitError(loc, "processing global declaration: symbol '")
<< toStringRef(sym.name()) << "' has unexpected details\n";
+ fir::CUDADataAttributeAttr cudaAttr =
+ Fortran::lower::translateSymbolCUDADataAttribute(
+ converter.getFirOpBuilder().getContext(), sym);
return builder.createGlobal(loc, converter.genType(var), globalName, linkage,
mlir::Attribute{}, isConstant(ultimate),
- var.isTarget());
+ var.isTarget(), cudaAttr);
}
/// Temporary helper to catch todos in initial data target lowering.
@@ -1586,7 +1589,7 @@ fir::FortranVariableFlagsAttr Fortran::lower::translateSymbolAttributes(
fir::CUDADataAttributeAttr Fortran::lower::translateSymbolCUDADataAttribute(
mlir::MLIRContext *mlirContext, const Fortran::semantics::Symbol &sym) {
std::optional<Fortran::common::CUDADataAttr> cudaAttr =
- Fortran::semantics::GetCUDADataAttr(&sym);
+ Fortran::semantics::GetCUDADataAttr(&sym.GetUltimate());
return fir::getCUDADataAttribute(mlirContext, cudaAttr);
}
diff --git a/flang/lib/Lower/HostAssociations.cpp b/flang/lib/Lower/HostAssociations.cpp
index a62f7a7e99b6..b9e13ccad1c9 100644
--- a/flang/lib/Lower/HostAssociations.cpp
+++ b/flang/lib/Lower/HostAssociations.cpp
@@ -247,9 +247,11 @@ public:
}
};
-/// Class defining how polymorphic entities are captured in internal procedures.
-/// Polymorphic entities are always boxed as a fir.class box.
-class CapturedPolymorphic : public CapturedSymbols<CapturedPolymorphic> {
+/// Class defining how polymorphic scalar entities are captured in internal
+/// procedures. Polymorphic entities are always boxed as a fir.class box.
+/// Polymorphic array can be handled in CapturedArrays directly
+class CapturedPolymorphicScalar
+ : public CapturedSymbols<CapturedPolymorphicScalar> {
public:
static mlir::Type getType(Fortran::lower::AbstractConverter &converter,
const Fortran::semantics::Symbol &sym) {
@@ -257,19 +259,50 @@ public:
}
static void instantiateHostTuple(const InstantiateHostTuple &args,
Fortran::lower::AbstractConverter &converter,
- const Fortran::semantics::Symbol &) {
+ const Fortran::semantics::Symbol &sym) {
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+ mlir::Location loc = args.loc;
mlir::Type typeInTuple = fir::dyn_cast_ptrEleTy(args.addrInTuple.getType());
assert(typeInTuple && "addrInTuple must be an address");
mlir::Value castBox = builder.createConvert(args.loc, typeInTuple,
fir::getBase(args.hostValue));
- builder.create<fir::StoreOp>(args.loc, castBox, args.addrInTuple);
+ if (Fortran::semantics::IsOptional(sym)) {
+ auto isPresent =
+ builder.create<fir::IsPresentOp>(loc, builder.getI1Type(), castBox);
+ builder.genIfThenElse(loc, isPresent)
+ .genThen([&]() {
+ builder.create<fir::StoreOp>(loc, castBox, args.addrInTuple);
+ })
+ .genElse([&]() {
+ mlir::Value null = fir::factory::createUnallocatedBox(
+ builder, loc, typeInTuple,
+ /*nonDeferredParams=*/mlir::ValueRange{});
+ builder.create<fir::StoreOp>(loc, null, args.addrInTuple);
+ })
+ .end();
+ } else {
+ builder.create<fir::StoreOp>(loc, castBox, args.addrInTuple);
+ }
}
static void getFromTuple(const GetFromTuple &args,
Fortran::lower::AbstractConverter &converter,
const Fortran::semantics::Symbol &sym,
const Fortran::lower::BoxAnalyzer &ba) {
- bindCapturedSymbol(sym, args.valueInTuple, converter, args.symMap);
+ fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+ mlir::Location loc = args.loc;
+ mlir::Value box = args.valueInTuple;
+ if (Fortran::semantics::IsOptional(sym)) {
+ auto boxTy = box.getType().cast<fir::BaseBoxType>();
+ auto eleTy = boxTy.getEleTy();
+ if (!fir::isa_ref_type(eleTy))
+ eleTy = builder.getRefType(eleTy);
+ auto addr = builder.create<fir::BoxAddrOp>(loc, eleTy, box);
+ mlir::Value isPresent = builder.genIsNotNullAddr(loc, addr);
+ auto absentBox = builder.create<fir::AbsentOp>(loc, boxTy);
+ box =
+ builder.create<mlir::arith::SelectOp>(loc, isPresent, box, absentBox);
+ }
+ bindCapturedSymbol(sym, box, converter, args.symMap);
}
};
@@ -342,7 +375,12 @@ public:
static mlir::Type getType(Fortran::lower::AbstractConverter &converter,
const Fortran::semantics::Symbol &sym) {
mlir::Type type = converter.genType(sym);
- assert(type.isa<fir::SequenceType>() && "must be a sequence type");
+ bool isPolymorphic = Fortran::semantics::IsPolymorphic(sym);
+ assert((type.isa<fir::SequenceType>() ||
+ (isPolymorphic && type.isa<fir::ClassType>())) &&
+ "must be a sequence type");
+ if (isPolymorphic)
+ return type;
return fir::BoxType::get(type);
}
@@ -410,13 +448,13 @@ public:
fir::factory::readBoxValue(builder, loc, boxValue),
converter, args.symMap);
} else {
- // Keep variable as a fir.box.
+ // Keep variable as a fir.box/fir.class.
// If this is an optional that is absent, the fir.box needs to be an
// AbsentOp result, otherwise it will not work properly with IsPresentOp
// (absent boxes are null descriptor addresses, not descriptors containing
// a null base address).
if (Fortran::semantics::IsOptional(sym)) {
- auto boxTy = box.getType().cast<fir::BoxType>();
+ auto boxTy = box.getType().cast<fir::BaseBoxType>();
auto eleTy = boxTy.getEleTy();
if (!fir::isa_ref_type(eleTy))
eleTy = builder.getRefType(eleTy);
@@ -470,14 +508,10 @@ walkCaptureCategories(T visitor, Fortran::lower::AbstractConverter &converter,
ba.analyze(sym);
if (Fortran::semantics::IsAllocatableOrPointer(sym))
return CapturedAllocatableAndPointer::visit(visitor, converter, sym, ba);
- if (Fortran::semantics::IsPolymorphic(sym)) {
- if (ba.isArray() && !ba.lboundIsAllOnes())
- TODO(converter.genLocation(sym.name()),
- "polymorphic array with non default lower bound");
- return CapturedPolymorphic::visit(visitor, converter, sym, ba);
- }
if (ba.isArray())
return CapturedArrays::visit(visitor, converter, sym, ba);
+ if (Fortran::semantics::IsPolymorphic(sym))
+ return CapturedPolymorphicScalar::visit(visitor, converter, sym, ba);
if (ba.isChar())
return CapturedCharacterScalars::visit(visitor, converter, sym, ba);
assert(ba.isTrivial() && "must be trivial scalar");
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index 136bda0b582e..717b8cc0276a 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -14,6 +14,7 @@
#include "Utils.h"
#include "flang/Lower/PFTBuilder.h"
+#include "flang/Lower/SymbolMap.h"
#include "flang/Optimizer/Builder/Todo.h"
#include "flang/Semantics/tools.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
@@ -66,9 +67,10 @@ void DataSharingProcessor::cloneSymbol(const Fortran::semantics::Symbol *sym) {
}
void DataSharingProcessor::copyFirstPrivateSymbol(
- const Fortran::semantics::Symbol *sym) {
+ const Fortran::semantics::Symbol *sym,
+ mlir::OpBuilder::InsertPoint *copyAssignIP) {
if (sym->test(Fortran::semantics::Symbol::Flag::OmpFirstPrivate))
- converter.copyHostAssociateVar(*sym);
+ converter.copyHostAssociateVar(*sym, copyAssignIP);
}
void DataSharingProcessor::copyLastPrivateSymbol(
@@ -307,14 +309,10 @@ void DataSharingProcessor::privatize() {
for (const Fortran::semantics::Symbol *sym : privatizedSymbols) {
if (const auto *commonDet =
sym->detailsIf<Fortran::semantics::CommonBlockDetails>()) {
- for (const auto &mem : commonDet->objects()) {
- cloneSymbol(&*mem);
- copyFirstPrivateSymbol(&*mem);
- }
- } else {
- cloneSymbol(sym);
- copyFirstPrivateSymbol(sym);
- }
+ for (const auto &mem : commonDet->objects())
+ doPrivatize(&*mem);
+ } else
+ doPrivatize(sym);
}
}
@@ -338,11 +336,95 @@ void DataSharingProcessor::defaultPrivatize() {
!sym->GetUltimate().has<Fortran::semantics::NamelistDetails>() &&
!symbolsInNestedRegions.contains(sym) &&
!symbolsInParentRegions.contains(sym) &&
- !privatizedSymbols.contains(sym)) {
+ !privatizedSymbols.contains(sym))
+ doPrivatize(sym);
+ }
+}
+
+void DataSharingProcessor::doPrivatize(const Fortran::semantics::Symbol *sym) {
+ if (!useDelayedPrivatization) {
+ cloneSymbol(sym);
+ copyFirstPrivateSymbol(sym);
+ return;
+ }
+
+ Fortran::lower::SymbolBox hsb = converter.lookupOneLevelUpSymbol(*sym);
+ assert(hsb && "Host symbol box not found");
+
+ mlir::Type symType = hsb.getAddr().getType();
+ mlir::Location symLoc = hsb.getAddr().getLoc();
+ std::string privatizerName = sym->name().ToString() + ".privatizer";
+ bool isFirstPrivate =
+ sym->test(Fortran::semantics::Symbol::Flag::OmpFirstPrivate);
+
+ mlir::omp::PrivateClauseOp privatizerOp = [&]() {
+ auto moduleOp = firOpBuilder.getModule();
+ auto uniquePrivatizerName = fir::getTypeAsString(
+ symType, converter.getKindMap(),
+ converter.mangleName(*sym) +
+ (isFirstPrivate ? "_firstprivate" : "_private"));
+
+ if (auto existingPrivatizer =
+ moduleOp.lookupSymbol<mlir::omp::PrivateClauseOp>(
+ uniquePrivatizerName))
+ return existingPrivatizer;
+
+ auto ip = firOpBuilder.saveInsertionPoint();
+ firOpBuilder.setInsertionPoint(&moduleOp.getBodyRegion().front(),
+ moduleOp.getBodyRegion().front().begin());
+ auto result = firOpBuilder.create<mlir::omp::PrivateClauseOp>(
+ symLoc, uniquePrivatizerName, symType,
+ isFirstPrivate ? mlir::omp::DataSharingClauseType::FirstPrivate
+ : mlir::omp::DataSharingClauseType::Private);
+
+ symTable->pushScope();
+
+ // Populate the `alloc` region.
+ {
+ mlir::Region &allocRegion = result.getAllocRegion();
+ mlir::Block *allocEntryBlock = firOpBuilder.createBlock(
+ &allocRegion, /*insertPt=*/{}, symType, symLoc);
+
+ firOpBuilder.setInsertionPointToEnd(allocEntryBlock);
+ symTable->addSymbol(*sym, allocRegion.getArgument(0));
+ symTable->pushScope();
cloneSymbol(sym);
- copyFirstPrivateSymbol(sym);
+ firOpBuilder.create<mlir::omp::YieldOp>(
+ hsb.getAddr().getLoc(),
+ symTable->shallowLookupSymbol(*sym).getAddr());
+ symTable->popScope();
}
- }
+
+ // Populate the `copy` region if this is a `firstprivate`.
+ if (isFirstPrivate) {
+ mlir::Region &copyRegion = result.getCopyRegion();
+ // First block argument corresponding to the original/host value while
+ // second block argument corresponding to the privatized value.
+ mlir::Block *copyEntryBlock = firOpBuilder.createBlock(
+ &copyRegion, /*insertPt=*/{}, {symType, symType}, {symLoc, symLoc});
+ firOpBuilder.setInsertionPointToEnd(copyEntryBlock);
+ symTable->addSymbol(*sym, copyRegion.getArgument(0),
+ /*force=*/true);
+ symTable->pushScope();
+ symTable->addSymbol(*sym, copyRegion.getArgument(1));
+ auto ip = firOpBuilder.saveInsertionPoint();
+ copyFirstPrivateSymbol(sym, &ip);
+
+ firOpBuilder.create<mlir::omp::YieldOp>(
+ hsb.getAddr().getLoc(),
+ symTable->shallowLookupSymbol(*sym).getAddr());
+ symTable->popScope();
+ }
+
+ symTable->popScope();
+ firOpBuilder.restoreInsertionPoint(ip);
+ return result;
+ }();
+
+ delayedPrivatizationInfo.privatizers.push_back(
+ mlir::SymbolRefAttr::get(privatizerOp));
+ delayedPrivatizationInfo.originalAddresses.push_back(hsb.getAddr());
+ delayedPrivatizationInfo.symbols.push_back(sym);
}
} // namespace omp
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.h b/flang/lib/Lower/OpenMP/DataSharingProcessor.h
index 10c0a30c09c3..9f7301df0759 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.h
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.h
@@ -23,6 +23,24 @@ namespace lower {
namespace omp {
class DataSharingProcessor {
+public:
+ /// Collects all the information needed for delayed privatization. This can be
+ /// used by ops with data-sharing clauses to properly generate their regions
+ /// (e.g. add region arguments) and map the original SSA values to their
+ /// corresponding OMP region operands.
+ struct DelayedPrivatizationInfo {
+ // The list of symbols referring to delayed privatizer ops (i.e.
+ // `omp.private` ops).
+ llvm::SmallVector<mlir::SymbolRefAttr> privatizers;
+ // SSA values that correspond to "original" values being privatized.
+ // "Original" here means the SSA value outside the OpenMP region from which
+ // a clone is created inside the region.
+ llvm::SmallVector<mlir::Value> originalAddresses;
+ // Fortran symbols corresponding to the above SSA values.
+ llvm::SmallVector<const Fortran::semantics::Symbol *> symbols;
+ };
+
+private:
bool hasLastPrivateOp;
mlir::OpBuilder::InsertPoint lastPrivIP;
mlir::OpBuilder::InsertPoint insPt;
@@ -36,6 +54,9 @@ class DataSharingProcessor {
fir::FirOpBuilder &firOpBuilder;
const Fortran::parser::OmpClauseList &opClauseList;
Fortran::lower::pft::Evaluation &eval;
+ bool useDelayedPrivatization;
+ Fortran::lower::SymMap *symTable;
+ DelayedPrivatizationInfo delayedPrivatizationInfo;
bool needBarrier();
void collectSymbols(Fortran::semantics::Symbol::Flag flag);
@@ -47,10 +68,13 @@ class DataSharingProcessor {
void collectDefaultSymbols();
void privatize();
void defaultPrivatize();
+ void doPrivatize(const Fortran::semantics::Symbol *sym);
void copyLastPrivatize(mlir::Operation *op);
void insertLastPrivateCompare(mlir::Operation *op);
void cloneSymbol(const Fortran::semantics::Symbol *sym);
- void copyFirstPrivateSymbol(const Fortran::semantics::Symbol *sym);
+ void
+ copyFirstPrivateSymbol(const Fortran::semantics::Symbol *sym,
+ mlir::OpBuilder::InsertPoint *copyAssignIP = nullptr);
void copyLastPrivateSymbol(const Fortran::semantics::Symbol *sym,
mlir::OpBuilder::InsertPoint *lastPrivIP);
void insertDeallocs();
@@ -58,10 +82,14 @@ class DataSharingProcessor {
public:
DataSharingProcessor(Fortran::lower::AbstractConverter &converter,
const Fortran::parser::OmpClauseList &opClauseList,
- Fortran::lower::pft::Evaluation &eval)
+ Fortran::lower::pft::Evaluation &eval,
+ bool useDelayedPrivatization = false,
+ Fortran::lower::SymMap *symTable = nullptr)
: hasLastPrivateOp(false), converter(converter),
firOpBuilder(converter.getFirOpBuilder()), opClauseList(opClauseList),
- eval(eval) {}
+ eval(eval), useDelayedPrivatization(useDelayedPrivatization),
+ symTable(symTable) {}
+
// Privatisation is split into two steps.
// Step1 performs cloning of all privatisation clauses and copying for
// firstprivates. Step1 is performed at the place where process/processStep1
@@ -80,6 +108,10 @@ public:
assert(!loopIV && "Loop iteration variable already set");
loopIV = iv;
}
+
+ const DelayedPrivatizationInfo &getDelayedPrivatizationInfo() const {
+ return delayedPrivatizationInfo;
+ }
};
} // namespace omp
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 7953bf83cba0..21ad51c53d87 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -558,6 +558,7 @@ genOrderedRegionOp(Fortran::lower::AbstractConverter &converter,
static mlir::omp::ParallelOp
genParallelOp(Fortran::lower::AbstractConverter &converter,
+ Fortran::lower::SymMap &symTable,
Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval, bool genNested,
mlir::Location currentLocation,
@@ -590,8 +591,8 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
auto reductionCallback = [&](mlir::Operation *op) {
llvm::SmallVector<mlir::Location> locs(reductionVars.size(),
currentLocation);
- auto block = converter.getFirOpBuilder().createBlock(&op->getRegion(0), {},
- reductionTypes, locs);
+ auto *block = converter.getFirOpBuilder().createBlock(&op->getRegion(0), {},
+ reductionTypes, locs);
for (auto [arg, prv] :
llvm::zip_equal(reductionSymbols, block->getArguments())) {
converter.bindSymbol(*arg, prv);
@@ -599,13 +600,78 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
return reductionSymbols;
};
- return genOpWithBody<mlir::omp::ParallelOp>(
+ OpWithBodyGenInfo genInfo =
OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval)
.setGenNested(genNested)
.setOuterCombined(outerCombined)
.setClauses(&clauseList)
.setReductions(&reductionSymbols, &reductionTypes)
- .setGenRegionEntryCb(reductionCallback),
+ .setGenRegionEntryCb(reductionCallback);
+
+ if (!enableDelayedPrivatization) {
+ return genOpWithBody<mlir::omp::ParallelOp>(
+ genInfo,
+ /*resultTypes=*/mlir::TypeRange(), ifClauseOperand,
+ numThreadsClauseOperand, allocateOperands, allocatorOperands,
+ reductionVars,
+ reductionDeclSymbols.empty()
+ ? nullptr
+ : mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(),
+ reductionDeclSymbols),
+ procBindKindAttr, /*private_vars=*/llvm::SmallVector<mlir::Value>{},
+ /*privatizers=*/nullptr);
+ }
+
+ bool privatize = !outerCombined;
+ DataSharingProcessor dsp(converter, clauseList, eval,
+ /*useDelayedPrivatization=*/true, &symTable);
+
+ if (privatize)
+ dsp.processStep1();
+
+ const auto &delayedPrivatizationInfo = dsp.getDelayedPrivatizationInfo();
+
+ auto genRegionEntryCB = [&](mlir::Operation *op) {
+ auto parallelOp = llvm::cast<mlir::omp::ParallelOp>(op);
+
+ llvm::SmallVector<mlir::Location> reductionLocs(reductionVars.size(),
+ currentLocation);
+
+ mlir::OperandRange privateVars = parallelOp.getPrivateVars();
+ mlir::Region &region = parallelOp.getRegion();
+
+ llvm::SmallVector<mlir::Type> privateVarTypes = reductionTypes;
+ privateVarTypes.reserve(privateVarTypes.size() + privateVars.size());
+ llvm::transform(privateVars, std::back_inserter(privateVarTypes),
+ [](mlir::Value v) { return v.getType(); });
+
+ llvm::SmallVector<mlir::Location> privateVarLocs = reductionLocs;
+ privateVarLocs.reserve(privateVarLocs.size() + privateVars.size());
+ llvm::transform(privateVars, std::back_inserter(privateVarLocs),
+ [](mlir::Value v) { return v.getLoc(); });
+
+ converter.getFirOpBuilder().createBlock(&region, /*insertPt=*/{},
+ privateVarTypes, privateVarLocs);
+
+ llvm::SmallVector<const Fortran::semantics::Symbol *> allSymbols =
+ reductionSymbols;
+ allSymbols.append(delayedPrivatizationInfo.symbols);
+ for (auto [arg, prv] : llvm::zip_equal(allSymbols, region.getArguments())) {
+ converter.bindSymbol(*arg, prv);
+ }
+
+ return allSymbols;
+ };
+
+ // TODO Merge with the reduction CB.
+ genInfo.setGenRegionEntryCb(genRegionEntryCB).setDataSharingProcessor(&dsp);
+
+ llvm::SmallVector<mlir::Attribute> privatizers(
+ delayedPrivatizationInfo.privatizers.begin(),
+ delayedPrivatizationInfo.privatizers.end());
+
+ return genOpWithBody<mlir::omp::ParallelOp>(
+ genInfo,
/*resultTypes=*/mlir::TypeRange(), ifClauseOperand,
numThreadsClauseOperand, allocateOperands, allocatorOperands,
reductionVars,
@@ -613,8 +679,11 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
? nullptr
: mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(),
reductionDeclSymbols),
- procBindKindAttr, /*private_vars=*/llvm::SmallVector<mlir::Value>{},
- /*privatizers=*/nullptr);
+ procBindKindAttr, delayedPrivatizationInfo.originalAddresses,
+ delayedPrivatizationInfo.privatizers.empty()
+ ? nullptr
+ : mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(),
+ privatizers));
}
static mlir::omp::SectionOp
@@ -1621,7 +1690,7 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
if ((llvm::omp::allParallelSet & llvm::omp::loopConstructSet)
.test(ompDirective)) {
validDirective = true;
- genParallelOp(converter, semaCtx, eval, /*genNested=*/false,
+ genParallelOp(converter, symTable, semaCtx, eval, /*genNested=*/false,
currentLocation, loopOpClauseList,
/*outerCombined=*/true);
}
@@ -1711,8 +1780,8 @@ genOMP(Fortran::lower::AbstractConverter &converter,
currentLocation);
break;
case llvm::omp::Directive::OMPD_parallel:
- genParallelOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation,
- beginClauseList);
+ genParallelOp(converter, symTable, semaCtx, eval, /*genNested=*/true,
+ currentLocation, beginClauseList);
break;
case llvm::omp::Directive::OMPD_single:
genSingleOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation,
@@ -1769,7 +1838,7 @@ genOMP(Fortran::lower::AbstractConverter &converter,
.test(directive.v)) {
bool outerCombined =
directive.v != llvm::omp::Directive::OMPD_target_parallel;
- genParallelOp(converter, semaCtx, eval, /*genNested=*/false,
+ genParallelOp(converter, symTable, semaCtx, eval, /*genNested=*/false,
currentLocation, beginClauseList, outerCombined);
combinedDirective = true;
}
@@ -1852,7 +1921,7 @@ genOMP(Fortran::lower::AbstractConverter &converter,
// Parallel wrapper of PARALLEL SECTIONS construct
if (dir == llvm::omp::Directive::OMPD_parallel_sections) {
- genParallelOp(converter, semaCtx, eval,
+ genParallelOp(converter, symTable, semaCtx, eval,
/*genNested=*/false, currentLocation, sectionsClauseList,
/*outerCombined=*/true);
} else {
diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp
index 31b15257d186..49517f62895d 100644
--- a/flang/lib/Lower/OpenMP/Utils.cpp
+++ b/flang/lib/Lower/OpenMP/Utils.cpp
@@ -24,6 +24,12 @@ llvm::cl::opt<bool> treatIndexAsSection(
llvm::cl::desc("In the OpenMP data clauses treat `a(N)` as `a(N:N)`."),
llvm::cl::init(true));
+llvm::cl::opt<bool> enableDelayedPrivatization(
+ "openmp-enable-delayed-privatization",
+ llvm::cl::desc(
+ "Emit `[first]private` variables as clauses on the MLIR ops."),
+ llvm::cl::init(false));
+
namespace Fortran {
namespace lower {
namespace omp {
diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/lib/Lower/OpenMP/Utils.h
index c346f891f079..f57cd7420ce4 100644
--- a/flang/lib/Lower/OpenMP/Utils.h
+++ b/flang/lib/Lower/OpenMP/Utils.h
@@ -15,6 +15,7 @@
#include "llvm/Support/CommandLine.h"
extern llvm::cl::opt<bool> treatIndexAsSection;
+extern llvm::cl::opt<bool> enableDelayedPrivatization;
namespace fir {
class FirOpBuilder;
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index 3cce39f5b8c7..788c99e40105 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -18,6 +18,7 @@
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
#include "flang/Optimizer/Support/FatalError.h"
#include "flang/Optimizer/Support/InternalNames.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/OpenACC/OpenACC.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
#include "llvm/ADT/ArrayRef.h"
@@ -1533,3 +1534,10 @@ mlir::Value fir::factory::createNullBoxProc(fir::FirOpBuilder &builder,
mlir::Value initVal{builder.create<fir::ZeroOp>(loc, boxEleTy)};
return builder.create<fir::EmboxProcOp>(loc, boxTy, initVal);
}
+
+void fir::factory::setInternalLinkage(mlir::func::FuncOp func) {
+ auto internalLinkage = mlir::LLVM::linkage::Linkage::Internal;
+ auto linkage =
+ mlir::LLVM::LinkageAttr::get(func->getContext(), internalLinkage);
+ func->setAttr("llvm.linkage", linkage);
+}
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 3a82be895d37..837ef0576ef6 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -916,6 +916,14 @@ mlir::Value genComplexMathOp(fir::FirOpBuilder &builder, mlir::Location loc,
/// See https://gcc.gnu.org/onlinedocs/gcc-12.1.0/gfortran/\
/// Intrinsic-Procedures.html for a reference.
constexpr auto FuncTypeReal16Real16 = genFuncType<Ty::Real<16>, Ty::Real<16>>;
+constexpr auto FuncTypeReal16Real16Real16 =
+ genFuncType<Ty::Real<16>, Ty::Real<16>, Ty::Real<16>>;
+constexpr auto FuncTypeReal16Integer4Real16 =
+ genFuncType<Ty::Real<16>, Ty::Integer<4>, Ty::Real<16>>;
+constexpr auto FuncTypeInteger4Real16 =
+ genFuncType<Ty::Integer<4>, Ty::Real<16>>;
+constexpr auto FuncTypeInteger8Real16 =
+ genFuncType<Ty::Integer<8>, Ty::Real<16>>;
constexpr auto FuncTypeReal16Complex16 =
genFuncType<Ty::Real<16>, Ty::Complex<16>>;
@@ -933,10 +941,12 @@ static constexpr MathOperation mathOperations[] = {
{"abs", RTNAME_STRING(CAbsF128), FuncTypeReal16Complex16, genLibF128Call},
{"acos", "acosf", genFuncType<Ty::Real<4>, Ty::Real<4>>, genLibCall},
{"acos", "acos", genFuncType<Ty::Real<8>, Ty::Real<8>>, genLibCall},
+ {"acos", RTNAME_STRING(AcosF128), FuncTypeReal16Real16, genLibF128Call},
{"acos", "cacosf", genFuncType<Ty::Complex<4>, Ty::Complex<4>>, genLibCall},
{"acos", "cacos", genFuncType<Ty::Complex<8>, Ty::Complex<8>>, genLibCall},
{"acosh", "acoshf", genFuncType<Ty::Real<4>, Ty::Real<4>>, genLibCall},
{"acosh", "acosh", genFuncType<Ty::Real<8>, Ty::Real<8>>, genLibCall},
+ {"acosh", RTNAME_STRING(AcoshF128), FuncTypeReal16Real16, genLibF128Call},
{"acosh", "cacoshf", genFuncType<Ty::Complex<4>, Ty::Complex<4>>,
genLibCall},
{"acosh", "cacosh", genFuncType<Ty::Complex<8>, Ty::Complex<8>>,
@@ -948,6 +958,7 @@ static constexpr MathOperation mathOperations[] = {
genLibCall},
{"aint", "llvm.trunc.f80", genFuncType<Ty::Real<10>, Ty::Real<10>>,
genLibCall},
+ {"aint", RTNAME_STRING(TruncF128), FuncTypeReal16Real16, genLibF128Call},
// llvm.round behaves the same way as libm's round.
{"anint", "llvm.round.f32", genFuncType<Ty::Real<4>, Ty::Real<4>>,
genMathOp<mlir::LLVM::RoundOp>},
@@ -955,12 +966,15 @@ static constexpr MathOperation mathOperations[] = {
genMathOp<mlir::LLVM::RoundOp>},
{"anint", "llvm.round.f80", genFuncType<Ty::Real<10>, Ty::Real<10>>,
genMathOp<mlir::LLVM::RoundOp>},
+ {"anint", RTNAME_STRING(RoundF128), FuncTypeReal16Real16, genLibF128Call},
{"asin", "asinf", genFuncType<Ty::Real<4>, Ty::Real<4>>, genLibCall},
{"asin", "asin", genFuncType<Ty::Real<8>, Ty::Real<8>>, genLibCall},
+ {"asin", RTNAME_STRING(AsinF128), FuncTypeReal16Real16, genLibF128Call},
{"asin", "casinf", genFuncType<Ty::Complex<4>, Ty::Complex<4>>, genLibCall},
{"asin", "casin", genFuncType<Ty::Complex<8>, Ty::Complex<8>>, genLibCall},
{"asinh", "asinhf", genFuncType<Ty::Real<4>, Ty::Real<4>>, genLibCall},
{"asinh", "asinh", genFuncType<Ty::Real<8>, Ty::Real<8>>, genLibCall},
+ {"asinh", RTNAME_STRING(AsinhF128), FuncTypeReal16Real16, genLibF128Call},
{"asinh", "casinhf", genFuncType<Ty::Complex<4>, Ty::Complex<4>>,
genLibCall},
{"asinh", "casinh", genFuncType<Ty::Complex<8>, Ty::Complex<8>>,
@@ -969,49 +983,64 @@ static constexpr MathOperation mathOperations[] = {
genMathOp<mlir::math::AtanOp>},
{"atan", "atan", genFuncType<Ty::Real<8>, Ty::Real<8>>,
genMathOp<mlir::math::AtanOp>},
+ {"atan", RTNAME_STRING(AtanF128), FuncTypeReal16Real16, genLibF128Call},
{"atan", "catanf", genFuncType<Ty::Complex<4>, Ty::Complex<4>>, genLibCall},
{"atan", "catan", genFuncType<Ty::Complex<8>, Ty::Complex<8>>, genLibCall},
{"atan2", "atan2f", genFuncType<Ty::Real<4>, Ty::Real<4>, Ty::Real<4>>,
genMathOp<mlir::math::Atan2Op>},
{"atan2", "atan2", genFuncType<Ty::Real<8>, Ty::Real<8>, Ty::Real<8>>,
genMathOp<mlir::math::Atan2Op>},
+ {"atan2", RTNAME_STRING(Atan2F128), FuncTypeReal16Real16Real16,
+ genLibF128Call},
{"atanh", "atanhf", genFuncType<Ty::Real<4>, Ty::Real<4>>, genLibCall},
{"atanh", "atanh", genFuncType<Ty::Real<8>, Ty::Real<8>>, genLibCall},
+ {"atanh", RTNAME_STRING(AtanhF128), FuncTypeReal16Real16, genLibF128Call},
{"atanh", "catanhf", genFuncType<Ty::Complex<4>, Ty::Complex<4>>,
genLibCall},
{"atanh", "catanh", genFuncType<Ty::Complex<8>, Ty::Complex<8>>,
genLibCall},
{"bessel_j0", "j0f", genFuncType<Ty::Real<4>, Ty::Real<4>>, genLibCall},
{"bessel_j0", "j0", genFuncType<Ty::Real<8>, Ty::Real<8>>, genLibCall},
+ {"bessel_j0", RTNAME_STRING(J0F128), FuncTypeReal16Real16, genLibF128Call},
{"bessel_j1", "j1f", genFuncType<Ty::Real<4>, Ty::Real<4>>, genLibCall},
{"bessel_j1", "j1", genFuncType<Ty::Real<8>, Ty::Real<8>>, genLibCall},
+ {"bessel_j1", RTNAME_STRING(J1F128), FuncTypeReal16Real16, genLibF128Call},
{"bessel_jn", "jnf", genFuncType<Ty::Real<4>, Ty::Integer<4>, Ty::Real<4>>,
genLibCall},
{"bessel_jn", "jn", genFuncType<Ty::Real<8>, Ty::Integer<4>, Ty::Real<8>>,
genLibCall},
+ {"bessel_jn", RTNAME_STRING(JnF128), FuncTypeReal16Integer4Real16,
+ genLibF128Call},
{"bessel_y0", "y0f", genFuncType<Ty::Real<4>, Ty::Real<4>>, genLibCall},
{"bessel_y0", "y0", genFuncType<Ty::Real<8>, Ty::Real<8>>, genLibCall},
+ {"bessel_y0", RTNAME_STRING(Y0F128), FuncTypeReal16Real16, genLibF128Call},
{"bessel_y1", "y1f", genFuncType<Ty::Real<4>, Ty::Real<4>>, genLibCall},
{"bessel_y1", "y1", genFuncType<Ty::Real<8>, Ty::Real<8>>, genLibCall},
+ {"bessel_y1", RTNAME_STRING(Y1F128), FuncTypeReal16Real16, genLibF128Call},
{"bessel_yn", "ynf", genFuncType<Ty::Real<4>, Ty::Integer<4>, Ty::Real<4>>,
genLibCall},
{"bessel_yn", "yn", genFuncType<Ty::Real<8>, Ty::Integer<4>, Ty::Real<8>>,
genLibCall},
+ {"bessel_yn", RTNAME_STRING(YnF128), FuncTypeReal16Integer4Real16,
+ genLibF128Call},
// math::CeilOp returns a real, while Fortran CEILING returns integer.
{"ceil", "ceilf", genFuncType<Ty::Real<4>, Ty::Real<4>>,
genMathOp<mlir::math::CeilOp>},
{"ceil", "ceil", genFuncType<Ty::Real<8>, Ty::Real<8>>,
genMathOp<mlir::math::CeilOp>},
+ {"ceil", RTNAME_STRING(CeilF128), FuncTypeReal16Real16, genLibF128Call},
{"cos", "cosf", genFuncType<Ty::Real<4>, Ty::Real<4>>,
genMathOp<mlir::math::CosOp>},
{"cos", "cos", genFuncType<Ty::Real<8>, Ty::Real<8>>,
genMathOp<mlir::math::CosOp>},
+ {"cos", RTNAME_STRING(CosF128), FuncTypeReal16Real16, genLibF128Call},
{"cos", "ccosf", genFuncType<Ty::Complex<4>, Ty::Complex<4>>,
genComplexMathOp<mlir::complex::CosOp>},
{"cos", "ccos", genFuncType<Ty::Complex<8>, Ty::Complex<8>>,
genComplexMathOp<mlir::complex::CosOp>},
{"cosh", "coshf", genFuncType<Ty::Real<4>, Ty::Real<4>>, genLibCall},
{"cosh", "cosh", genFuncType<Ty::Real<8>, Ty::Real<8>>, genLibCall},
+ {"cosh", RTNAME_STRING(CoshF128), FuncTypeReal16Real16, genLibF128Call},
{"cosh", "ccoshf", genFuncType<Ty::Complex<4>, Ty::Complex<4>>, genLibCall},
{"cosh", "ccosh", genFuncType<Ty::Complex<8>, Ty::Complex<8>>, genLibCall},
{"divc",
@@ -1038,12 +1067,15 @@ static constexpr MathOperation mathOperations[] = {
genMathOp<mlir::math::ErfOp>},
{"erf", "erf", genFuncType<Ty::Real<8>, Ty::Real<8>>,
genMathOp<mlir::math::ErfOp>},
+ {"erf", RTNAME_STRING(ErfF128), FuncTypeReal16Real16, genLibF128Call},
{"erfc", "erfcf", genFuncType<Ty::Real<4>, Ty::Real<4>>, genLibCall},
{"erfc", "erfc", genFuncType<Ty::Real<8>, Ty::Real<8>>, genLibCall},
+ {"erfc", RTNAME_STRING(ErfcF128), FuncTypeReal16Real16, genLibF128Call},
{"exp", "expf", genFuncType<Ty::Real<4>, Ty::Real<4>>,
genMathOp<mlir::math::ExpOp>},
{"exp", "exp", genFuncType<Ty::Real<8>, Ty::Real<8>>,
genMathOp<mlir::math::ExpOp>},
+ {"exp", RTNAME_STRING(ExpF128), FuncTypeReal16Real16, genLibF128Call},
{"exp", "cexpf", genFuncType<Ty::Complex<4>, Ty::Complex<4>>,
genComplexMathOp<mlir::complex::ExpOp>},
{"exp", "cexp", genFuncType<Ty::Complex<8>, Ty::Complex<8>>,
@@ -1074,6 +1106,7 @@ static constexpr MathOperation mathOperations[] = {
genMathOp<mlir::math::FloorOp>},
{"floor", "floor", genFuncType<Ty::Real<8>, Ty::Real<8>>,
genMathOp<mlir::math::FloorOp>},
+ {"floor", RTNAME_STRING(FloorF128), FuncTypeReal16Real16, genLibF128Call},
{"fma", "llvm.fma.f32",
genFuncType<Ty::Real<4>, Ty::Real<4>, Ty::Real<4>, Ty::Real<4>>,
genMathOp<mlir::math::FmaOp>},
@@ -1082,14 +1115,18 @@ static constexpr MathOperation mathOperations[] = {
genMathOp<mlir::math::FmaOp>},
{"gamma", "tgammaf", genFuncType<Ty::Real<4>, Ty::Real<4>>, genLibCall},
{"gamma", "tgamma", genFuncType<Ty::Real<8>, Ty::Real<8>>, genLibCall},
+ {"gamma", RTNAME_STRING(TgammaF128), FuncTypeReal16Real16, genLibF128Call},
{"hypot", "hypotf", genFuncType<Ty::Real<4>, Ty::Real<4>, Ty::Real<4>>,
genLibCall},
{"hypot", "hypot", genFuncType<Ty::Real<8>, Ty::Real<8>, Ty::Real<8>>,
genLibCall},
+ {"hypot", RTNAME_STRING(HypotF128), FuncTypeReal16Real16Real16,
+ genLibF128Call},
{"log", "logf", genFuncType<Ty::Real<4>, Ty::Real<4>>,
genMathOp<mlir::math::LogOp>},
{"log", "log", genFuncType<Ty::Real<8>, Ty::Real<8>>,
genMathOp<mlir::math::LogOp>},
+ {"log", RTNAME_STRING(LogF128), FuncTypeReal16Real16, genLibF128Call},
{"log", "clogf", genFuncType<Ty::Complex<4>, Ty::Complex<4>>,
genComplexMathOp<mlir::complex::LogOp>},
{"log", "clog", genFuncType<Ty::Complex<8>, Ty::Complex<8>>,
@@ -1098,17 +1135,23 @@ static constexpr MathOperation mathOperations[] = {
genMathOp<mlir::math::Log10Op>},
{"log10", "log10", genFuncType<Ty::Real<8>, Ty::Real<8>>,
genMathOp<mlir::math::Log10Op>},
+ {"log10", RTNAME_STRING(Log10F128), FuncTypeReal16Real16, genLibF128Call},
{"log_gamma", "lgammaf", genFuncType<Ty::Real<4>, Ty::Real<4>>, genLibCall},
{"log_gamma", "lgamma", genFuncType<Ty::Real<8>, Ty::Real<8>>, genLibCall},
+ {"log_gamma", RTNAME_STRING(LgammaF128), FuncTypeReal16Real16,
+ genLibF128Call},
// llvm.lround behaves the same way as libm's lround.
{"nint", "llvm.lround.i64.f64", genFuncType<Ty::Integer<8>, Ty::Real<8>>,
genLibCall},
{"nint", "llvm.lround.i64.f32", genFuncType<Ty::Integer<8>, Ty::Real<4>>,
genLibCall},
+ {"nint", RTNAME_STRING(LlroundF128), FuncTypeInteger8Real16,
+ genLibF128Call},
{"nint", "llvm.lround.i32.f64", genFuncType<Ty::Integer<4>, Ty::Real<8>>,
genLibCall},
{"nint", "llvm.lround.i32.f32", genFuncType<Ty::Integer<4>, Ty::Real<4>>,
genLibCall},
+ {"nint", RTNAME_STRING(LroundF128), FuncTypeInteger4Real16, genLibF128Call},
{"pow",
{},
genFuncType<Ty::Integer<1>, Ty::Integer<1>, Ty::Integer<1>>,
@@ -1129,6 +1172,7 @@ static constexpr MathOperation mathOperations[] = {
genMathOp<mlir::math::PowFOp>},
{"pow", "pow", genFuncType<Ty::Real<8>, Ty::Real<8>, Ty::Real<8>>,
genMathOp<mlir::math::PowFOp>},
+ {"pow", RTNAME_STRING(PowF128), FuncTypeReal16Real16Real16, genLibF128Call},
{"pow", "cpowf",
genFuncType<Ty::Complex<4>, Ty::Complex<4>, Ty::Complex<4>>,
genComplexMathOp<mlir::complex::PowOp>},
@@ -1140,12 +1184,18 @@ static constexpr MathOperation mathOperations[] = {
{"pow", RTNAME_STRING(FPow8i),
genFuncType<Ty::Real<8>, Ty::Real<8>, Ty::Integer<4>>,
genMathOp<mlir::math::FPowIOp>},
+ {"pow", RTNAME_STRING(FPow16i),
+ genFuncType<Ty::Real<16>, Ty::Real<16>, Ty::Integer<4>>,
+ genMathOp<mlir::math::FPowIOp>},
{"pow", RTNAME_STRING(FPow4k),
genFuncType<Ty::Real<4>, Ty::Real<4>, Ty::Integer<8>>,
genMathOp<mlir::math::FPowIOp>},
{"pow", RTNAME_STRING(FPow8k),
genFuncType<Ty::Real<8>, Ty::Real<8>, Ty::Integer<8>>,
genMathOp<mlir::math::FPowIOp>},
+ {"pow", RTNAME_STRING(FPow16k),
+ genFuncType<Ty::Real<16>, Ty::Real<16>, Ty::Integer<8>>,
+ genMathOp<mlir::math::FPowIOp>},
{"pow", RTNAME_STRING(cpowi),
genFuncType<Ty::Complex<4>, Ty::Complex<4>, Ty::Integer<4>>, genLibCall},
{"pow", RTNAME_STRING(zpowi),
@@ -1174,6 +1224,7 @@ static constexpr MathOperation mathOperations[] = {
genComplexMathOp<mlir::complex::SinOp>},
{"sinh", "sinhf", genFuncType<Ty::Real<4>, Ty::Real<4>>, genLibCall},
{"sinh", "sinh", genFuncType<Ty::Real<8>, Ty::Real<8>>, genLibCall},
+ {"sinh", RTNAME_STRING(SinhF128), FuncTypeReal16Real16, genLibF128Call},
{"sinh", "csinhf", genFuncType<Ty::Complex<4>, Ty::Complex<4>>, genLibCall},
{"sinh", "csinh", genFuncType<Ty::Complex<8>, Ty::Complex<8>>, genLibCall},
{"sqrt", "sqrtf", genFuncType<Ty::Real<4>, Ty::Real<4>>,
@@ -1189,6 +1240,7 @@ static constexpr MathOperation mathOperations[] = {
genMathOp<mlir::math::TanOp>},
{"tan", "tan", genFuncType<Ty::Real<8>, Ty::Real<8>>,
genMathOp<mlir::math::TanOp>},
+ {"tan", RTNAME_STRING(TanF128), FuncTypeReal16Real16, genLibF128Call},
{"tan", "ctanf", genFuncType<Ty::Complex<4>, Ty::Complex<4>>,
genComplexMathOp<mlir::complex::TanOp>},
{"tan", "ctan", genFuncType<Ty::Complex<8>, Ty::Complex<8>>,
@@ -1197,6 +1249,7 @@ static constexpr MathOperation mathOperations[] = {
genMathOp<mlir::math::TanhOp>},
{"tanh", "tanh", genFuncType<Ty::Real<8>, Ty::Real<8>>,
genMathOp<mlir::math::TanhOp>},
+ {"tanh", RTNAME_STRING(TanhF128), FuncTypeReal16Real16, genLibF128Call},
{"tanh", "ctanhf", genFuncType<Ty::Complex<4>, Ty::Complex<4>>,
genComplexMathOp<mlir::complex::TanhOp>},
{"tanh", "ctanh", genFuncType<Ty::Complex<8>, Ty::Complex<8>>,
@@ -1781,10 +1834,7 @@ mlir::func::FuncOp IntrinsicLibrary::getWrapper(GeneratorType generator,
// First time this wrapper is needed, build it.
function = builder.createFunction(loc, wrapperName, funcType);
function->setAttr("fir.intrinsic", builder.getUnitAttr());
- auto internalLinkage = mlir::LLVM::linkage::Linkage::Internal;
- auto linkage =
- mlir::LLVM::LinkageAttr::get(builder.getContext(), internalLinkage);
- function->setAttr("llvm.linkage", linkage);
+ fir::factory::setInternalLinkage(function);
function.addEntryBlock();
// Create local context to emit code into the newly created function
diff --git a/flang/lib/Optimizer/Builder/MutableBox.cpp b/flang/lib/Optimizer/Builder/MutableBox.cpp
index 4d8860b60915..d4012e9c3d9d 100644
--- a/flang/lib/Optimizer/Builder/MutableBox.cpp
+++ b/flang/lib/Optimizer/Builder/MutableBox.cpp
@@ -674,7 +674,7 @@ void fir::factory::disassociateMutableBox(fir::FirOpBuilder &builder,
// 7.3.2.3 point 7. The dynamic type of a disassociated pointer is the
// same as its declared type.
auto boxTy = box.getBoxTy().dyn_cast<fir::BaseBoxType>();
- auto eleTy = fir::dyn_cast_ptrOrBoxEleTy(boxTy.getEleTy());
+ auto eleTy = fir::unwrapPassByRefType(boxTy.getEleTy());
mlir::Type derivedType = fir::getDerivedType(eleTy);
if (auto recTy = derivedType.dyn_cast<fir::RecordType>()) {
fir::runtime::genNullifyDerivedType(builder, loc, box.getAddr(), recTy,
diff --git a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp
index fabbff818b6f..66fbaddcbda1 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp
@@ -149,6 +149,22 @@ struct ForcedNorm2Real16 {
}
};
+/// Placeholder for real*16 version of Norm2Dim Intrinsic
+struct ForcedNorm2DimReal16 {
+ static constexpr const char *name = ExpandAndQuoteKey(RTNAME(Norm2DimReal16));
+ static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() {
+ return [](mlir::MLIRContext *ctx) {
+ auto boxTy =
+ fir::runtime::getModel<const Fortran::runtime::Descriptor &>()(ctx);
+ auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8));
+ auto intTy = mlir::IntegerType::get(ctx, 8 * sizeof(int));
+ return mlir::FunctionType::get(
+ ctx, {fir::ReferenceType::get(boxTy), boxTy, intTy, strTy, intTy},
+ mlir::NoneType::get(ctx));
+ };
+ }
+};
+
/// Placeholder for real*10 version of Product Intrinsic
struct ForcedProductReal10 {
static constexpr const char *name = ExpandAndQuoteKey(RTNAME(ProductReal10));
@@ -876,7 +892,14 @@ mlir::Value fir::runtime::genMinval(fir::FirOpBuilder &builder,
void fir::runtime::genNorm2Dim(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Value resultBox, mlir::Value arrayBox,
mlir::Value dim) {
- auto func = fir::runtime::getRuntimeFunc<mkRTKey(Norm2Dim)>(loc, builder);
+ mlir::func::FuncOp func;
+ auto ty = arrayBox.getType();
+ auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
+ auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+ if (eleTy.isF128())
+ func = fir::runtime::getRuntimeFunc<ForcedNorm2DimReal16>(loc, builder);
+ else
+ func = fir::runtime::getRuntimeFunc<mkRTKey(Norm2Dim)>(loc, builder);
auto fTy = func.getFunctionType();
auto sourceFile = fir::factory::locationToFilename(builder, loc);
auto sourceLine =
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index 0a534cdb3c48..9bb10a42a399 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -3866,6 +3866,103 @@ mlir::LogicalResult fir::DeclareOp::verify() {
return fortranVar.verifyDeclareLikeOpImpl(getMemref());
}
+llvm::SmallVector<mlir::Region *> fir::CUDAKernelOp::getLoopRegions() {
+ return {&getRegion()};
+}
+
+mlir::ParseResult parseCUFKernelValues(
+ mlir::OpAsmParser &parser,
+ llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &values,
+ llvm::SmallVectorImpl<mlir::Type> &types) {
+ if (mlir::succeeded(parser.parseOptionalStar()))
+ return mlir::success();
+
+ if (parser.parseOptionalLParen()) {
+ if (mlir::failed(parser.parseCommaSeparatedList(
+ mlir::AsmParser::Delimiter::None, [&]() {
+ if (parser.parseOperand(values.emplace_back()))
+ return mlir::failure();
+ return mlir::success();
+ })))
+ return mlir::failure();
+ if (parser.parseRParen())
+ return mlir::failure();
+ } else {
+ if (parser.parseOperand(values.emplace_back()))
+ return mlir::failure();
+ return mlir::success();
+ }
+ return mlir::success();
+}
+
+void printCUFKernelValues(mlir::OpAsmPrinter &p, mlir::Operation *op,
+ mlir::ValueRange values, mlir::TypeRange types) {
+ if (values.empty())
+ p << "*";
+
+ if (values.size() > 1)
+ p << "(";
+ llvm::interleaveComma(values, p, [&p](mlir::Value v) { p << v; });
+ if (values.size() > 1)
+ p << ")";
+}
+
+mlir::ParseResult parseCUFKernelLoopControl(
+ mlir::OpAsmParser &parser, mlir::Region &region,
+ llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &lowerbound,
+ llvm::SmallVectorImpl<mlir::Type> &lowerboundType,
+ llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &upperbound,
+ llvm::SmallVectorImpl<mlir::Type> &upperboundType,
+ llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &step,
+ llvm::SmallVectorImpl<mlir::Type> &stepType) {
+
+ llvm::SmallVector<mlir::OpAsmParser::Argument> inductionVars;
+ if (parser.parseLParen() ||
+ parser.parseArgumentList(inductionVars,
+ mlir::OpAsmParser::Delimiter::None,
+ /*allowType=*/true) ||
+ parser.parseRParen() || parser.parseEqual() || parser.parseLParen() ||
+ parser.parseOperandList(lowerbound, inductionVars.size(),
+ mlir::OpAsmParser::Delimiter::None) ||
+ parser.parseColonTypeList(lowerboundType) || parser.parseRParen() ||
+ parser.parseKeyword("to") || parser.parseLParen() ||
+ parser.parseOperandList(upperbound, inductionVars.size(),
+ mlir::OpAsmParser::Delimiter::None) ||
+ parser.parseColonTypeList(upperboundType) || parser.parseRParen() ||
+ parser.parseKeyword("step") || parser.parseLParen() ||
+ parser.parseOperandList(step, inductionVars.size(),
+ mlir::OpAsmParser::Delimiter::None) ||
+ parser.parseColonTypeList(stepType) || parser.parseRParen())
+ return mlir::failure();
+ return parser.parseRegion(region, inductionVars);
+}
+
+void printCUFKernelLoopControl(
+ mlir::OpAsmPrinter &p, mlir::Operation *op, mlir::Region &region,
+ mlir::ValueRange lowerbound, mlir::TypeRange lowerboundType,
+ mlir::ValueRange upperbound, mlir::TypeRange upperboundType,
+ mlir::ValueRange steps, mlir::TypeRange stepType) {
+ mlir::ValueRange regionArgs = region.front().getArguments();
+ if (!regionArgs.empty()) {
+ p << "(";
+ llvm::interleaveComma(
+ regionArgs, p, [&p](mlir::Value v) { p << v << " : " << v.getType(); });
+ p << ") = (" << lowerbound << " : " << lowerboundType << ") to ("
+ << upperbound << " : " << upperboundType << ") "
+ << " step (" << steps << " : " << stepType << ") ";
+ }
+ p.printRegion(region, /*printEntryBlockArgs=*/false);
+}
+
+mlir::LogicalResult fir::CUDAKernelOp::verify() {
+ if (getLowerbound().size() != getUpperbound().size() ||
+ getLowerbound().size() != getStep().size())
+ return emitOpError(
+ "expect same number of values in lowerbound, upperbound and step");
+
+ return mlir::success();
+}
+
//===----------------------------------------------------------------------===//
// FIROpsDialect
//===----------------------------------------------------------------------===//
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
index 314e4264c17e..377cc4439202 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
@@ -176,7 +176,14 @@ protected:
rewriter.eraseOp(use);
}
}
- rewriter.replaceAllUsesWith(op->getResults(), {base});
+ // TODO: This entire pass should be a greedy pattern rewrite or a manual
+ // IR traversal. A dialect conversion cannot be used here because
+ // `replaceAllUsesWith` is not supported. Similarly, `replaceOp` is not
+ // suitable because "op->getResult(0)" and "base" can have different types.
+ // In such a case, the dialect conversion will attempt to convert the type,
+ // but no type converter is specified in this pass. Also note that all
+ // patterns in this pass are actually rewrite patterns.
+ op->getResult(0).replaceAllUsesWith(base);
rewriter.replaceOp(op, base);
}
};
diff --git a/flang/runtime/Float128Math/CMakeLists.txt b/flang/runtime/Float128Math/CMakeLists.txt
index f8da4d7ca1a9..f11678cd70b7 100644
--- a/flang/runtime/Float128Math/CMakeLists.txt
+++ b/flang/runtime/Float128Math/CMakeLists.txt
@@ -14,8 +14,9 @@
# will have a dependency on the third-party library that is being
# used for building this FortranFloat128Math library.
-if (${FLANG_RUNTIME_F128_MATH_LIB} STREQUAL "libquadmath" OR
- ${FLANG_RUNTIME_F128_MATH_LIB} STREQUAL "quadmath")
+include(CheckLibraryExists)
+
+if (${FLANG_RUNTIME_F128_MATH_LIB} STREQUAL "libquadmath")
check_include_file(quadmath.h FOUND_QUADMATH_HEADER)
if(FOUND_QUADMATH_HEADER)
add_compile_definitions(HAS_QUADMATHLIB)
@@ -25,7 +26,18 @@ if (${FLANG_RUNTIME_F128_MATH_LIB} STREQUAL "libquadmath" OR
"to be available: ${FLANG_RUNTIME_F128_MATH_LIB}"
)
endif()
-else()
+elseif (${FLANG_RUNTIME_F128_MATH_LIB} STREQUAL "libm")
+ check_library_exists(m sinl "" FOUND_LIBM)
+ check_library_exists(m sinf128 "" FOUND_LIBMF128)
+ if (FOUND_LIBM)
+ add_compile_definitions(HAS_LIBM)
+ endif()
+ if (FOUND_LIBMF128)
+ add_compile_definitions(HAS_LIBMF128)
+ endif()
+endif()
+
+if (NOT FOUND_QUADMATH_HEADER AND NOT FOUND_LIBM)
message(FATAL_ERROR
"Unsupported third-party library for Fortran F128 math runtime: "
"${FLANG_RUNTIME_F128_MATH_LIB}"
@@ -33,9 +45,43 @@ else()
endif()
set(sources
+ acos.cpp
+ acosh.cpp
+ asin.cpp
+ asinh.cpp
+ atan.cpp
+ atan2.cpp
+ atanh.cpp
cabs.cpp
+ ceil.cpp
+ cos.cpp
+ cosh.cpp
+ erf.cpp
+ erfc.cpp
+ exp.cpp
+ floor.cpp
+ hypot.cpp
+ j0.cpp
+ j1.cpp
+ jn.cpp
+ lgamma.cpp
+ llround.cpp
+ log.cpp
+ log10.cpp
+ lround.cpp
+ norm2.cpp
+ pow.cpp
+ round.cpp
sin.cpp
+ sinh.cpp
sqrt.cpp
+ tan.cpp
+ tanh.cpp
+ tgamma.cpp
+ trunc.cpp
+ y0.cpp
+ y1.cpp
+ yn.cpp
)
include_directories(AFTER "${CMAKE_CURRENT_SOURCE_DIR}/..")
diff --git a/flang/runtime/Float128Math/acos.cpp b/flang/runtime/Float128Math/acos.cpp
new file mode 100644
index 000000000000..531c79c7444b
--- /dev/null
+++ b/flang/runtime/Float128Math/acos.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/acos.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(AcosF128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Acos<RTNAME(AcosF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/acosh.cpp b/flang/runtime/Float128Math/acosh.cpp
new file mode 100644
index 000000000000..1495120edd1a
--- /dev/null
+++ b/flang/runtime/Float128Math/acosh.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/acosh.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(AcoshF128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Acosh<RTNAME(AcoshF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/asin.cpp b/flang/runtime/Float128Math/asin.cpp
new file mode 100644
index 000000000000..2fb8c6c5e97d
--- /dev/null
+++ b/flang/runtime/Float128Math/asin.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/asin.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(AsinF128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Asin<RTNAME(AsinF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/asinh.cpp b/flang/runtime/Float128Math/asinh.cpp
new file mode 100644
index 000000000000..3630a77be42b
--- /dev/null
+++ b/flang/runtime/Float128Math/asinh.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/asinh.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(AsinhF128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Asinh<RTNAME(AsinhF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/atan.cpp b/flang/runtime/Float128Math/atan.cpp
new file mode 100644
index 000000000000..4609343e9d12
--- /dev/null
+++ b/flang/runtime/Float128Math/atan.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/atan.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(AtanF128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Atan<RTNAME(AtanF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/atan2.cpp b/flang/runtime/Float128Math/atan2.cpp
new file mode 100644
index 000000000000..c0175e67ec71
--- /dev/null
+++ b/flang/runtime/Float128Math/atan2.cpp
@@ -0,0 +1,23 @@
+//===-- runtime/Float128Math/atan2.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(Atan2F128)(
+ CppTypeFor<TypeCategory::Real, 16> x,
+ CppTypeFor<TypeCategory::Real, 16> y) {
+ return Atan2<RTNAME(Atan2F128)>::invoke(x, y);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/atanh.cpp b/flang/runtime/Float128Math/atanh.cpp
new file mode 100644
index 000000000000..bfacb967117d
--- /dev/null
+++ b/flang/runtime/Float128Math/atanh.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/atanh.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(AtanhF128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Atanh<RTNAME(AtanhF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/cabs.cpp b/flang/runtime/Float128Math/cabs.cpp
index 63f2bdf8e177..827b197a6a81 100644
--- a/flang/runtime/Float128Math/cabs.cpp
+++ b/flang/runtime/Float128Math/cabs.cpp
@@ -10,15 +10,15 @@
namespace Fortran::runtime {
extern "C" {
-
+#if 0
+// FIXME: temporarily disabled. Need to add pure C entry point
+// using C _Complex ABI.
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-// FIXME: the argument should be CppTypeFor<TypeCategory::Complex, 16>,
-// and it should be translated into the underlying library's
-// corresponding complex128 type.
-CppTypeFor<TypeCategory::Real, 16> RTDEF(CAbsF128)(ComplexF128 x) {
+// NOTE: Flang calls the runtime APIs using C _Complex ABI
+CppTypeFor<TypeCategory::Real, 16> RTDEF(CAbsF128)(CFloat128ComplexType x) {
return CAbs<RTNAME(CAbsF128)>::invoke(x);
}
#endif
-
+#endif
} // extern "C"
} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/ceil.cpp b/flang/runtime/Float128Math/ceil.cpp
new file mode 100644
index 000000000000..a53a2c27c616
--- /dev/null
+++ b/flang/runtime/Float128Math/ceil.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/ceil.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(CeilF128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Ceil<RTNAME(CeilF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/cos.cpp b/flang/runtime/Float128Math/cos.cpp
new file mode 100644
index 000000000000..845c970bd8e6
--- /dev/null
+++ b/flang/runtime/Float128Math/cos.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/cos.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(CosF128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Cos<RTNAME(CosF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/cosh.cpp b/flang/runtime/Float128Math/cosh.cpp
new file mode 100644
index 000000000000..acf6ff4130ee
--- /dev/null
+++ b/flang/runtime/Float128Math/cosh.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/cosh.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(CoshF128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Cosh<RTNAME(CoshF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/erf.cpp b/flang/runtime/Float128Math/erf.cpp
new file mode 100644
index 000000000000..862f3b974118
--- /dev/null
+++ b/flang/runtime/Float128Math/erf.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/erf.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(ErfF128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Erf<RTNAME(ErfF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/erfc.cpp b/flang/runtime/Float128Math/erfc.cpp
new file mode 100644
index 000000000000..0ac0b9455637
--- /dev/null
+++ b/flang/runtime/Float128Math/erfc.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/erfc.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(ErfcF128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Erfc<RTNAME(ErfcF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/exp.cpp b/flang/runtime/Float128Math/exp.cpp
new file mode 100644
index 000000000000..50386fdbfb64
--- /dev/null
+++ b/flang/runtime/Float128Math/exp.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/exp.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(ExpF128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Exp<RTNAME(ExpF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/floor.cpp b/flang/runtime/Float128Math/floor.cpp
new file mode 100644
index 000000000000..48cf4e014480
--- /dev/null
+++ b/flang/runtime/Float128Math/floor.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/floor.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(FloorF128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Floor<RTNAME(FloorF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/hypot.cpp b/flang/runtime/Float128Math/hypot.cpp
new file mode 100644
index 000000000000..33c83a165499
--- /dev/null
+++ b/flang/runtime/Float128Math/hypot.cpp
@@ -0,0 +1,23 @@
+//===-- runtime/Float128Math/hypot.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(HypotF128)(
+ CppTypeFor<TypeCategory::Real, 16> x,
+ CppTypeFor<TypeCategory::Real, 16> y) {
+ return Hypot<RTNAME(HypotF128)>::invoke(x, y);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/j0.cpp b/flang/runtime/Float128Math/j0.cpp
new file mode 100644
index 000000000000..f8f3fe71d8a6
--- /dev/null
+++ b/flang/runtime/Float128Math/j0.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/j0.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(J0F128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return J0<RTNAME(J0F128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/j1.cpp b/flang/runtime/Float128Math/j1.cpp
new file mode 100644
index 000000000000..9a51b973e1cf
--- /dev/null
+++ b/flang/runtime/Float128Math/j1.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/j1.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(J1F128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return J1<RTNAME(J1F128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/jn.cpp b/flang/runtime/Float128Math/jn.cpp
new file mode 100644
index 000000000000..644a66863c0d
--- /dev/null
+++ b/flang/runtime/Float128Math/jn.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/jn.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(JnF128)(
+ int n, CppTypeFor<TypeCategory::Real, 16> x) {
+ return Jn<RTNAME(JnF128)>::invoke(n, x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/lgamma.cpp b/flang/runtime/Float128Math/lgamma.cpp
new file mode 100644
index 000000000000..fff7dfcb9c15
--- /dev/null
+++ b/flang/runtime/Float128Math/lgamma.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/lgamma.cpp -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(LgammaF128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Lgamma<RTNAME(LgammaF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/llround.cpp b/flang/runtime/Float128Math/llround.cpp
new file mode 100644
index 000000000000..00c62818af19
--- /dev/null
+++ b/flang/runtime/Float128Math/llround.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/llround.cpp ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(LlroundF128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Llround<RTNAME(LlroundF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/log.cpp b/flang/runtime/Float128Math/log.cpp
new file mode 100644
index 000000000000..0cfe329c6f7f
--- /dev/null
+++ b/flang/runtime/Float128Math/log.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/log.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(LogF128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Log<RTNAME(LogF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/log10.cpp b/flang/runtime/Float128Math/log10.cpp
new file mode 100644
index 000000000000..cd8bf27fcb12
--- /dev/null
+++ b/flang/runtime/Float128Math/log10.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/log10.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(Log10F128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Log10<RTNAME(Log10F128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/lround.cpp b/flang/runtime/Float128Math/lround.cpp
new file mode 100644
index 000000000000..6ced66a1b2d3
--- /dev/null
+++ b/flang/runtime/Float128Math/lround.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/lround.cpp -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(LroundF128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Lround<RTNAME(LroundF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/math-entries.h b/flang/runtime/Float128Math/math-entries.h
index 91c14b008b57..a0d81d0cbb54 100644
--- a/flang/runtime/Float128Math/math-entries.h
+++ b/flang/runtime/Float128Math/math-entries.h
@@ -12,6 +12,7 @@
#include "tools.h"
#include "flang/Common/float128.h"
#include "flang/Runtime/entry-names.h"
+#include <cfloat>
#include <type_traits>
namespace Fortran::runtime {
@@ -42,7 +43,8 @@ namespace Fortran::runtime {
#define DEFINE_SIMPLE_ALIAS(caller, callee) \
template <typename RT, typename... ATs, RT (*p)(ATs...)> struct caller<p> { \
static RT invoke(ATs... args) { \
- static_assert(std::is_invocable_r_v<RT, decltype(callee), ATs...>); \
+ static_assert(std::is_invocable_r_v<RT, \
+ decltype(callee(std::declval<ATs>()...))(ATs...), ATs...>); \
if constexpr (std::is_same_v<RT, void>) { \
callee(args...); \
} else { \
@@ -52,26 +54,159 @@ namespace Fortran::runtime {
};
// Define fallback callers.
-DEFINE_FALLBACK(CAbs)
+DEFINE_FALLBACK(Abs)
+DEFINE_FALLBACK(Acos)
+DEFINE_FALLBACK(Acosh)
+DEFINE_FALLBACK(Asin)
+DEFINE_FALLBACK(Asinh)
+DEFINE_FALLBACK(Atan)
+DEFINE_FALLBACK(Atan2)
+DEFINE_FALLBACK(Atanh)
+DEFINE_FALLBACK(Ceil)
+DEFINE_FALLBACK(Cos)
+DEFINE_FALLBACK(Cosh)
+DEFINE_FALLBACK(Erf)
+DEFINE_FALLBACK(Erfc)
+DEFINE_FALLBACK(Exp)
+DEFINE_FALLBACK(Floor)
+DEFINE_FALLBACK(Hypot)
+DEFINE_FALLBACK(J0)
+DEFINE_FALLBACK(J1)
+DEFINE_FALLBACK(Jn)
+DEFINE_FALLBACK(Lgamma)
+DEFINE_FALLBACK(Llround)
+DEFINE_FALLBACK(Lround)
+DEFINE_FALLBACK(Log)
+DEFINE_FALLBACK(Log10)
+DEFINE_FALLBACK(Pow)
+DEFINE_FALLBACK(Round)
DEFINE_FALLBACK(Sin)
+DEFINE_FALLBACK(Sinh)
DEFINE_FALLBACK(Sqrt)
+DEFINE_FALLBACK(Tan)
+DEFINE_FALLBACK(Tanh)
+DEFINE_FALLBACK(Tgamma)
+DEFINE_FALLBACK(Trunc)
+DEFINE_FALLBACK(Y0)
+DEFINE_FALLBACK(Y1)
+DEFINE_FALLBACK(Yn)
-// Define ComplexF128 type that is compatible with
-// the type of results/arguments of libquadmath.
-// TODO: this may need more work for other libraries/compilers.
-#if !defined(_ARCH_PPC) || defined(__LONG_DOUBLE_IEEE128__)
-typedef _Complex float __attribute__((mode(TC))) ComplexF128;
-#else
-typedef _Complex float __attribute__((mode(KC))) ComplexF128;
+#if HAS_LIBM
+// Define wrapper callers for libm.
+#include <ccomplex>
+#include <cmath>
+
+#if LDBL_MANT_DIG == 113
+// Use STD math functions. They provide IEEE-754 128-bit float
+// support either via 'long double' or __float128.
+// The Bessel's functions are not present in STD namespace.
+DEFINE_SIMPLE_ALIAS(Abs, std::abs)
+DEFINE_SIMPLE_ALIAS(Acos, std::acos)
+DEFINE_SIMPLE_ALIAS(Acosh, std::acosh)
+DEFINE_SIMPLE_ALIAS(Asin, std::asin)
+DEFINE_SIMPLE_ALIAS(Asinh, std::asinh)
+DEFINE_SIMPLE_ALIAS(Atan, std::atan)
+DEFINE_SIMPLE_ALIAS(Atan2, std::atan2)
+DEFINE_SIMPLE_ALIAS(Atanh, std::atanh)
+// TODO: enable complex abs, when ABI adjustment for complex
+// data type is resolved.
+// DEFINE_SIMPLE_ALIAS(CAbs, std::abs)
+DEFINE_SIMPLE_ALIAS(Ceil, std::ceil)
+DEFINE_SIMPLE_ALIAS(Cos, std::cos)
+DEFINE_SIMPLE_ALIAS(Cosh, std::cosh)
+DEFINE_SIMPLE_ALIAS(Erf, std::erf)
+DEFINE_SIMPLE_ALIAS(Erfc, std::erfc)
+DEFINE_SIMPLE_ALIAS(Exp, std::exp)
+DEFINE_SIMPLE_ALIAS(Floor, std::floor)
+DEFINE_SIMPLE_ALIAS(Hypot, std::hypot)
+DEFINE_SIMPLE_ALIAS(J0, j0l)
+DEFINE_SIMPLE_ALIAS(J1, j1l)
+DEFINE_SIMPLE_ALIAS(Jn, jnl)
+DEFINE_SIMPLE_ALIAS(Lgamma, std::lgamma)
+DEFINE_SIMPLE_ALIAS(Llround, std::llround)
+DEFINE_SIMPLE_ALIAS(Lround, std::lround)
+DEFINE_SIMPLE_ALIAS(Log, std::log)
+DEFINE_SIMPLE_ALIAS(Log10, std::log10)
+DEFINE_SIMPLE_ALIAS(Pow, std::pow)
+DEFINE_SIMPLE_ALIAS(Round, std::round)
+DEFINE_SIMPLE_ALIAS(Sin, std::sin)
+DEFINE_SIMPLE_ALIAS(Sinh, std::sinh)
+DEFINE_SIMPLE_ALIAS(Sqrt, std::sqrt)
+DEFINE_SIMPLE_ALIAS(Tan, std::tan)
+DEFINE_SIMPLE_ALIAS(Tanh, std::tanh)
+DEFINE_SIMPLE_ALIAS(Tgamma, std::tgamma)
+DEFINE_SIMPLE_ALIAS(Trunc, std::trunc)
+DEFINE_SIMPLE_ALIAS(Y0, y0l)
+DEFINE_SIMPLE_ALIAS(Y1, y1l)
+DEFINE_SIMPLE_ALIAS(Yn, ynl)
+#else // LDBL_MANT_DIG != 113
+#if !HAS_LIBMF128
+// glibc >=2.26 seems to have complete support for __float128
+// versions of the math functions.
+#error "FLANG_RUNTIME_F128_MATH_LIB=libm build requires libm >=2.26"
#endif
-#if HAS_QUADMATHLIB
+// We can use __float128 versions of libm functions.
+// __STDC_WANT_IEC_60559_TYPES_EXT__ needs to be defined
+// before including cmath to enable the *f128 prototypes.
+// TODO: this needs to be enabled separately, especially
+// for complex data types that require C++ complex to C complex
+// adjustment to match the ABIs.
+#error "Unsupported FLANG_RUNTIME_F128_MATH_LIB=libm build"
+#endif // LDBL_MANT_DIG != 113
+#elif HAS_QUADMATHLIB
// Define wrapper callers for libquadmath.
#include "quadmath.h"
-DEFINE_SIMPLE_ALIAS(CAbs, cabsq)
+DEFINE_SIMPLE_ALIAS(Abs, fabsq)
+DEFINE_SIMPLE_ALIAS(Acos, acosq)
+DEFINE_SIMPLE_ALIAS(Acosh, acoshq)
+DEFINE_SIMPLE_ALIAS(Asin, asinq)
+DEFINE_SIMPLE_ALIAS(Asinh, asinhq)
+DEFINE_SIMPLE_ALIAS(Atan, atanq)
+DEFINE_SIMPLE_ALIAS(Atan2, atan2q)
+DEFINE_SIMPLE_ALIAS(Atanh, atanhq)
+DEFINE_SIMPLE_ALIAS(Ceil, ceilq)
+DEFINE_SIMPLE_ALIAS(Cos, cosq)
+DEFINE_SIMPLE_ALIAS(Cosh, coshq)
+DEFINE_SIMPLE_ALIAS(Erf, erfq)
+DEFINE_SIMPLE_ALIAS(Erfc, erfcq)
+DEFINE_SIMPLE_ALIAS(Exp, expq)
+DEFINE_SIMPLE_ALIAS(Floor, floorq)
+DEFINE_SIMPLE_ALIAS(Hypot, hypotq)
+DEFINE_SIMPLE_ALIAS(J0, j0q)
+DEFINE_SIMPLE_ALIAS(J1, j1q)
+DEFINE_SIMPLE_ALIAS(Jn, jnq)
+DEFINE_SIMPLE_ALIAS(Lgamma, lgammaq)
+DEFINE_SIMPLE_ALIAS(Llround, llroundq)
+DEFINE_SIMPLE_ALIAS(Lround, lroundq)
+DEFINE_SIMPLE_ALIAS(Log, logq)
+DEFINE_SIMPLE_ALIAS(Log10, log10q)
+DEFINE_SIMPLE_ALIAS(Pow, powq)
+DEFINE_SIMPLE_ALIAS(Round, roundq)
DEFINE_SIMPLE_ALIAS(Sin, sinq)
+DEFINE_SIMPLE_ALIAS(Sinh, sinhq)
DEFINE_SIMPLE_ALIAS(Sqrt, sqrtq)
+DEFINE_SIMPLE_ALIAS(Tan, tanq)
+DEFINE_SIMPLE_ALIAS(Tanh, tanhq)
+DEFINE_SIMPLE_ALIAS(Tgamma, tgammaq)
+DEFINE_SIMPLE_ALIAS(Trunc, truncq)
+DEFINE_SIMPLE_ALIAS(Y0, y0q)
+DEFINE_SIMPLE_ALIAS(Y1, y1q)
+DEFINE_SIMPLE_ALIAS(Yn, ynq)
#endif
+
+extern "C" {
+// Declarations of the entry points that might be referenced
+// within the Float128Math library itself.
+// Note that not all of these entry points are actually
+// defined in this library. Some of them are used just
+// as template parameters to call the corresponding callee directly.
+CppTypeFor<TypeCategory::Real, 16> RTDECL(AbsF128)(
+ CppTypeFor<TypeCategory::Real, 16> x);
+CppTypeFor<TypeCategory::Real, 16> RTDECL(SqrtF128)(
+ CppTypeFor<TypeCategory::Real, 16> x);
+} // extern "C"
+
} // namespace Fortran::runtime
#endif // FORTRAN_RUNTIME_FLOAT128MATH_MATH_ENTRIES_H_
diff --git a/flang/runtime/Float128Math/norm2.cpp b/flang/runtime/Float128Math/norm2.cpp
new file mode 100644
index 000000000000..17453bd2d6cb
--- /dev/null
+++ b/flang/runtime/Float128Math/norm2.cpp
@@ -0,0 +1,59 @@
+//===-- runtime/Float128Math/norm2.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+#include "reduction-templates.h"
+#include <cmath>
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+
+namespace {
+using namespace Fortran::runtime;
+
+using AccumType = Norm2AccumType<16>;
+
+struct ABSTy {
+ static AccumType compute(AccumType x) {
+ return Sqrt<RTNAME(AbsF128)>::invoke(x);
+ }
+};
+
+struct SQRTTy {
+ static AccumType compute(AccumType x) {
+ return Sqrt<RTNAME(SqrtF128)>::invoke(x);
+ }
+};
+
+using Float128Norm2Accumulator = Norm2Accumulator<16, ABSTy, SQRTTy>;
+} // namespace
+
+namespace Fortran::runtime {
+extern "C" {
+
+CppTypeFor<TypeCategory::Real, 16> RTDEF(Norm2_16)(
+ const Descriptor &x, const char *source, int line, int dim) {
+ auto accumulator{::Float128Norm2Accumulator(x)};
+ return GetTotalReduction<TypeCategory::Real, 16>(
+ x, source, line, dim, nullptr, accumulator, "NORM2");
+}
+
+void RTDEF(Norm2DimReal16)(Descriptor &result, const Descriptor &x, int dim,
+ const char *source, int line) {
+ Terminator terminator{source, line};
+ auto type{x.type().GetCategoryAndKind()};
+ RUNTIME_CHECK(terminator, type);
+ RUNTIME_CHECK(
+ terminator, type->first == TypeCategory::Real && type->second == 16);
+ DoMaxMinNorm2<TypeCategory::Real, 16, ::Float128Norm2Accumulator>(
+ result, x, dim, nullptr, "NORM2", terminator);
+}
+
+} // extern "C"
+} // namespace Fortran::runtime
+
+#endif
diff --git a/flang/runtime/Float128Math/pow.cpp b/flang/runtime/Float128Math/pow.cpp
new file mode 100644
index 000000000000..02958a890e52
--- /dev/null
+++ b/flang/runtime/Float128Math/pow.cpp
@@ -0,0 +1,23 @@
+//===-- runtime/Float128Math/pow.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(PowF128)(
+ CppTypeFor<TypeCategory::Real, 16> x,
+ CppTypeFor<TypeCategory::Real, 16> y) {
+ return Pow<RTNAME(PowF128)>::invoke(x, y);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/round.cpp b/flang/runtime/Float128Math/round.cpp
new file mode 100644
index 000000000000..43ab57768cb7
--- /dev/null
+++ b/flang/runtime/Float128Math/round.cpp
@@ -0,0 +1,26 @@
+//===-- runtime/Float128Math/round.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Round to nearest integer, away from zero.
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(RoundF128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Round<RTNAME(RoundF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/sinh.cpp b/flang/runtime/Float128Math/sinh.cpp
new file mode 100644
index 000000000000..9c907041fd7e
--- /dev/null
+++ b/flang/runtime/Float128Math/sinh.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/sinh.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(SinhF128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Sinh<RTNAME(SinhF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/tan.cpp b/flang/runtime/Float128Math/tan.cpp
new file mode 100644
index 000000000000..01d3c7bdd2e8
--- /dev/null
+++ b/flang/runtime/Float128Math/tan.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/tan.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(TanF128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Tan<RTNAME(TanF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/tanh.cpp b/flang/runtime/Float128Math/tanh.cpp
new file mode 100644
index 000000000000..fedc1a4120ca
--- /dev/null
+++ b/flang/runtime/Float128Math/tanh.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/tanh.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(TanhF128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Tanh<RTNAME(TanhF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/tgamma.cpp b/flang/runtime/Float128Math/tgamma.cpp
new file mode 100644
index 000000000000..329defff38cf
--- /dev/null
+++ b/flang/runtime/Float128Math/tgamma.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/tgamma.cpp -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(TgammaF128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Tgamma<RTNAME(TgammaF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/trunc.cpp b/flang/runtime/Float128Math/trunc.cpp
new file mode 100644
index 000000000000..3cab219ce31c
--- /dev/null
+++ b/flang/runtime/Float128Math/trunc.cpp
@@ -0,0 +1,26 @@
+//===-- runtime/Float128Math/trunc.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Round to integer, toward zero.
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(TruncF128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Trunc<RTNAME(TruncF128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/y0.cpp b/flang/runtime/Float128Math/y0.cpp
new file mode 100644
index 000000000000..f3e2ee454aea
--- /dev/null
+++ b/flang/runtime/Float128Math/y0.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/y0.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(Y0F128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Y0<RTNAME(Y0F128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/y1.cpp b/flang/runtime/Float128Math/y1.cpp
new file mode 100644
index 000000000000..c117bbcb2b5a
--- /dev/null
+++ b/flang/runtime/Float128Math/y1.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/y1.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(Y1F128)(
+ CppTypeFor<TypeCategory::Real, 16> x) {
+ return Y1<RTNAME(Y1F128)>::invoke(x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/yn.cpp b/flang/runtime/Float128Math/yn.cpp
new file mode 100644
index 000000000000..237bc2866a0d
--- /dev/null
+++ b/flang/runtime/Float128Math/yn.cpp
@@ -0,0 +1,22 @@
+//===-- runtime/Float128Math/yn.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(YnF128)(
+ int n, CppTypeFor<TypeCategory::Real, 16> x) {
+ return Yn<RTNAME(YnF128)>::invoke(n, x);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/complex-reduction.c b/flang/runtime/complex-reduction.c
index d77e1c0a5500..72c31ce08b87 100644
--- a/flang/runtime/complex-reduction.c
+++ b/flang/runtime/complex-reduction.c
@@ -19,6 +19,11 @@ struct CppComplexDouble {
struct CppComplexLongDouble {
long double r, i;
};
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+struct CppComplexFloat128 {
+ CFloat128Type r, i;
+};
+#endif
/* Not all environments define CMPLXF, CMPLX, CMPLXL. */
@@ -70,6 +75,29 @@ static long_double_Complex_t CMPLXL(long double r, long double i) {
#endif
#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+#ifndef CMPLXF128
+/*
+ * GCC 7.4.0 (currently minimum GCC version for llvm builds)
+ * supports __builtin_complex. For Clang, require >=12.0.
+ * Otherwise, rely on the memory layout compatibility.
+ */
+#if (defined(__clang_major__) && (__clang_major__ >= 12)) || defined(__GNUC__)
+#define CMPLXF128 __builtin_complex
+#else
+static CFloat128ComplexType CMPLXF128(CFloat128Type r, CFloat128Type i) {
+ union {
+ struct CppComplexFloat128 x;
+ CFloat128ComplexType result;
+ } u;
+ u.x.r = r;
+ u.x.i = i;
+ return u.result;
+}
+#endif
+#endif
+#endif
+
/* RTNAME(SumComplex4) calls RTNAME(CppSumComplex4) with the same arguments
* and converts the members of its C++ complex result to C _Complex.
*/
@@ -93,9 +121,10 @@ ADAPT_REDUCTION(SumComplex8, double_Complex_t, CppComplexDouble, CMPLX,
#if LDBL_MANT_DIG == 64
ADAPT_REDUCTION(SumComplex10, long_double_Complex_t, CppComplexLongDouble,
CMPLXL, REDUCTION_ARGS, REDUCTION_ARG_NAMES)
-#elif LDBL_MANT_DIG == 113
-ADAPT_REDUCTION(SumComplex16, long_double_Complex_t, CppComplexLongDouble,
- CMPLXL, REDUCTION_ARGS, REDUCTION_ARG_NAMES)
+#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+ADAPT_REDUCTION(SumComplex16, CFloat128ComplexType, CppComplexFloat128,
+ CMPLXF128, REDUCTION_ARGS, REDUCTION_ARG_NAMES)
#endif
/* PRODUCT() */
@@ -106,9 +135,10 @@ ADAPT_REDUCTION(ProductComplex8, double_Complex_t, CppComplexDouble, CMPLX,
#if LDBL_MANT_DIG == 64
ADAPT_REDUCTION(ProductComplex10, long_double_Complex_t, CppComplexLongDouble,
CMPLXL, REDUCTION_ARGS, REDUCTION_ARG_NAMES)
-#elif LDBL_MANT_DIG == 113
-ADAPT_REDUCTION(ProductComplex16, long_double_Complex_t, CppComplexLongDouble,
- CMPLXL, REDUCTION_ARGS, REDUCTION_ARG_NAMES)
+#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+ADAPT_REDUCTION(ProductComplex16, CFloat128ComplexType, CppComplexFloat128,
+ CMPLXF128, REDUCTION_ARGS, REDUCTION_ARG_NAMES)
#endif
/* DOT_PRODUCT() */
@@ -119,7 +149,8 @@ ADAPT_REDUCTION(DotProductComplex8, double_Complex_t, CppComplexDouble, CMPLX,
#if LDBL_MANT_DIG == 64
ADAPT_REDUCTION(DotProductComplex10, long_double_Complex_t,
CppComplexLongDouble, CMPLXL, DOT_PRODUCT_ARGS, DOT_PRODUCT_ARG_NAMES)
-#elif LDBL_MANT_DIG == 113
-ADAPT_REDUCTION(DotProductComplex16, long_double_Complex_t,
- CppComplexLongDouble, CMPLXL, DOT_PRODUCT_ARGS, DOT_PRODUCT_ARG_NAMES)
+#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+ADAPT_REDUCTION(DotProductComplex16, CFloat128ComplexType, CppComplexFloat128,
+ CMPLXF128, DOT_PRODUCT_ARGS, DOT_PRODUCT_ARG_NAMES)
#endif
diff --git a/flang/runtime/complex-reduction.h b/flang/runtime/complex-reduction.h
index 5c4f1f5126e3..1d37b235d519 100644
--- a/flang/runtime/complex-reduction.h
+++ b/flang/runtime/complex-reduction.h
@@ -15,6 +15,7 @@
#ifndef FORTRAN_RUNTIME_COMPLEX_REDUCTION_H_
#define FORTRAN_RUNTIME_COMPLEX_REDUCTION_H_
+#include "flang/Common/float128.h"
#include "flang/Runtime/entry-names.h"
#include <complex.h>
@@ -40,14 +41,18 @@ float_Complex_t RTNAME(SumComplex3)(REDUCTION_ARGS);
float_Complex_t RTNAME(SumComplex4)(REDUCTION_ARGS);
double_Complex_t RTNAME(SumComplex8)(REDUCTION_ARGS);
long_double_Complex_t RTNAME(SumComplex10)(REDUCTION_ARGS);
-long_double_Complex_t RTNAME(SumComplex16)(REDUCTION_ARGS);
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CFloat128ComplexType RTNAME(SumComplex16)(REDUCTION_ARGS);
+#endif
float_Complex_t RTNAME(ProductComplex2)(REDUCTION_ARGS);
float_Complex_t RTNAME(ProductComplex3)(REDUCTION_ARGS);
float_Complex_t RTNAME(ProductComplex4)(REDUCTION_ARGS);
double_Complex_t RTNAME(ProductComplex8)(REDUCTION_ARGS);
long_double_Complex_t RTNAME(ProductComplex10)(REDUCTION_ARGS);
-long_double_Complex_t RTNAME(ProductComplex16)(REDUCTION_ARGS);
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CFloat128ComplexType RTNAME(ProductComplex16)(REDUCTION_ARGS);
+#endif
#define DOT_PRODUCT_ARGS \
const struct CppDescriptor *x, const struct CppDescriptor *y, \
@@ -60,6 +65,8 @@ float_Complex_t RTNAME(DotProductComplex3)(DOT_PRODUCT_ARGS);
float_Complex_t RTNAME(DotProductComplex4)(DOT_PRODUCT_ARGS);
double_Complex_t RTNAME(DotProductComplex8)(DOT_PRODUCT_ARGS);
long_double_Complex_t RTNAME(DotProductComplex10)(DOT_PRODUCT_ARGS);
-long_double_Complex_t RTNAME(DotProductComplex16)(DOT_PRODUCT_ARGS);
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CFloat128ComplexType RTNAME(DotProductComplex16)(DOT_PRODUCT_ARGS);
+#endif
#endif // FORTRAN_RUNTIME_COMPLEX_REDUCTION_H_
diff --git a/flang/runtime/extrema.cpp b/flang/runtime/extrema.cpp
index 3fdc8e159866..fc2b4e165cb2 100644
--- a/flang/runtime/extrema.cpp
+++ b/flang/runtime/extrema.cpp
@@ -528,35 +528,6 @@ inline RT_API_ATTRS CppTypeFor<CAT, KIND> TotalNumericMaxOrMin(
NumericExtremumAccumulator<CAT, KIND, IS_MAXVAL>{x}, intrinsic);
}
-template <TypeCategory CAT, int KIND, typename ACCUMULATOR>
-static RT_API_ATTRS void DoMaxMinNorm2(Descriptor &result, const Descriptor &x,
- int dim, const Descriptor *mask, const char *intrinsic,
- Terminator &terminator) {
- using Type = CppTypeFor<CAT, KIND>;
- ACCUMULATOR accumulator{x};
- if (dim == 0 || x.rank() == 1) {
- // Total reduction
-
- // Element size of the destination descriptor is the same
- // as the element size of the source.
- result.Establish(x.type(), x.ElementBytes(), nullptr, 0, nullptr,
- CFI_attribute_allocatable);
- if (int stat{result.Allocate()}) {
- terminator.Crash(
- "%s: could not allocate memory for result; STAT=%d", intrinsic, stat);
- }
- DoTotalReduction<Type>(x, dim, mask, accumulator, intrinsic, terminator);
- accumulator.GetResult(result.OffsetElement<Type>());
- } else {
- // Partial reduction
-
- // Element size of the destination descriptor is the same
- // as the element size of the source.
- PartialReduction<ACCUMULATOR, CAT, KIND>(result, x, x.ElementBytes(), dim,
- mask, terminator, intrinsic, accumulator);
- }
-}
-
template <TypeCategory CAT, bool IS_MAXVAL> struct MaxOrMinHelper {
template <int KIND> struct Functor {
RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x,
@@ -802,66 +773,11 @@ RT_EXT_API_GROUP_END
// NORM2
-RT_VAR_GROUP_BEGIN
-
-// Use at least double precision for accumulators.
-// Don't use __float128, it doesn't work with abs() or sqrt() yet.
-static constexpr RT_CONST_VAR_ATTRS int largestLDKind {
-#if LDBL_MANT_DIG == 113
- 16
-#elif LDBL_MANT_DIG == 64
- 10
-#else
- 8
-#endif
-};
-
-RT_VAR_GROUP_END
-
-template <int KIND> class Norm2Accumulator {
-public:
- using Type = CppTypeFor<TypeCategory::Real, KIND>;
- using AccumType =
- CppTypeFor<TypeCategory::Real, std::clamp(KIND, 8, largestLDKind)>;
- explicit RT_API_ATTRS Norm2Accumulator(const Descriptor &array)
- : array_{array} {}
- RT_API_ATTRS void Reinitialize() { max_ = sum_ = 0; }
- template <typename A>
- RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
- // m * sqrt(1 + sum((others(:)/m)**2))
- *p = static_cast<Type>(max_ * std::sqrt(1 + sum_));
- }
- RT_API_ATTRS bool Accumulate(Type x) {
- auto absX{std::abs(static_cast<AccumType>(x))};
- if (!max_) {
- max_ = absX;
- } else if (absX > max_) {
- auto t{max_ / absX}; // < 1.0
- auto tsq{t * t};
- sum_ *= tsq; // scale sum to reflect change to the max
- sum_ += tsq; // include a term for the previous max
- max_ = absX;
- } else { // absX <= max_
- auto t{absX / max_};
- sum_ += t * t;
- }
- return true;
- }
- template <typename A>
- RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
- return Accumulate(*array_.Element<A>(at));
- }
-
-private:
- const Descriptor &array_;
- AccumType max_{0}; // value (m) with largest magnitude
- AccumType sum_{0}; // sum((others(:)/m)**2)
-};
-
template <int KIND> struct Norm2Helper {
RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x, int dim,
const Descriptor *mask, Terminator &terminator) const {
- DoMaxMinNorm2<TypeCategory::Real, KIND, Norm2Accumulator<KIND>>(
+ DoMaxMinNorm2<TypeCategory::Real, KIND,
+ typename Norm2AccumulatorGetter<KIND>::Type>(
result, x, dim, mask, "NORM2", terminator);
}
};
@@ -872,26 +788,27 @@ RT_EXT_API_GROUP_BEGIN
// TODO: REAL(2 & 3)
CppTypeFor<TypeCategory::Real, 4> RTDEF(Norm2_4)(
const Descriptor &x, const char *source, int line, int dim) {
- return GetTotalReduction<TypeCategory::Real, 4>(
- x, source, line, dim, nullptr, Norm2Accumulator<4>{x}, "NORM2");
+ return GetTotalReduction<TypeCategory::Real, 4>(x, source, line, dim, nullptr,
+ Norm2AccumulatorGetter<4>::create(x), "NORM2");
}
CppTypeFor<TypeCategory::Real, 8> RTDEF(Norm2_8)(
const Descriptor &x, const char *source, int line, int dim) {
- return GetTotalReduction<TypeCategory::Real, 8>(
- x, source, line, dim, nullptr, Norm2Accumulator<8>{x}, "NORM2");
+ return GetTotalReduction<TypeCategory::Real, 8>(x, source, line, dim, nullptr,
+ Norm2AccumulatorGetter<8>::create(x), "NORM2");
}
#if LDBL_MANT_DIG == 64
CppTypeFor<TypeCategory::Real, 10> RTDEF(Norm2_10)(
const Descriptor &x, const char *source, int line, int dim) {
- return GetTotalReduction<TypeCategory::Real, 10>(
- x, source, line, dim, nullptr, Norm2Accumulator<10>{x}, "NORM2");
+ return GetTotalReduction<TypeCategory::Real, 10>(x, source, line, dim,
+ nullptr, Norm2AccumulatorGetter<10>::create(x), "NORM2");
}
#endif
#if LDBL_MANT_DIG == 113
+// The __float128 implementation resides in FortranFloat128Math library.
CppTypeFor<TypeCategory::Real, 16> RTDEF(Norm2_16)(
const Descriptor &x, const char *source, int line, int dim) {
- return GetTotalReduction<TypeCategory::Real, 16>(
- x, source, line, dim, nullptr, Norm2Accumulator<16>{x}, "NORM2");
+ return GetTotalReduction<TypeCategory::Real, 16>(x, source, line, dim,
+ nullptr, Norm2AccumulatorGetter<16>::create(x), "NORM2");
}
#endif
@@ -901,7 +818,7 @@ void RTDEF(Norm2Dim)(Descriptor &result, const Descriptor &x, int dim,
auto type{x.type().GetCategoryAndKind()};
RUNTIME_CHECK(terminator, type);
if (type->first == TypeCategory::Real) {
- ApplyFloatingPointKind<Norm2Helper, void>(
+ ApplyFloatingPointKind<Norm2Helper, void, true>(
type->second, terminator, result, x, dim, nullptr, terminator);
} else {
terminator.Crash("NORM2: bad type code %d", x.type().raw());
diff --git a/flang/runtime/product.cpp b/flang/runtime/product.cpp
index a516bc51a959..4c3b8c33a12e 100644
--- a/flang/runtime/product.cpp
+++ b/flang/runtime/product.cpp
@@ -123,7 +123,8 @@ CppTypeFor<TypeCategory::Real, 10> RTDEF(ProductReal10)(const Descriptor &x,
NonComplexProductAccumulator<CppTypeFor<TypeCategory::Real, 10>>{x},
"PRODUCT");
}
-#elif LDBL_MANT_DIG == 113
+#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
CppTypeFor<TypeCategory::Real, 16> RTDEF(ProductReal16)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Real, 16>(x, source, line, dim, mask,
@@ -154,7 +155,8 @@ void RTDEF(CppProductComplex10)(CppTypeFor<TypeCategory::Complex, 10> &result,
mask, ComplexProductAccumulator<CppTypeFor<TypeCategory::Real, 10>>{x},
"PRODUCT");
}
-#elif LDBL_MANT_DIG == 113
+#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
void RTDEF(CppProductComplex16)(CppTypeFor<TypeCategory::Complex, 16> &result,
const Descriptor &x, const char *source, int line, int dim,
const Descriptor *mask) {
diff --git a/flang/runtime/reduction-templates.h b/flang/runtime/reduction-templates.h
index 7d0f82d59a08..0891bc021ff7 100644
--- a/flang/runtime/reduction-templates.h
+++ b/flang/runtime/reduction-templates.h
@@ -25,6 +25,7 @@
#include "tools.h"
#include "flang/Runtime/cpp-type.h"
#include "flang/Runtime/descriptor.h"
+#include <algorithm>
namespace Fortran::runtime {
@@ -332,5 +333,119 @@ template <typename ACCUMULATOR> struct PartialLocationHelper {
};
};
+// NORM2 templates
+
+RT_VAR_GROUP_BEGIN
+
+// Use at least double precision for accumulators.
+// Don't use __float128, it doesn't work with abs() or sqrt() yet.
+static constexpr RT_CONST_VAR_ATTRS int Norm2LargestLDKind {
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+ 16
+#elif LDBL_MANT_DIG == 64
+ 10
+#else
+ 8
+#endif
+};
+
+RT_VAR_GROUP_END
+
+template <TypeCategory CAT, int KIND, typename ACCUMULATOR>
+inline RT_API_ATTRS void DoMaxMinNorm2(Descriptor &result, const Descriptor &x,
+ int dim, const Descriptor *mask, const char *intrinsic,
+ Terminator &terminator) {
+ using Type = CppTypeFor<CAT, KIND>;
+ ACCUMULATOR accumulator{x};
+ if (dim == 0 || x.rank() == 1) {
+ // Total reduction
+
+ // Element size of the destination descriptor is the same
+ // as the element size of the source.
+ result.Establish(x.type(), x.ElementBytes(), nullptr, 0, nullptr,
+ CFI_attribute_allocatable);
+ if (int stat{result.Allocate()}) {
+ terminator.Crash(
+ "%s: could not allocate memory for result; STAT=%d", intrinsic, stat);
+ }
+ DoTotalReduction<Type>(x, dim, mask, accumulator, intrinsic, terminator);
+ accumulator.GetResult(result.OffsetElement<Type>());
+ } else {
+ // Partial reduction
+
+ // Element size of the destination descriptor is the same
+ // as the element size of the source.
+ PartialReduction<ACCUMULATOR, CAT, KIND>(result, x, x.ElementBytes(), dim,
+ mask, terminator, intrinsic, accumulator);
+ }
+}
+
+// The data type used by Norm2Accumulator.
+template <int KIND>
+using Norm2AccumType =
+ CppTypeFor<TypeCategory::Real, std::clamp(KIND, 8, Norm2LargestLDKind)>;
+
+template <int KIND, typename ABS, typename SQRT> class Norm2Accumulator {
+public:
+ using Type = CppTypeFor<TypeCategory::Real, KIND>;
+ using AccumType = Norm2AccumType<KIND>;
+ explicit RT_API_ATTRS Norm2Accumulator(const Descriptor &array)
+ : array_{array} {}
+ RT_API_ATTRS void Reinitialize() { max_ = sum_ = 0; }
+ template <typename A>
+ RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
+ // m * sqrt(1 + sum((others(:)/m)**2))
+ *p = static_cast<Type>(max_ * SQRT::compute(1 + sum_));
+ }
+ RT_API_ATTRS bool Accumulate(Type x) {
+ auto absX{ABS::compute(static_cast<AccumType>(x))};
+ if (!max_) {
+ max_ = absX;
+ } else if (absX > max_) {
+ auto t{max_ / absX}; // < 1.0
+ auto tsq{t * t};
+ sum_ *= tsq; // scale sum to reflect change to the max
+ sum_ += tsq; // include a term for the previous max
+ max_ = absX;
+ } else { // absX <= max_
+ auto t{absX / max_};
+ sum_ += t * t;
+ }
+ return true;
+ }
+ template <typename A>
+ RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
+ return Accumulate(*array_.Element<A>(at));
+ }
+
+private:
+ const Descriptor &array_;
+ AccumType max_{0}; // value (m) with largest magnitude
+ AccumType sum_{0}; // sum((others(:)/m)**2)
+};
+
+// Helper class for creating Norm2Accumulator instance
+// based on the given KIND. This helper returns an instance
+// that uses std::abs and std::sqrt for the computations.
+template <int KIND> class Norm2AccumulatorGetter {
+ using AccumType = Norm2AccumType<KIND>;
+
+public:
+ struct ABSTy {
+ static constexpr RT_API_ATTRS AccumType compute(AccumType &&x) {
+ return std::abs(std::forward<AccumType>(x));
+ }
+ };
+ struct SQRTTy {
+ static constexpr RT_API_ATTRS AccumType compute(AccumType &&x) {
+ return std::sqrt(std::forward<AccumType>(x));
+ }
+ };
+
+ using Type = Norm2Accumulator<KIND, ABSTy, SQRTTy>;
+
+ static RT_API_ATTRS Type create(const Descriptor &x) { return Type(x); }
+};
+
} // namespace Fortran::runtime
#endif // FORTRAN_RUNTIME_REDUCTION_TEMPLATES_H_
diff --git a/flang/runtime/sum.cpp b/flang/runtime/sum.cpp
index 048399737c85..d2495e3e956f 100644
--- a/flang/runtime/sum.cpp
+++ b/flang/runtime/sum.cpp
@@ -175,7 +175,8 @@ void RTDEF(CppSumComplex10)(CppTypeFor<TypeCategory::Complex, 10> &result,
result = GetTotalReduction<TypeCategory::Complex, 10>(
x, source, line, dim, mask, ComplexSumAccumulator<long double>{x}, "SUM");
}
-#elif LDBL_MANT_DIG == 113
+#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
void RTDEF(CppSumComplex16)(CppTypeFor<TypeCategory::Complex, 16> &result,
const Descriptor &x, const char *source, int line, int dim,
const Descriptor *mask) {
diff --git a/flang/runtime/tools.h b/flang/runtime/tools.h
index 89e506999574..c1f89cadca06 100644
--- a/flang/runtime/tools.h
+++ b/flang/runtime/tools.h
@@ -266,7 +266,8 @@ inline RT_API_ATTRS RESULT ApplyIntegerKind(
}
}
-template <template <int KIND> class FUNC, typename RESULT, typename... A>
+template <template <int KIND> class FUNC, typename RESULT,
+ bool NEEDSMATH = false, typename... A>
inline RT_API_ATTRS RESULT ApplyFloatingPointKind(
int kind, Terminator &terminator, A &&...x) {
switch (kind) {
@@ -287,7 +288,13 @@ inline RT_API_ATTRS RESULT ApplyFloatingPointKind(
break;
case 16:
if constexpr (HasCppTypeFor<TypeCategory::Real, 16>) {
- return FUNC<16>{}(std::forward<A>(x)...);
+      // If FUNC implementation relies on FP math functions,
+ // then we should not be here. The compiler should have
+ // generated a call to an entry in FortranFloat128Math
+ // library.
+ if constexpr (!NEEDSMATH) {
+ return FUNC<16>{}(std::forward<A>(x)...);
+ }
}
break;
}
diff --git a/flang/test/Driver/linker-flags.f90 b/flang/test/Driver/linker-flags.f90
index 5e00520fcc09..4d3d528b5e99 100644
--- a/flang/test/Driver/linker-flags.f90
+++ b/flang/test/Driver/linker-flags.f90
@@ -2,15 +2,15 @@
! invocation. These libraries are added on top of other standard runtime
! libraries that the Clang driver will include.
-! RUN: %flang -### --target=ppc64le-linux-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX
-! RUN: %flang -### --target=aarch64-apple-darwin %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,DARWIN
-! RUN: %flang -### --target=sparc-sun-solaris2.11 %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX
-! RUN: %flang -### --target=x86_64-unknown-freebsd %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX
-! RUN: %flang -### --target=x86_64-unknown-netbsd %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX
-! RUN: %flang -### --target=x86_64-unknown-openbsd %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX
-! RUN: %flang -### --target=x86_64-unknown-dragonfly %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX
-! RUN: %flang -### --target=x86_64-unknown-haiku %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,HAIKU
-! RUN: %flang -### --target=x86_64-windows-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,MINGW
+! RUN: %flang -### --target=ppc64le-linux-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,UNIX-F128%f128-lib
+! RUN: %flang -### --target=aarch64-apple-darwin %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,DARWIN,DARWIN-F128%f128-lib
+! RUN: %flang -### --target=sparc-sun-solaris2.11 %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,SOLARIS-F128%f128-lib
+! RUN: %flang -### --target=x86_64-unknown-freebsd %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,UNIX-F128%f128-lib
+! RUN: %flang -### --target=x86_64-unknown-netbsd %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,UNIX-F128%f128-lib
+! RUN: %flang -### --target=x86_64-unknown-openbsd %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,UNIX-F128%f128-lib
+! RUN: %flang -### --target=x86_64-unknown-dragonfly %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,UNIX-F128%f128-lib
+! RUN: %flang -### --target=x86_64-unknown-haiku %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,HAIKU,HAIKU-F128%f128-lib
+! RUN: %flang -### --target=x86_64-windows-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,MINGW,MINGW-F128%f128-lib
! RUN: %flang -### --target=aarch64-unknown-linux-gnu %S/Inputs/hello.f90 -lFortran_main 2>&1 | FileCheck %s --check-prefixes=DEPRECATED
! NOTE: Clang's driver library, clangDriver, usually adds 'oldnames' on Windows,
@@ -29,21 +29,33 @@
! executable and may find the GNU linker from MinGW or Cygwin.
! UNIX-LABEL: "{{.*}}ld{{(\.exe)?}}"
! UNIX-SAME: "[[object_file]]"
-! UNIX-SAME: "--whole-archive" "-lFortran_main" "--no-whole-archive" "-lFortranRuntime" "-lFortranDecimal" "-lm"
+! UNIX-SAME: "--whole-archive" "-lFortran_main" "--no-whole-archive"
+! UNIX-F128NONE-NOT: FortranFloat128Math
+! SOLARIS-F128NONE-NOT: FortranFloat128Math
+! UNIX-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed"
+! SOLARIS-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "-z" "ignore" "-lquadmath" "-z" "record"
+! UNIX-SAME: "-lFortranRuntime" "-lFortranDecimal" "-lm"
! DARWIN-LABEL: "{{.*}}ld{{(\.exe)?}}"
! DARWIN-SAME: "[[object_file]]"
! DARWIN-SAME: -lFortran_main
+! DARWIN-F128NONE-NOT: FortranFloat128Math
+! DARWIN-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed"
! DARWIN-SAME: -lFortranRuntime
! DARWIN-SAME: -lFortranDecimal
! HAIKU-LABEL: "{{.*}}ld{{(\.exe)?}}"
! HAIKU-SAME: "[[object_file]]"
-! HAIKU-SAME: "--whole-archive" "-lFortran_main" "--no-whole-archive" "-lFortranRuntime" "-lFortranDecimal"
+! HAIKU-SAME: "--whole-archive" "-lFortran_main" "--no-whole-archive"
+! HAIKU-F128NONE-NOT: FortranFloat128Math
+! HAIKU-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed"
+! HAIKU-SAME: "-lFortranRuntime" "-lFortranDecimal"
! MINGW-LABEL: "{{.*}}ld{{(\.exe)?}}"
! MINGW-SAME: "[[object_file]]"
! MINGW-SAME: -lFortran_main
+! MINGW-F128NONE-NOT: FortranFloat128Math
+! MINGW-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed"
! MINGW-SAME: -lFortranRuntime
! MINGW-SAME: -lFortranDecimal
diff --git a/flang/test/Integration/OpenMP/copyprivate.f90 b/flang/test/Integration/OpenMP/copyprivate.f90
new file mode 100644
index 000000000000..d32319a18c28
--- /dev/null
+++ b/flang/test/Integration/OpenMP/copyprivate.f90
@@ -0,0 +1,97 @@
+!===----------------------------------------------------------------------===!
+! This directory can be used to add Integration tests involving multiple
+! stages of the compiler (for eg. from Fortran to LLVM IR). It should not
+! contain executable tests. We should only add tests here sparingly and only
+! if there is no other way to test. Repeat this message in each test that is
+! added to this directory and sub-directories.
+!===----------------------------------------------------------------------===!
+
+!RUN: %flang_fc1 -emit-llvm -fopenmp %s -o - | FileCheck %s
+
+!CHECK-DAG: define void @_copy_box_Uxi32(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_10xi32(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_i64(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_box_Uxi64(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_f32(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_2x3xf32(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_z32(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_10xz32(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_l32(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_5xl32(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_c8x8(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_10xc8x8(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_c16x5(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_rec__QFtest_typesTdt(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_box_heap_Uxi32(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_box_ptr_Uxc8x9(ptr %{{.*}}, ptr %{{.*}})
+
+!CHECK-LABEL: define void @_copy_i32(
+!CHECK-SAME: ptr %[[DST:.*]], ptr %[[SRC:.*]]){{.*}} {
+!CHECK-NEXT: %[[SRC_VAL:.*]] = load i32, ptr %[[SRC]]
+!CHECK-NEXT: store i32 %[[SRC_VAL]], ptr %[[DST]]
+!CHECK-NEXT: ret void
+!CHECK-NEXT: }
+
+!CHECK-LABEL: define internal void @test_scalar_..omp_par({{.*}})
+!CHECK: %[[I:.*]] = alloca i32, i64 1
+!CHECK: %[[J:.*]] = alloca i32, i64 1
+!CHECK: %[[DID_IT:.*]] = alloca i32
+!CHECK: store i32 0, ptr %[[DID_IT]]
+!CHECK: %[[THREAD_NUM1:.*]] = call i32 @__kmpc_global_thread_num(ptr @[[LOC:.*]])
+!CHECK: %[[RET:.*]] = call i32 @__kmpc_single({{.*}})
+!CHECK: %[[NOT_ZERO:.*]] = icmp ne i32 %[[RET]], 0
+!CHECK: br i1 %[[NOT_ZERO]], label %[[OMP_REGION_BODY:.*]], label %[[OMP_REGION_END:.*]]
+
+!CHECK: [[OMP_REGION_END]]:
+!CHECK: %[[THREAD_NUM2:.*]] = call i32 @__kmpc_global_thread_num(ptr @[[LOC:.*]])
+!CHECK: %[[DID_IT_VAL:.*]] = load i32, ptr %[[DID_IT]]
+!CHECK: call void @__kmpc_copyprivate(ptr @[[LOC]], i32 %[[THREAD_NUM2]], i64 0, ptr %[[I]], ptr @_copy_i32, i32 %[[DID_IT_VAL]])
+!CHECK: %[[THREAD_NUM3:.*]] = call i32 @__kmpc_global_thread_num(ptr @[[LOC]])
+!CHECK: %[[DID_IT_VAL2:.*]] = load i32, ptr %[[DID_IT]]
+!CHECK: call void @__kmpc_copyprivate(ptr @[[LOC]], i32 %[[THREAD_NUM3]], i64 0, ptr %[[J]], ptr @_copy_i32, i32 %[[DID_IT_VAL2]])
+
+!CHECK: [[OMP_REGION_BODY]]:
+!CHECK: br label %[[OMP_SINGLE_REGION:.*]]
+!CHECK: [[OMP_SINGLE_REGION]]:
+!CHECK: store i32 11, ptr %[[I]]
+!CHECK: store i32 22, ptr %[[J]]
+!CHECK: br label %[[OMP_REGION_CONT3:.*]]
+!CHECK: [[OMP_REGION_CONT3:.*]]:
+!CHECK: store i32 1, ptr %[[DID_IT]]
+!CHECK: call void @__kmpc_end_single(ptr @[[LOC]], i32 %[[THREAD_NUM1]])
+!CHECK: br label %[[OMP_REGION_END]]
+subroutine test_scalar()
+ integer :: i, j
+
+ !$omp parallel private(i, j)
+ !$omp single
+ i = 11
+ j = 22
+ !$omp end single copyprivate(i, j)
+ !$omp end parallel
+end subroutine
+
+subroutine test_types(a, n)
+ integer :: a(:), n
+ integer(4) :: i4, i4a(10)
+ integer(8) :: i8, i8a(n)
+ real :: r, ra(2, 3)
+ complex :: z, za(10)
+ logical :: l, la(5)
+ character(kind=1, len=8) :: c1, c1a(10)
+ character(kind=2, len=5) :: c2
+
+ type dt
+ integer :: i
+ real :: r
+ end type
+ type(dt) :: t
+
+ integer, allocatable :: aloc(:)
+ character(kind=1, len=9), pointer :: ptr(:)
+
+ !$omp parallel private(a, i4, i4a, i8, i8a, r, ra, z, za, l, la, c1, c1a, c2, t, aloc, ptr)
+ !$omp single
+ !$omp end single copyprivate(a, i4, i4a, i8, i8a, r, ra, z, za, l, la, c1, c1a, c2, t, aloc, ptr)
+ !$omp end parallel
+end subroutine
diff --git a/flang/test/Integration/iso-fortran-binding.cpp b/flang/test/Integration/iso-fortran-binding.cpp
new file mode 100644
index 000000000000..aaafd7cccd07
--- /dev/null
+++ b/flang/test/Integration/iso-fortran-binding.cpp
@@ -0,0 +1,33 @@
+// UNSUPPORTED: system-windows
+// RUN: split-file %s %t
+// RUN: chmod +x %t/runtest.sh
+// RUN: %t/runtest.sh %t %t/cppfile.cpp %flang | FileCheck %s
+
+//--- cppfile.cpp
+extern "C" {
+#include "ISO_Fortran_binding.h"
+}
+#include <iostream>
+
+int main() {
+ std::cout << "PASS\n";
+ return 0;
+}
+
+// CHECK: PASS
+// clang-format off
+//--- runtest.sh
+#!/bin/bash
+TMPDIR=$1
+CPPFILE=$2
+FLANG=$3
+BINDIR=`dirname $FLANG`
+CPPCOMP=$BINDIR/clang++
+if [ -x $CPPCOMP ]
+then
+ $CPPCOMP $CPPFILE -o $TMPDIR/a.out
+ $TMPDIR/a.out # should print "PASS"
+else
+ # No clang compiler, just pass by default
+ echo "PASS"
+fi
diff --git a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
new file mode 100644
index 000000000000..db628fe756b9
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
@@ -0,0 +1,51 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Test lowering of CUDA kernel loop directive.
+
+subroutine sub1()
+ integer :: i, j
+ integer, parameter :: n = 100
+ real :: a(n), b(n)
+ real :: c(n,n), d(n,n)
+
+! CHECK-LABEL: func.func @_QPsub1()
+! CHECK: %[[IV:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+ !$cuf kernel do <<< 1, 2 >>>
+ do i = 1, n
+ a(i) = a(i) * b(i)
+ end do
+
+! CHECK: %[[LB:.*]] = fir.convert %c1{{.*}} : (i32) -> index
+! CHECK: %[[UB:.*]] = fir.convert %c100{{.*}} : (i32) -> index
+! CHECK: %[[STEP:.*]] = arith.constant 1 : index
+! CHECK: fir.cuda_kernel<<<%c1_i32, %c2_i32>>> (%[[ARG0:.*]] : index) = (%[[LB]] : index) to (%[[UB]] : index) step (%[[STEP]] : index)
+! CHECK-NOT: fir.do_loop
+! CHECK: %[[ARG0_I32:.*]] = fir.convert %[[ARG0]] : (index) -> i32
+! CHECK: fir.store %[[ARG0_I32]] to %[[IV]]#1 : !fir.ref<i32>
+
+
+ !$cuf kernel do <<< *, * >>>
+ do i = 1, n
+ a(i) = a(i) * b(i)
+ end do
+
+! CHECK: fir.cuda_kernel<<<*, *>>> (%{{.*}} : index) = (%{{.*}} : index) to (%{{.*}} : index) step (%{{.*}} : index)
+
+ !$cuf kernel do(2) <<< 1, (256,1) >>>
+ do i = 1, n
+ do j = 1, n
+ c(i,j) = c(i,j) * d(i,j)
+ end do
+ end do
+
+! CHECK: fir.cuda_kernel<<<%c1{{.*}}, (%c256{{.*}}, %c1{{.*}})>>> (%{{.*}} : index, %{{.*}} : index) = (%{{.*}}, %{{.*}} : index, index) to (%{{.*}}, %{{.*}} : index, index) step (%{{.*}}, %{{.*}} : index, index)
+! CHECK: {n = 2 : i64}
+
+! TODO: currently these trigger error in the parser
+! !$cuf kernel do(2) <<< (1,*), (256,1) >>>
+! !$cuf kernel do(2) <<< (*,*), (32,4) >>>
+end
+
+
+
diff --git a/flang/test/Lower/CUDA/cuda-mod.cuf b/flang/test/Lower/CUDA/cuda-mod.cuf
new file mode 100644
index 000000000000..ae5bf63d2da4
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-mod.cuf
@@ -0,0 +1,15 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Simple module to test module use in CUDA Fortran lowering.
+
+module cuf_mod
+ real, device :: md
+
+contains
+ attributes(device) subroutine devsub()
+ end
+end module
+
+! CHECK: fir.global @_QMcuf_modEmd {cuda_attr = #fir.cuda<device>} : f32
+
+! CHECK: func.func @_QMcuf_modPdevsub() attributes {fir.cuda_attr = #fir.cuda_proc<device>}
diff --git a/flang/test/Lower/CUDA/cuda-module-use.cuf b/flang/test/Lower/CUDA/cuda-module-use.cuf
new file mode 100644
index 000000000000..f54083b026ee
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-module-use.cuf
@@ -0,0 +1,25 @@
+! RUN: bbc -emit-hlfir -fcuda %S/cuda-mod.cuf
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Test importing module containing variable and subroutine with CUDA attributes.
+
+subroutine sub1()
+ use cuf_mod
+ md = 1.0
+end
+
+! CHECK-LABEL: func.func @_QPsub1()
+! CHECK: %[[ADDR:.*]] = fir.address_of(@_QMcuf_modEmd) : !fir.ref<f32>
+! CHECK: %{{.*}}:2 = hlfir.declare %[[ADDR]] {cuda_attr = #fir.cuda<device>, uniq_name = "_QMcuf_modEmd"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+
+attributes(device) subroutine sub2()
+ use cuf_mod
+ call devsub()
+end
+
+! CHECK-LABEL: func.func @_QPsub2() attributes {fir.cuda_attr = #fir.cuda_proc<device>}
+! CHECK: fir.call @_QMcuf_modPdevsub()
+
+! CHECK-LABEL: fir.global @_QMcuf_modEmd {cuda_attr = #fir.cuda<device>} : f32
+
+! CHECK-LABEL: func.func private @_QMcuf_modPdevsub() attributes {fir.cuda_attr = #fir.cuda_proc<device>}
diff --git a/flang/test/Lower/HLFIR/allocatable-end-of-scope-dealloc.f90 b/flang/test/Lower/HLFIR/allocatable-end-of-scope-dealloc.f90
index ad4b015ef944..05cae6e5ba6c 100644
--- a/flang/test/Lower/HLFIR/allocatable-end-of-scope-dealloc.f90
+++ b/flang/test/Lower/HLFIR/allocatable-end-of-scope-dealloc.f90
@@ -224,7 +224,7 @@ contains
allocate(x)
end subroutine
end subroutine
-! CHECK-LABEL: func.func @_QFno_dealloc_host_assocPinternal
+! CHECK-LABEL: func.func private @_QFno_dealloc_host_assocPinternal
! CHECK-NOT: freemem
! CHECK-NOT: Deallocate
! CHECK: return
diff --git a/flang/test/Lower/HLFIR/bindc_internal_proc.f90 b/flang/test/Lower/HLFIR/bindc_internal_proc.f90
index 027c94f95a32..00e24c7016f1 100644
--- a/flang/test/Lower/HLFIR/bindc_internal_proc.f90
+++ b/flang/test/Lower/HLFIR/bindc_internal_proc.f90
@@ -3,7 +3,7 @@
! internal procedures.
! RUN: bbc -emit-hlfir %s -o - | FileCheck %s
-!CHECK: func.func @_QFsub1Pfoo(%{{.*}}: i32
+!CHECK: func.func private @_QFsub1Pfoo(%{{.*}}: i32
subroutine sub1()
call foo(42)
contains
@@ -13,7 +13,7 @@ contains
end subroutine
end subroutine
-!CHECK: func.func @_QFsub2Pfoo(%{{.*}}: i64
+!CHECK: func.func private @_QFsub2Pfoo(%{{.*}}: i64
subroutine sub2()
call foo(42_8)
contains
diff --git a/flang/test/Lower/HLFIR/internal-procedures-2.f90 b/flang/test/Lower/HLFIR/internal-procedures-2.f90
index bb05545bef1a..f1c4780954b2 100644
--- a/flang/test/Lower/HLFIR/internal-procedures-2.f90
+++ b/flang/test/Lower/HLFIR/internal-procedures-2.f90
@@ -23,7 +23,7 @@ contains
end forall
end subroutine
end subroutine
-! CHECK-LABEL: func.func @_QFhost_procedurePinternal_procedure(
+! CHECK-LABEL: func.func private @_QFhost_procedurePinternal_procedure(
! CHECK: fir.address_of(@_QMmodule_used_by_hostEindexed_by_var) : !fir.ref<!fir.array<2xi32>>
! CHECK: fir.address_of(@_QMmodule_used_by_hostEref_in_forall) : !fir.ref<!fir.array<2xi32>>
! CHECK: fir.address_of(@_QMmodule_used_by_hostEref_in_implied_do) : !fir.ref<i32>
diff --git a/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90 b/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90
new file mode 100644
index 000000000000..8645488290d7
--- /dev/null
+++ b/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90
@@ -0,0 +1,81 @@
+! Test lowering of internal procedure capturing OPTIONAL polymorphic
+! objects.
+! RUN: bbc -emit-hlfir --polymorphic-type -o - %s -I nw | FileCheck %s
+
+
+module captured_optional_polymorphic
+ type sometype
+ end type
+contains
+subroutine test(x, y)
+ class(sometype), optional :: x
+ class(sometype), optional :: y(2:)
+ call internal()
+contains
+ subroutine internal()
+ if (present(x).and.present(y)) then
+ print *, same_type_as(x, y)
+ end if
+ end subroutine
+end
+end module
+
+! CHECK-LABEL: func.func @_QMcaptured_optional_polymorphicPtest(
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare{{.*}}Ex
+! CHECK: %[[VAL_3:.*]] = arith.constant 2 : i64
+! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i64) -> index
+! CHECK: %[[VAL_5:.*]] = fir.shift %[[VAL_4]] : (index) -> !fir.shift<1>
+! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare{{.*}}Ey
+! CHECK: %[[VAL_7:.*]] = fir.alloca tuple<!fir.class<!fir.type<_QMcaptured_optional_polymorphicTsometype>>, !fir.class<!fir.array<?x!fir.type<_QMcaptured_optional_polymorphicTsometype>>>>
+! CHECK: %[[VAL_8:.*]] = arith.constant 0 : i32
+! CHECK: %[[VAL_9:.*]] = fir.coordinate_of %[[VAL_7]], %[[VAL_8]]
+! CHECK: %[[VAL_10:.*]] = fir.is_present %[[VAL_2]]#1 : (!fir.class<!fir.type<_QMcaptured_optional_polymorphicTsometype>>) -> i1
+! CHECK: fir.if %[[VAL_10]] {
+! CHECK: fir.store %[[VAL_2]]#1 to %[[VAL_9]] : !fir.ref<!fir.class<!fir.type<_QMcaptured_optional_polymorphicTsometype>>>
+! CHECK: } else {
+! CHECK: %[[VAL_11:.*]] = fir.zero_bits !fir.ref<!fir.type<_QMcaptured_optional_polymorphicTsometype>>
+! CHECK: %[[VAL_12:.*]] = fir.embox %[[VAL_11]] : (!fir.ref<!fir.type<_QMcaptured_optional_polymorphicTsometype>>) -> !fir.class<!fir.type<_QMcaptured_optional_polymorphicTsometype>>
+! CHECK: fir.store %[[VAL_12]] to %[[VAL_9]] : !fir.ref<!fir.class<!fir.type<_QMcaptured_optional_polymorphicTsometype>>>
+! CHECK: }
+! CHECK: %[[VAL_13:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_14:.*]] = fir.coordinate_of %[[VAL_7]], %[[VAL_13]]
+! CHECK: %[[VAL_15:.*]] = fir.is_present %[[VAL_6]]#1 : (!fir.class<!fir.array<?x!fir.type<_QMcaptured_optional_polymorphicTsometype>>>) -> i1
+! CHECK: fir.if %[[VAL_15]] {
+! CHECK: %[[VAL_16:.*]] = fir.shift %[[VAL_4]] : (index) -> !fir.shift<1>
+! CHECK: %[[VAL_17:.*]] = fir.rebox %[[VAL_6]]#1(%[[VAL_16]]) : (!fir.class<!fir.array<?x!fir.type<_QMcaptured_optional_polymorphicTsometype>>>, !fir.shift<1>) -> !fir.class<!fir.array<?x!fir.type<_QMcaptured_optional_polymorphicTsometype>>>
+! CHECK: fir.store %[[VAL_17]] to %[[VAL_14]] : !fir.ref<!fir.class<!fir.array<?x!fir.type<_QMcaptured_optional_polymorphicTsometype>>>>
+! CHECK: } else {
+! CHECK: %[[VAL_18:.*]] = fir.type_desc !fir.type<_QMcaptured_optional_polymorphicTsometype>
+! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_14]] : (!fir.ref<!fir.class<!fir.array<?x!fir.type<_QMcaptured_optional_polymorphicTsometype>>>>) -> !fir.ref<!fir.box<none>>
+! CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_18]] : (!fir.tdesc<!fir.type<_QMcaptured_optional_polymorphicTsometype>>) -> !fir.ref<none>
+! CHECK: %[[VAL_21:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_22:.*]] = arith.constant 0 : i32
+! CHECK: %[[VAL_23:.*]] = fir.call @_FortranAPointerNullifyDerived(%[[VAL_19]], %[[VAL_20]], %[[VAL_21]], %[[VAL_22]]) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> none
+! CHECK: }
+! CHECK: fir.call @_QMcaptured_optional_polymorphicFtestPinternal(%[[VAL_7]])
+
+! CHECK-LABEL: func.func{{.*}} @_QMcaptured_optional_polymorphicFtestPinternal(
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<tuple<{{.*}}>>
+! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32
+! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]]
+! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.ref<!fir.class<!fir.type<_QMcaptured_optional_polymorphicTsometype>>>
+! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.class<!fir.type<_QMcaptured_optional_polymorphicTsometype>>) -> !fir.ref<!fir.type<_QMcaptured_optional_polymorphicTsometype>>
+! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.ref<!fir.type<_QMcaptured_optional_polymorphicTsometype>>) -> i64
+! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64
+! CHECK: %[[VAL_7:.*]] = arith.cmpi ne, %[[VAL_5]], %[[VAL_6]] : i64
+! CHECK: %[[VAL_8:.*]] = fir.absent !fir.class<!fir.type<_QMcaptured_optional_polymorphicTsometype>>
+! CHECK: %[[VAL_9:.*]] = arith.select %[[VAL_7]], %[[VAL_3]], %[[VAL_8]] : !fir.class<!fir.type<_QMcaptured_optional_polymorphicTsometype>>
+! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_9]] {fortran_attrs = #fir.var_attrs<optional, host_assoc>, {{.*}}Ex
+! CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_12:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_11]]
+! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_12]] : !fir.ref<!fir.class<!fir.array<?x!fir.type<_QMcaptured_optional_polymorphicTsometype>>>>
+! CHECK: %[[VAL_14:.*]] = arith.constant 0 : index
+! CHECK: %[[VAL_15:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_14]]
+! CHECK: %[[VAL_16:.*]] = fir.box_addr %[[VAL_13]]
+! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (!fir.ref<!fir.array<?x!fir.type<_QMcaptured_optional_polymorphicTsometype>>>) -> i64
+! CHECK: %[[VAL_18:.*]] = arith.constant 0 : i64
+! CHECK: %[[VAL_19:.*]] = arith.cmpi ne, %[[VAL_17]], %[[VAL_18]] : i64
+! CHECK: %[[VAL_20:.*]] = fir.absent !fir.class<!fir.array<?x!fir.type<_QMcaptured_optional_polymorphicTsometype>>>
+! CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_19]], %[[VAL_13]], %[[VAL_20]] : !fir.class<!fir.array<?x!fir.type<_QMcaptured_optional_polymorphicTsometype>>>
+! CHECK: %[[VAL_22:.*]] = fir.shift %[[VAL_15]]#0 : (index) -> !fir.shift<1>
+! CHECK: %[[VAL_23:.*]]:2 = hlfir.declare %[[VAL_21]](%[[VAL_22]]) {fortran_attrs = #fir.var_attrs<optional, host_assoc>, {{.*}}Ey
diff --git a/flang/test/Lower/HLFIR/internal-procedures.f90 b/flang/test/Lower/HLFIR/internal-procedures.f90
index d517cb4345af..c898903b6fbe 100644
--- a/flang/test/Lower/HLFIR/internal-procedures.f90
+++ b/flang/test/Lower/HLFIR/internal-procedures.f90
@@ -9,8 +9,8 @@ subroutine internal
call takes_array(x)
end subroutine
end subroutine
-! CHECK-LABEL: func.func @_QFtest_explicit_shape_arrayPinternal(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<tuple<!fir.box<!fir.array<?xf32>>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func.func private @_QFtest_explicit_shape_arrayPinternal(
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<tuple<!fir.box<!fir.array<?xf32>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32
! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref<tuple<!fir.box<!fir.array<?xf32>>>>, i32) -> !fir.ref<!fir.box<!fir.array<?xf32>>>
! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
@@ -27,8 +27,8 @@ subroutine internal
call takes_array(x)
end subroutine
end subroutine
-! CHECK-LABEL: func.func @_QFtest_assumed_shapePinternal(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<tuple<!fir.box<!fir.array<?xf32>>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func.func private @_QFtest_assumed_shapePinternal(
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<tuple<!fir.box<!fir.array<?xf32>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32
! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref<tuple<!fir.box<!fir.array<?xf32>>>>, i32) -> !fir.ref<!fir.box<!fir.array<?xf32>>>
! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
@@ -44,8 +44,8 @@ subroutine internal()
call bar(c)
end subroutine
end subroutine
-! CHECK-LABEL: func.func @_QFtest_scalar_charPinternal(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<tuple<!fir.boxchar<1>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func.func private @_QFtest_scalar_charPinternal(
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<tuple<!fir.boxchar<1>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32
! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref<tuple<!fir.boxchar<1>>>, i32) -> !fir.ref<!fir.boxchar<1>>
! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.ref<!fir.boxchar<1>>
diff --git a/flang/test/Lower/Intrinsics/acos_real16.f90 b/flang/test/Lower/Intrinsics/acos_real16.f90
new file mode 100644
index 000000000000..2a09bfe94a80
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/acos_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranAAcosF128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = acos(a)
+end
diff --git a/flang/test/Lower/Intrinsics/acosh_real16.f90 b/flang/test/Lower/Intrinsics/acosh_real16.f90
new file mode 100644
index 000000000000..de787e3d2b0f
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/acosh_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranAAcoshF128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = acosh(a)
+end
diff --git a/flang/test/Lower/Intrinsics/aint_real16.f90 b/flang/test/Lower/Intrinsics/aint_real16.f90
new file mode 100644
index 000000000000..b8b80ea3097c
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/aint_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranATruncF128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = aint(a)
+end
diff --git a/flang/test/Lower/Intrinsics/anint_real16.f90 b/flang/test/Lower/Intrinsics/anint_real16.f90
new file mode 100644
index 000000000000..677240dc41df
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/anint_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranARoundF128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = anint(a)
+end
diff --git a/flang/test/Lower/Intrinsics/asin_real16.f90 b/flang/test/Lower/Intrinsics/asin_real16.f90
new file mode 100644
index 000000000000..cb32d0a6af70
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/asin_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranAAsinF128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = asin(a)
+end
diff --git a/flang/test/Lower/Intrinsics/asinh_real16.f90 b/flang/test/Lower/Intrinsics/asinh_real16.f90
new file mode 100644
index 000000000000..9ab16f19d933
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/asinh_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranAAsinhF128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = asinh(a)
+end
diff --git a/flang/test/Lower/Intrinsics/atan2_real16.f90 b/flang/test/Lower/Intrinsics/atan2_real16.f90
new file mode 100644
index 000000000000..5d0bf3069342
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/atan2_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranAAtan2F128({{.*}}){{.*}}: (f128, f128) -> f128
+ real(16) :: a, b
+ b = atan2(a, b)
+end
diff --git a/flang/test/Lower/Intrinsics/atan_real16.f90 b/flang/test/Lower/Intrinsics/atan_real16.f90
new file mode 100644
index 000000000000..5c0c262711c6
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/atan_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranAAtanF128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = atan(a)
+end
diff --git a/flang/test/Lower/Intrinsics/atanh_real16.f90 b/flang/test/Lower/Intrinsics/atanh_real16.f90
new file mode 100644
index 000000000000..0d60aecd08e1
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/atanh_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranAAtanhF128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = atanh(a)
+end
diff --git a/flang/test/Lower/Intrinsics/bessel_j0_real16.f90 b/flang/test/Lower/Intrinsics/bessel_j0_real16.f90
new file mode 100644
index 000000000000..f1c07f6137d8
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/bessel_j0_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranAJ0F128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = bessel_j0(a)
+end
diff --git a/flang/test/Lower/Intrinsics/bessel_j1_real16.f90 b/flang/test/Lower/Intrinsics/bessel_j1_real16.f90
new file mode 100644
index 000000000000..c41e7b5246ad
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/bessel_j1_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranAJ1F128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = bessel_j1(a)
+end
diff --git a/flang/test/Lower/Intrinsics/bessel_jn_real16.f90 b/flang/test/Lower/Intrinsics/bessel_jn_real16.f90
new file mode 100644
index 000000000000..1ec9cc719a41
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/bessel_jn_real16.f90
@@ -0,0 +1,10 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranAJnF128({{.*}}){{.*}}: (i32, f128) -> f128
+ integer :: n
+ real(16) :: a, b
+ b = bessel_jn(n, a)
+end
diff --git a/flang/test/Lower/Intrinsics/bessel_y0_real16.f90 b/flang/test/Lower/Intrinsics/bessel_y0_real16.f90
new file mode 100644
index 000000000000..459d2f4e7315
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/bessel_y0_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranAY0F128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = bessel_y0(a)
+end
diff --git a/flang/test/Lower/Intrinsics/bessel_y1_real16.f90 b/flang/test/Lower/Intrinsics/bessel_y1_real16.f90
new file mode 100644
index 000000000000..869b2dc2c961
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/bessel_y1_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranAY1F128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = bessel_y1(a)
+end
diff --git a/flang/test/Lower/Intrinsics/bessel_yn_real16.f90 b/flang/test/Lower/Intrinsics/bessel_yn_real16.f90
new file mode 100644
index 000000000000..53be23ab82d1
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/bessel_yn_real16.f90
@@ -0,0 +1,10 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranAYnF128({{.*}}){{.*}}: (i32, f128) -> f128
+ integer :: n
+ real(16) :: a, b
+ b = bessel_yn(n, a)
+end
diff --git a/flang/test/Lower/Intrinsics/cabs_real16.f90 b/flang/test/Lower/Intrinsics/cabs_real16.f90
new file mode 100644
index 000000000000..363b154a8e7f
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/cabs_real16.f90
@@ -0,0 +1,10 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranACAbsF128({{.*}}){{.*}}: (!fir.complex<16>) -> f128
+ complex(16) :: a
+ real(16) :: b
+ b = abs(a)
+end
diff --git a/flang/test/Lower/Intrinsics/ceiling_real16.f90 b/flang/test/Lower/Intrinsics/ceiling_real16.f90
new file mode 100644
index 000000000000..21dc221d6b35
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/ceiling_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranACeilF128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = ceiling(a)
+end
diff --git a/flang/test/Lower/Intrinsics/cos_real16.f90 b/flang/test/Lower/Intrinsics/cos_real16.f90
new file mode 100644
index 000000000000..859d4a5671e8
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/cos_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranACosF128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = cos(a)
+end
diff --git a/flang/test/Lower/Intrinsics/cosh_real16.f90 b/flang/test/Lower/Intrinsics/cosh_real16.f90
new file mode 100644
index 000000000000..cab85365661e
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/cosh_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranACoshF128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = cosh(a)
+end
diff --git a/flang/test/Lower/Intrinsics/erf_real16.f90 b/flang/test/Lower/Intrinsics/erf_real16.f90
new file mode 100644
index 000000000000..da4081694617
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/erf_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranAErfF128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = erf(a)
+end
diff --git a/flang/test/Lower/Intrinsics/erfc_real16.f90 b/flang/test/Lower/Intrinsics/erfc_real16.f90
new file mode 100644
index 000000000000..7e3daa27768c
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/erfc_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranAErfcF128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = erfc(a)
+end
diff --git a/flang/test/Lower/Intrinsics/exp_real16.f90 b/flang/test/Lower/Intrinsics/exp_real16.f90
new file mode 100644
index 000000000000..aed7319a9eb2
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/exp_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranAExpF128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = exp(a)
+end
diff --git a/flang/test/Lower/Intrinsics/floor_real16.f90 b/flang/test/Lower/Intrinsics/floor_real16.f90
new file mode 100644
index 000000000000..536c14106dd6
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/floor_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranAFloorF128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = floor(a)
+end
diff --git a/flang/test/Lower/Intrinsics/gamma_real16.f90 b/flang/test/Lower/Intrinsics/gamma_real16.f90
new file mode 100644
index 000000000000..aabf7fb73f0b
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/gamma_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranATgammaF128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = gamma(a)
+end
diff --git a/flang/test/Lower/Intrinsics/hypot_real16.f90 b/flang/test/Lower/Intrinsics/hypot_real16.f90
new file mode 100644
index 000000000000..753148ede29c
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/hypot_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranAHypotF128({{.*}}){{.*}}: (f128, f128) -> f128
+ real(16) :: a, b
+ b = hypot(a, b)
+end
diff --git a/flang/test/Lower/Intrinsics/log10_real16.f90 b/flang/test/Lower/Intrinsics/log10_real16.f90
new file mode 100644
index 000000000000..3a6e1d1af911
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/log10_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranALog10F128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = log10(a)
+end
diff --git a/flang/test/Lower/Intrinsics/log_gamma_real16.f90 b/flang/test/Lower/Intrinsics/log_gamma_real16.f90
new file mode 100644
index 000000000000..771ec4e5d931
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/log_gamma_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranALgammaF128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = log_gamma(a)
+end
diff --git a/flang/test/Lower/Intrinsics/log_real16.f90 b/flang/test/Lower/Intrinsics/log_real16.f90
new file mode 100644
index 000000000000..a57b8cc8e969
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/log_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranALogF128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = log(a)
+end
diff --git a/flang/test/Lower/Intrinsics/missing-math-runtime.f90 b/flang/test/Lower/Intrinsics/missing-math-runtime.f90
index ff767ba18faa..699678fcf2bc 100644
--- a/flang/test/Lower/Intrinsics/missing-math-runtime.f90
+++ b/flang/test/Lower/Intrinsics/missing-math-runtime.f90
@@ -1,10 +1,8 @@
-! There is no quad math runtime available in lowering
-! for now. Test that the TODO are emitted correctly.
-! FIXME: the lit config has to flip a feature flag so that
-! the tests can use different checks depending on whether
-! REAL(16) math support is enabled or not.
-! XFAIL: *
-! RUN: bbc -emit-fir %s -o /dev/null 2>&1 | FileCheck %s
+! If the compiler is built without 128-bit float math
+! support, an appropriate error message is emitted.
+! UNSUPPORTED: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o /dev/null >%t 2>&1 || echo
+! RUN: FileCheck %s --input-file=%t
complex(16) :: a
real(16) :: b
diff --git a/flang/test/Lower/Intrinsics/nint_real16.f90 b/flang/test/Lower/Intrinsics/nint_real16.f90
new file mode 100644
index 000000000000..c4bbacd0347c
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/nint_real16.f90
@@ -0,0 +1,13 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranALroundF128({{.*}}){{.*}}: (f128) -> i32
+! CHECK: fir.call @_FortranALlroundF128({{.*}}){{.*}}: (f128) -> i64
+ real(16) :: a
+ integer(4) :: b
+ integer(8) :: c
+ b = nint(a, 4)
+ c = nint(a, 8)
+end
diff --git a/flang/test/Lower/Intrinsics/norm2.f90 b/flang/test/Lower/Intrinsics/norm2.f90
index f14cad59d5bd..0d125e36f665 100644
--- a/flang/test/Lower/Intrinsics/norm2.f90
+++ b/flang/test/Lower/Intrinsics/norm2.f90
@@ -76,3 +76,19 @@ subroutine norm2_test_dim_3(a,r)
! CHECK-DAG: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.heap<!fir.array<?x?xf32>>
! CHECK-DAG: fir.freemem %[[addr]]
end subroutine norm2_test_dim_3
+
+! CHECK-LABEL: func @_QPnorm2_test_real16(
+! CHECK-SAME: %[[arg0:.*]]: !fir.box<!fir.array<?x?x?xf128>>{{.*}}, %[[arg1:.*]]: !fir.box<!fir.array<?x?xf128>>{{.*}})
+subroutine norm2_test_real16(a,r)
+ real(16) :: a(:,:,:)
+ real(16) :: r(:,:)
+ ! CHECK-DAG: %[[dim:.*]] = arith.constant 3 : i32
+ ! CHECK-DAG: %[[r:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xf128>>>
+ ! CHECK-DAG: %[[res:.*]] = fir.convert %[[r]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf128>>>>) -> !fir.ref<!fir.box<none>>
+ ! CHECK: %[[arr:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?x?x?xf128>>) -> !fir.box<none>
+ r = norm2(a,dim=3)
+ ! CHECK: %{{.*}} = fir.call @_FortranANorm2DimReal16(%[[res]], %[[arr]], %[[dim]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32, !fir.ref<i8>, i32) -> none
+ ! CHECK: %[[box:.*]] = fir.load %[[r]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf128>>>>
+ ! CHECK-DAG: %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box<!fir.heap<!fir.array<?x?xf128>>>) -> !fir.heap<!fir.array<?x?xf128>>
+ ! CHECK-DAG: fir.freemem %[[addr]]
+end subroutine norm2_test_real16
diff --git a/flang/test/Lower/Intrinsics/pow_real16.f90 b/flang/test/Lower/Intrinsics/pow_real16.f90
new file mode 100644
index 000000000000..869422381401
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/pow_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranAPowF128({{.*}}){{.*}}: (f128, f128) -> f128
+ real(16) :: a, b
+ b = a ** b
+end
diff --git a/flang/test/Lower/Intrinsics/powi_real16.f90 b/flang/test/Lower/Intrinsics/powi_real16.f90
new file mode 100644
index 000000000000..9e7d0f828b5c
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/powi_real16.f90
@@ -0,0 +1,22 @@
+! RUN: bbc -emit-fir %s -o - | FileCheck %s --check-prefix=CHECK-FAST
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s --check-prefix=CHECK-PRECISE
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s --check-prefix=CHECK-FAST
+
+! CHECK-PRECISE: fir.call @_FortranAFPow16i({{.*}}){{.*}}: (f128, i32) -> f128
+! CHECK-PRECISE: fir.call @_FortranAFPow16i({{.*}}){{.*}}: (f128, i32) -> f128
+! CHECK-PRECISE: fir.call @_FortranAFPow16i({{.*}}){{.*}}: (f128, i32) -> f128
+! CHECK-PRECISE: fir.call @_FortranAFPow16k({{.*}}){{.*}}: (f128, i64) -> f128
+! CHECK-FAST: math.fpowi {{.*}} : f128, i32
+! CHECK-FAST: math.fpowi {{.*}} : f128, i32
+! CHECK-FAST: math.fpowi {{.*}} : f128, i32
+! CHECK-FAST: math.fpowi {{.*}} : f128, i64
+ real(16) :: a
+ integer(1) :: e1
+ integer(2) :: e2
+ integer(4) :: e3
+ integer(8) :: e4
+ a = a ** e1
+ a = a ** e2
+ a = a ** e3
+ a = a ** e4
+end
diff --git a/flang/test/Lower/Intrinsics/random.f90 b/flang/test/Lower/Intrinsics/random.f90
index ca194befd027..4fb1a9a5da27 100644
--- a/flang/test/Lower/Intrinsics/random.f90
+++ b/flang/test/Lower/Intrinsics/random.f90
@@ -45,7 +45,7 @@ subroutine random_test_2
call foo(size)
call bar(size, get)
contains
- ! CHECK-LABEL: func @_QFrandom_test_2Pfoo
+ ! CHECK-LABEL: func private @_QFrandom_test_2Pfoo
subroutine foo(size, put, get)
! CHECK: [[s1:%[0-9]+]] = fir.is_present %arg0
! CHECK: [[s2:%[0-9]+]] = fir.embox %arg0
@@ -70,7 +70,7 @@ contains
print*, size
end subroutine
- ! CHECK-LABEL: func @_QFrandom_test_2Pbar
+ ! CHECK-LABEL: func private @_QFrandom_test_2Pbar
subroutine bar(size, get, put)
integer, optional :: size
! CHECK: [[p1:%[0-9]+]] = fir.is_present %arg2
diff --git a/flang/test/Lower/Intrinsics/sin_real16.f90 b/flang/test/Lower/Intrinsics/sin_real16.f90
new file mode 100644
index 000000000000..d09c71fb03e2
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/sin_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranASinF128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = sin(a)
+end
diff --git a/flang/test/Lower/Intrinsics/sinh_real16.f90 b/flang/test/Lower/Intrinsics/sinh_real16.f90
new file mode 100644
index 000000000000..79aa71940aff
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/sinh_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranASinhF128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = sinh(a)
+end
diff --git a/flang/test/Lower/Intrinsics/sqrt_real16.f90 b/flang/test/Lower/Intrinsics/sqrt_real16.f90
new file mode 100644
index 000000000000..ac08c8c79e9a
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/sqrt_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranASqrtF128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = sqrt(a)
+end
diff --git a/flang/test/Lower/Intrinsics/tan_real16.f90 b/flang/test/Lower/Intrinsics/tan_real16.f90
new file mode 100644
index 000000000000..62aa403fe58c
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/tan_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranATanF128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = tan(a)
+end
diff --git a/flang/test/Lower/Intrinsics/tanh_real16.f90 b/flang/test/Lower/Intrinsics/tanh_real16.f90
new file mode 100644
index 000000000000..aff7e7b31d75
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/tanh_real16.f90
@@ -0,0 +1,9 @@
+! REQUIRES: flang-supports-f128-math
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranATanhF128({{.*}}){{.*}}: (f128) -> f128
+ real(16) :: a, b
+ b = tanh(a)
+end
diff --git a/flang/test/Lower/Intrinsics/ubound01.f90 b/flang/test/Lower/Intrinsics/ubound01.f90
index 4ebe3870cf0b..df51d79eb6af 100644
--- a/flang/test/Lower/Intrinsics/ubound01.f90
+++ b/flang/test/Lower/Intrinsics/ubound01.f90
@@ -16,7 +16,7 @@ contains
End Subroutine
end
-! CHECK-LABEL: func.func @_QFPs2
+! CHECK-LABEL: func.func private @_QFPs2
! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x?xf32>>
! CHECK: %[[BOX:.*]] = fir.rebox %[[ARG0]](%{{.*}}) : (!fir.box<!fir.array<?x?xf32>>, !fir.shift<2>) -> !fir.box<!fir.array<?x?xf32>>
! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[BOX]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.box<none>
diff --git a/flang/test/Lower/OpenACC/acc-routine04.f90 b/flang/test/Lower/OpenACC/acc-routine04.f90
index b5f5aa2ca488..2339c23eaaf8 100644
--- a/flang/test/Lower/OpenACC/acc-routine04.f90
+++ b/flang/test/Lower/OpenACC/acc-routine04.f90
@@ -31,4 +31,4 @@ end program
! CHECK: acc.routine @acc_routine_0 func(@_QMdummy_modPsub1) seq
! CHECK: func.func @_QMdummy_modPsub1(%arg0: !fir.ref<i32> {fir.bindc_name = "i"}) attributes {acc.routine_info = #acc.routine_info<[@acc_routine_0]>}
! CHECK: func.func @_QQmain() attributes {fir.bindc_name = "test_acc_routine"}
-! CHECK: func.func @_QFPsub2() attributes {acc.routine_info = #acc.routine_info<[@acc_routine_1]>}
+! CHECK: func.func private @_QFPsub2() attributes {acc.routine_info = #acc.routine_info<[@acc_routine_1]>, llvm.linkage = #llvm.linkage<internal>}
diff --git a/flang/test/Lower/OpenMP/FIR/delayed-privatization-firstprivate.f90 b/flang/test/Lower/OpenMP/FIR/delayed-privatization-firstprivate.f90
new file mode 100644
index 000000000000..122542345f10
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/delayed-privatization-firstprivate.f90
@@ -0,0 +1,29 @@
+! Test delayed privatization for the `private` clause.
+
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
+
+subroutine delayed_privatization_firstprivate
+ implicit none
+ integer :: var1
+
+!$OMP PARALLEL FIRSTPRIVATE(var1)
+ var1 = 10
+!$OMP END PARALLEL
+end subroutine
+
+! CHECK-LABEL: omp.private {type = firstprivate}
+! CHECK-SAME: @[[VAR1_PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<i32>):
+! CHECK-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_firstprivateEvar1"}
+! CHECK-NEXT: omp.yield(%[[PRIV_ALLOC]] : !fir.ref<i32>)
+! CHECK: } copy {
+! CHECK: ^bb0(%[[PRIV_ORIG_ARG:.*]]: !fir.ref<i32>, %[[PRIV_PRIV_ARG:.*]]: !fir.ref<i32>):
+! CHECK: %[[ORIG_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG]] : !fir.ref<i32>
+! CHECK: fir.store %[[ORIG_VAL]] to %[[PRIV_PRIV_ARG]] : !fir.ref<i32>
+! CHECK: omp.yield(%[[PRIV_PRIV_ARG]] : !fir.ref<i32>)
+! CHECK: }
+
+! CHECK-LABEL: @_QPdelayed_privatization_firstprivate
+! CHECK: omp.parallel private(@[[VAR1_PRIVATIZER_SYM]] %{{.*}} -> %{{.*}} : !fir.ref<i32>) {
+! CHECK: omp.terminator
+
diff --git a/flang/test/Lower/OpenMP/FIR/delayed-privatization-private.f90 b/flang/test/Lower/OpenMP/FIR/delayed-privatization-private.f90
new file mode 100644
index 000000000000..2e9995ea1fd4
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/delayed-privatization-private.f90
@@ -0,0 +1,38 @@
+! Test delayed privatization for the `private` clause.
+
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
+
+subroutine delayed_privatization_private
+ implicit none
+ integer :: var1
+
+!$OMP PARALLEL PRIVATE(var1)
+ var1 = 10
+!$OMP END PARALLEL
+
+!$OMP PARALLEL PRIVATE(var1)
+ var1 = 20
+!$OMP END PARALLEL
+
+end subroutine
+
+! CHECK-LABEL: omp.private {type = private}
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<i32>):
+! CHECK-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_privateEvar1"}
+! CHECK-NEXT: omp.yield(%[[PRIV_ALLOC]] : !fir.ref<i32>)
+! CHECK-NOT: } copy {
+
+! CHECK-LABEL: @_QPdelayed_privatization_private
+! CHECK: %[[ORIG_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", uniq_name = "_QFdelayed_privatization_privateEvar1"}
+! CHECK: omp.parallel private(@[[PRIVATIZER_SYM]] %[[ORIG_ALLOC]] -> %[[PAR_ARG:.*]] : !fir.ref<i32>) {
+! CHECK: %[[C10:.*]] = arith.constant 10 : i32
+! CHECK: fir.store %[[C10]] to %[[PAR_ARG]] : !fir.ref<i32>
+! CHECK: omp.terminator
+
+! Test that the same privatizer is used if the a variable with the same type and
+! name was previously privatized.
+! CHECK: omp.parallel private(@[[PRIVATIZER_SYM]] %[[ORIG_ALLOC]] -> %[[PAR_ARG:.*]] : !fir.ref<i32>) {
+! CHECK: %[[C20:.*]] = arith.constant 20 : i32
+! CHECK: fir.store %[[C20]] to %[[PAR_ARG]] : !fir.ref<i32>
+! CHECK: omp.terminator
diff --git a/flang/test/Lower/OpenMP/FIR/threadprivate-use-association-2.f90 b/flang/test/Lower/OpenMP/FIR/threadprivate-use-association-2.f90
index 14c0dff8da4b..6db5735c21f1 100644
--- a/flang/test/Lower/OpenMP/FIR/threadprivate-use-association-2.f90
+++ b/flang/test/Lower/OpenMP/FIR/threadprivate-use-association-2.f90
@@ -17,7 +17,7 @@ end
! CHECK: return
! CHECK: }
!
-! CHECK-LABEL: func.func @_QMm2FtestPinternal_test() {
+! CHECK-LABEL: func.func private @_QMm2FtestPinternal_test() {{.*}} {
! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QMmEx) : !fir.ref<i32>
! CHECK: %[[VAL_1:.*]] = omp.threadprivate %[[VAL_0]] : !fir.ref<i32> -> !fir.ref<i32>
! CHECK: fir.call @_QPbar(%[[VAL_1]]) {{.*}}: (!fir.ref<i32>) -> ()
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-firstprivate.f90 b/flang/test/Lower/OpenMP/delayed-privatization-firstprivate.f90
new file mode 100644
index 000000000000..e3d2a5a8af26
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-firstprivate.f90
@@ -0,0 +1,29 @@
+! Test delayed privatization for the `firstprivate` clause.
+
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
+
+subroutine delayed_privatization_firstprivate
+ implicit none
+ integer :: var1
+
+!$omp parallel firstprivate(var1)
+ var1 = 10
+!$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = firstprivate}
+! CHECK-SAME: @[[VAR1_PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<i32>):
+! CHECK-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_firstprivateEvar1"}
+! CHECK-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]] {uniq_name = "_QFdelayed_privatization_firstprivateEvar1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK-NEXT: omp.yield(%[[PRIV_DECL]]#0 : !fir.ref<i32>)
+! CHECK: } copy {
+! CHECK: ^bb0(%[[PRIV_ORIG_ARG:.*]]: !fir.ref<i32>, %[[PRIV_PRIV_ARG:.*]]: !fir.ref<i32>):
+! CHECK: %[[ORIG_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG]] : !fir.ref<i32>
+! CHECK: hlfir.assign %[[ORIG_VAL]] to %[[PRIV_PRIV_ARG]] temporary_lhs : i32, !fir.ref<i32>
+! CHECK: omp.yield(%[[PRIV_PRIV_ARG]] : !fir.ref<i32>)
+! CHECK: }
+
+! CHECK-LABEL: @_QPdelayed_privatization_firstprivate
+! CHECK: omp.parallel private(@[[VAR1_PRIVATIZER_SYM]] %{{.*}} -> %{{.*}} : !fir.ref<i32>) {
+! CHECK: omp.terminator
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-private-firstprivate.f90 b/flang/test/Lower/OpenMP/delayed-privatization-private-firstprivate.f90
new file mode 100644
index 000000000000..46eef6eb3bcf
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-private-firstprivate.f90
@@ -0,0 +1,34 @@
+! Test delayed privatization for both `private` and `firstprivate` clauses.
+
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
+
+subroutine delayed_privatization_private_firstprivate
+ implicit none
+ integer :: var1
+ integer :: var2
+
+!$omp parallel private(var1) firstprivate(var2)
+ var1 = 10
+ var2 = var1 + var2
+!$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = firstprivate}
+! CHECK-SAME: @[[VAR2_PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+! CHECK: } copy {
+! CHECK: }
+
+! CHECK-LABEL: omp.private {type = private}
+! CHECK-SAME: @[[VAR1_PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+! CHECK: }
+
+! CHECK-LABEL: func.func @_QPdelayed_privatization_private_firstprivate() {
+! CHECK: %[[VAR1_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1"
+! CHECK: %[[VAR1_DECL:.*]]:2 = hlfir.declare %[[VAR1_ALLOC]]
+
+! CHECK: %[[VAR2_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var2"
+! CHECK: %[[VAR2_DECL:.*]]:2 = hlfir.declare %[[VAR2_ALLOC]]
+
+! CHECK: omp.parallel private(
+! CHECK-SAME: @[[VAR1_PRIVATIZER_SYM]] %[[VAR1_DECL]]#0 -> %{{.*}} : !fir.ref<i32>,
+! CHECK-SAME: @[[VAR2_PRIVATIZER_SYM]] %[[VAR2_DECL]]#0 -> %{{.*}} : !fir.ref<i32>) {
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-private.f90 b/flang/test/Lower/OpenMP/delayed-privatization-private.f90
new file mode 100644
index 000000000000..240e0e71bfcd
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-private.f90
@@ -0,0 +1,28 @@
+! Test delayed privatization for the `private` clause.
+
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
+
+subroutine delayed_privatization_private
+ implicit none
+ integer :: var1
+
+!$omp parallel private(var1)
+ var1 = 10
+!$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = private}
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<i32>):
+! CHECK-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_privateEvar1"}
+! CHECK-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]] {uniq_name = "_QFdelayed_privatization_privateEvar1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK-NEXT: omp.yield(%[[PRIV_DECL]]#0 : !fir.ref<i32>)
+! CHECK-NOT: } copy {
+
+! CHECK-LABEL: @_QPdelayed_privatization_private
+! CHECK: %[[ORIG_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", uniq_name = "_QFdelayed_privatization_privateEvar1"}
+! CHECK: %[[ORIG_DECL:.*]]:2 = hlfir.declare %[[ORIG_ALLOC]] {uniq_name = "_QFdelayed_privatization_privateEvar1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: omp.parallel private(@[[PRIVATIZER_SYM]] %[[ORIG_DECL]]#0 -> %[[PAR_ARG:.*]] : !fir.ref<i32>) {
+! CHECK: %[[PAR_ARG_DECL:.*]]:2 = hlfir.declare %[[PAR_ARG]] {uniq_name = "_QFdelayed_privatization_privateEvar1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: hlfir.assign %{{.*}} to %[[PAR_ARG_DECL]]#0 : i32, !fir.ref<i32>
+! CHECK: omp.terminator
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-reduction.f90 b/flang/test/Lower/OpenMP/delayed-privatization-reduction.f90
new file mode 100644
index 000000000000..c61f352b9b05
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-reduction.f90
@@ -0,0 +1,30 @@
+! Test that reductions and delayed privatization work properly together. Since
+! both types of clauses add block arguments to the OpenMP region, we make sure
+! that the block arguments are added in the proper order (reductions first and
+! then delayed privatization).
+
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
+
+subroutine red_and_delayed_private
+ integer :: red
+ integer :: prv
+
+ red = 0
+ prv = 10
+
+ !$omp parallel reduction(+:red) private(prv)
+ red = red + 1
+ prv = 20
+ !$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = private}
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+
+! CHECK-LABEL: omp.reduction.declare
+! CHECK-SAME: @[[REDUCTION_SYM:.*]] : i32 init
+
+! CHECK-LABEL: _QPred_and_delayed_private
+! CHECK: omp.parallel
+! CHECK-SAME: reduction(@[[REDUCTION_SYM]] %{{.*}} -> %arg0 : !fir.ref<i32>)
+! CHECK-SAME: private(@[[PRIVATIZER_SYM]] %{{.*}} -> %arg1 : !fir.ref<i32>) {
diff --git a/flang/test/Lower/OpenMP/threadprivate-commonblock-use.f90 b/flang/test/Lower/OpenMP/threadprivate-commonblock-use.f90
index 28616f7595a0..71f1c7608a2c 100644
--- a/flang/test/Lower/OpenMP/threadprivate-commonblock-use.f90
+++ b/flang/test/Lower/OpenMP/threadprivate-commonblock-use.f90
@@ -15,7 +15,7 @@ contains
subroutine ss1
use m0
contains
-!CHECK-LABEL: func @_QMm1Fss1Pss2
+!CHECK-LABEL: func private @_QMm1Fss1Pss2
!CHECK: %[[CMN:.*]] = fir.address_of(@cmn_) : !fir.ref<!fir.array<4xi8>>
!CHECK: omp.parallel
!CHECK: %{{.*}} = omp.threadprivate %[[CMN]] : !fir.ref<!fir.array<4xi8>> -> !fir.ref<!fir.array<4xi8>>
diff --git a/flang/test/Lower/OpenMP/threadprivate-use-association-2-hlfir.f90 b/flang/test/Lower/OpenMP/threadprivate-use-association-2-hlfir.f90
index 722f023fbefc..79a1ce9897f2 100644
--- a/flang/test/Lower/OpenMP/threadprivate-use-association-2-hlfir.f90
+++ b/flang/test/Lower/OpenMP/threadprivate-use-association-2-hlfir.f90
@@ -19,7 +19,7 @@ end
! CHECK: return
! CHECK: }
-! CHECK-LABEL: func.func @_QMm2FtestPinternal_test() {
+! CHECK-LABEL: func.func private @_QMm2FtestPinternal_test() {{.*}} {
! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QMmEx) : !fir.ref<i32>
! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QMmEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[VAL_2:.*]] = omp.threadprivate %[[VAL_1]]#1 : !fir.ref<i32> -> !fir.ref<i32>
diff --git a/flang/test/Lower/PowerPC/ppc-vector-types.f90 b/flang/test/Lower/PowerPC/ppc-vector-types.f90
index be293f873ecb..4745e4567b2d 100644
--- a/flang/test/Lower/PowerPC/ppc-vector-types.f90
+++ b/flang/test/Lower/PowerPC/ppc-vector-types.f90
@@ -44,7 +44,7 @@
! CHECK-LLVM-NEXT: store <512 x i1> %[[RESQ]], ptr @_QFEvq2, align 64
contains
- ! CHECK-LLVM-LABEL: define <4 x i32> @_QFPtest_vec_integer_assign
+ ! CHECK-LLVM-LABEL: define internal <4 x i32> @_QFPtest_vec_integer_assign
function test_vec_integer_assign(arg1)
! CHECK-LLVM: %[[FUNC_RES:.*]] = alloca <4 x i32>, i64 1, align 16
vector(integer(4)) :: arg1, test_vec_integer_assign
@@ -58,7 +58,7 @@
! CHECK-LLVM-NEXT: ret <4 x i32> %[[RET]]
end function test_vec_integer_assign
- ! CHECK-LLVM-LABEL: define <2 x double> @_QFPtest_vec_real_assign
+ ! CHECK-LLVM-LABEL: define internal <2 x double> @_QFPtest_vec_real_assign
function test_vec_real_assign(arg1)
! CHECK-LLVM: %[[FUNC_RES:.*]] = alloca <2 x double>, i64 1, align 16
vector(real(8)) :: arg1, test_vec_real_assign
@@ -72,7 +72,7 @@
! CHECK-LLVM-NEXT: ret <2 x double> %[[RET]]
end function test_vec_real_assign
- ! CHECK-LLVM-LABEL: define <8 x i16> @_QFPtest_vec_unsigned_assign
+ ! CHECK-LLVM-LABEL: define internal <8 x i16> @_QFPtest_vec_unsigned_assign
function test_vec_unsigned_assign(arg1)
! CHECK-LLVM: %[[FUNC_RES:.*]] = alloca <8 x i16>, i64 1, align 16
vector(unsigned(2)) :: arg1, test_vec_unsigned_assign
@@ -86,7 +86,7 @@
! CHECK-LLVM-NEXT: ret <8 x i16> %[[RET]]
end function test_vec_unsigned_assign
- ! CHECK-LLVM-LABEL: define <256 x i1> @_QFPtest_vec_pair_assign
+ ! CHECK-LLVM-LABEL: define internal <256 x i1> @_QFPtest_vec_pair_assign
function test_vec_pair_assign(arg1)
! CHECK-LLVM: %[[FUNC_RES:.*]] = alloca <256 x i1>, i64 1, align 32
__vector_pair :: arg1, test_vec_pair_assign
@@ -100,7 +100,7 @@
! CHECK-LLVM-NEXT: ret <256 x i1> %[[RET]]
end function test_vec_pair_assign
- ! CHECK-LLVM-LABEL: define <512 x i1> @_QFPtest_vec_quad_assign
+ ! CHECK-LLVM-LABEL: define internal <512 x i1> @_QFPtest_vec_quad_assign
function test_vec_quad_assign(arg1)
! CHECK-LLVM: %[[FUNC_RES:.*]] = alloca <512 x i1>, i64 1, align 64
__vector_quad :: arg1, test_vec_quad_assign
diff --git a/flang/test/Lower/array-temp.f90 b/flang/test/Lower/array-temp.f90
index 347d4cef78bc..10c5ee91d44b 100644
--- a/flang/test/Lower/array-temp.f90
+++ b/flang/test/Lower/array-temp.f90
@@ -404,7 +404,7 @@ subroutine tt1
! CHECK-NEXT: fir.call @_FortranAioEndIoStatement
print*, [(r([7.0]),i=1,3)]
contains
- ! CHECK-LABEL: func @_QFtt1Pr
+ ! CHECK-LABEL: func private @_QFtt1Pr
function r(x)
real x(:)
r = x(1)
diff --git a/flang/test/Lower/dummy-arguments.f90 b/flang/test/Lower/dummy-arguments.f90
index 43d8e3c1e5d4..331e089a60fa 100644
--- a/flang/test/Lower/dummy-arguments.f90
+++ b/flang/test/Lower/dummy-arguments.f90
@@ -9,7 +9,7 @@ program test1
call foo(10)
contains
-! CHECK-LABEL: func @_QFPfoo
+! CHECK-LABEL: func private @_QFPfoo
subroutine foo(avar1)
integer :: avar1
! integer :: my_data, my_data2
diff --git a/flang/test/Lower/dummy-procedure-character.f90 b/flang/test/Lower/dummy-procedure-character.f90
index cecd839287ed..72d548513fb2 100644
--- a/flang/test/Lower/dummy-procedure-character.f90
+++ b/flang/test/Lower/dummy-procedure-character.f90
@@ -213,7 +213,7 @@ subroutine host(f)
! CHECK: fir.call @_QFhostPintern(%[[VAL_1]])
call intern()
contains
-! CHECK-LABEL: func @_QFhostPintern(
+! CHECK-LABEL: func private @_QFhostPintern(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<tuple<tuple<!fir.boxproc<() -> ()>, i64>>> {fir.host_assoc})
subroutine intern()
! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32
@@ -242,7 +242,7 @@ subroutine host2(f)
! CHECK: fir.call @_QFhost2Pintern(%[[VAL_1]])
call intern()
contains
-! CHECK-LABEL: func @_QFhost2Pintern(
+! CHECK-LABEL: func private @_QFhost2Pintern(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<tuple<tuple<!fir.boxproc<() -> ()>, i64>>> {fir.host_assoc})
subroutine intern()
! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.char<1,42> {bindc_name = ".result"}
diff --git a/flang/test/Lower/equivalence-with-host-assoc.f90 b/flang/test/Lower/equivalence-with-host-assoc.f90
index ec84fb506314..0ffb1bc5bf9e 100644
--- a/flang/test/Lower/equivalence-with-host-assoc.f90
+++ b/flang/test/Lower/equivalence-with-host-assoc.f90
@@ -10,7 +10,7 @@ contains
i1 = j1
end subroutine inner
end subroutine test1
-! FIR-LABEL: func.func @_QFtest1Pinner() attributes {fir.internal_proc} {
+! FIR-LABEL: func.func private @_QFtest1Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
! FIR: %[[VAL_0:.*]] = fir.address_of(@_QFtest1Ei1) : !fir.ref<!fir.array<1xi32>>
! FIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.array<1xi32>>) -> !fir.ref<!fir.array<4xi8>>
! FIR: %[[VAL_2:.*]] = arith.constant 0 : index
@@ -24,7 +24,7 @@ end subroutine test1
! FIR: return
! FIR: }
-! HLFIR-LABEL: func.func @_QFtest1Pinner() attributes {fir.internal_proc} {
+! HLFIR-LABEL: func.func private @_QFtest1Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
! HLFIR: %[[VAL_0:.*]] = fir.address_of(@_QFtest1Ei1) : !fir.ref<!fir.array<1xi32>>
! HLFIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.array<1xi32>>) -> !fir.ref<!fir.array<4xi8>>
! HLFIR: %[[VAL_2:.*]] = arith.constant 0 : index
@@ -54,7 +54,7 @@ contains
end subroutine inner
end subroutine host
end module test2
-! FIR-LABEL: func.func @_QMtest2FhostPinner() attributes {fir.internal_proc} {
+! FIR-LABEL: func.func private @_QMtest2FhostPinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
! FIR: %[[VAL_0:.*]] = fir.address_of(@_QMtest2FhostEf1) : !fir.ref<!fir.array<1xi32>>
! FIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.array<1xi32>>) -> !fir.ref<!fir.array<4xi8>>
! FIR: %[[VAL_2:.*]] = arith.constant 0 : index
@@ -68,7 +68,7 @@ end module test2
! FIR: return
! FIR: }
-! HLFIR-LABEL: func.func @_QMtest2FhostPinner() attributes {fir.internal_proc} {
+! HLFIR-LABEL: func.func private @_QMtest2FhostPinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
! HLFIR: %[[VAL_0:.*]] = fir.address_of(@_QMtest2FhostEf1) : !fir.ref<!fir.array<1xi32>>
! HLFIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.array<1xi32>>) -> !fir.ref<!fir.array<4xi8>>
! HLFIR: %[[VAL_2:.*]] = arith.constant 0 : index
@@ -94,7 +94,7 @@ contains
i1 = j1 + k1
end subroutine inner
end subroutine test3
-! FIR-LABEL: func.func @_QFtest3Pinner() attributes {fir.internal_proc} {
+! FIR-LABEL: func.func private @_QFtest3Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
! FIR: %[[VAL_0:.*]] = fir.address_of(@blk_) : !fir.ref<tuple<i32>>
! FIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<tuple<i32>>) -> !fir.ref<!fir.array<?xi8>>
! FIR: %[[VAL_2:.*]] = arith.constant 0 : index
@@ -115,7 +115,7 @@ end subroutine test3
! FIR: return
! FIR: }
-! HLFIR-LABEL: func.func @_QFtest3Pinner() attributes {fir.internal_proc} {
+! HLFIR-LABEL: func.func private @_QFtest3Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
! HLFIR: %[[VAL_0:.*]] = fir.address_of(@blk_) : !fir.ref<tuple<i32>>
! HLFIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<tuple<i32>>) -> !fir.ref<!fir.array<?xi8>>
! HLFIR: %[[VAL_2:.*]] = arith.constant 0 : index
@@ -149,7 +149,7 @@ contains
i1 = j1 + k1
end subroutine inner
end subroutine test4
-! FIR-LABEL: func.func @_QFtest4Pinner() attributes {fir.internal_proc} {
+! FIR-LABEL: func.func private @_QFtest4Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
! FIR: %[[VAL_0:.*]] = fir.address_of(@blk_) : !fir.ref<tuple<i32>>
! FIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<tuple<i32>>) -> !fir.ref<!fir.array<?xi8>>
! FIR: %[[VAL_2:.*]] = arith.constant 0 : index
@@ -170,7 +170,7 @@ end subroutine test4
! FIR: return
! FIR: }
-! HLFIR-LABEL: func.func @_QFtest4Pinner() attributes {fir.internal_proc} {
+! HLFIR-LABEL: func.func private @_QFtest4Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
! HLFIR: %[[VAL_0:.*]] = fir.address_of(@blk_) : !fir.ref<tuple<i32>>
! HLFIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<tuple<i32>>) -> !fir.ref<!fir.array<?xi8>>
! HLFIR: %[[VAL_2:.*]] = arith.constant 0 : index
diff --git a/flang/test/Lower/explicit-interface-results-2.f90 b/flang/test/Lower/explicit-interface-results-2.f90
index 4e4b035bae7e..86aae720e7fc 100644
--- a/flang/test/Lower/explicit-interface-results-2.f90
+++ b/flang/test/Lower/explicit-interface-results-2.f90
@@ -69,8 +69,8 @@ subroutine host4()
integer :: n
call internal_proc_a()
contains
-! CHECK-LABEL: func @_QFhost4Pinternal_proc_a
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func private @_QFhost4Pinternal_proc_a
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
subroutine internal_proc_a()
call takes_array(return_array())
! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32
@@ -94,7 +94,7 @@ subroutine host5()
implicit none
call internal_proc_a()
contains
-! CHECK-LABEL: func @_QFhost5Pinternal_proc_a() attributes {fir.internal_proc} {
+! CHECK-LABEL: func private @_QFhost5Pinternal_proc_a() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
subroutine internal_proc_a()
call takes_array(return_array())
! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QMsome_moduleEn_module) : !fir.ref<i32>
@@ -115,7 +115,7 @@ subroutine host6()
implicit none
call internal_proc_a()
contains
-! CHECK-LABEL: func @_QFhost6Pinternal_proc_a
+! CHECK-LABEL: func private @_QFhost6Pinternal_proc_a
subroutine internal_proc_a()
call takes_array(return_array())
! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QMsome_moduleEn_module) : !fir.ref<i32>
@@ -187,7 +187,7 @@ subroutine host9()
common /mycom/ n_common
call internal_proc_a()
contains
-! CHECK-LABEL: func @_QFhost9Pinternal_proc_a
+! CHECK-LABEL: func private @_QFhost9Pinternal_proc_a
subroutine internal_proc_a()
! CHECK: %[[VAL_0:.*]] = arith.constant 0 : index
! CHECK: %[[VAL_1:.*]] = fir.address_of(@mycom_) : !fir.ref<!fir.array<4xi8>>
@@ -213,7 +213,7 @@ subroutine host10()
implicit none
call internal_proc_a()
contains
-! CHECK-LABEL: func @_QFhost10Pinternal_proc_a
+! CHECK-LABEL: func private @_QFhost10Pinternal_proc_a
subroutine internal_proc_a()
call takes_array(return_array())
! CHECK: %[[VAL_0:.*]] = arith.constant 0 : index
diff --git a/flang/test/Lower/forall/array-constructor.f90 b/flang/test/Lower/forall/array-constructor.f90
index 083a71ba479a..ad21ed33fba2 100644
--- a/flang/test/Lower/forall/array-constructor.f90
+++ b/flang/test/Lower/forall/array-constructor.f90
@@ -114,8 +114,8 @@ end subroutine ac1
! CHECK: return
! CHECK: }
-! CHECK-LABEL: func @_QFac1Pfunc(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"}) -> i32 {
+! CHECK-LABEL: func private @_QFac1Pfunc(
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"}) -> i32 {{.*}} {
! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "func", uniq_name = "_QFac1FfuncEfunc"}
! CHECK: %[[VAL_2:.*]] = arith.constant 1 : i64
! CHECK: %[[VAL_3:.*]] = arith.constant 1 : i64
@@ -259,8 +259,8 @@ end subroutine ac2
! CHECK: return
! CHECK: }
-! CHECK-LABEL: func @_QFac2Pfunc(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"}) -> !fir.array<3xi32> {
+! CHECK-LABEL: func private @_QFac2Pfunc(
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"}) -> !fir.array<3xi32> {{.*}} {
! CHECK: %[[VAL_1:.*]] = arith.constant 3 : index
! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.array<3xi32> {bindc_name = "func", uniq_name = "_QFac2FfuncEfunc"}
! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
diff --git a/flang/test/Lower/forall/character-1.f90 b/flang/test/Lower/forall/character-1.f90
index e5c40a16420a..e97c3f36b0b1 100644
--- a/flang/test/Lower/forall/character-1.f90
+++ b/flang/test/Lower/forall/character-1.f90
@@ -17,7 +17,7 @@ contains
end subroutine sub
end program test
-! CHECK-LABEL: define void @_QFPsub(
+! CHECK-LABEL: define internal void @_QFPsub(
! CHECK-SAME: ptr %[[arg:.*]])
! CHECK: %[[extent:.*]] = getelementptr { {{.*}}, [1 x [3 x i64]] }, ptr %[[arg]], i32 0, i32 7, i64 0, i32 1
! CHECK: %[[extval:.*]] = load i64, ptr %[[extent]]
diff --git a/flang/test/Lower/global-initialization.f90 b/flang/test/Lower/global-initialization.f90
index dd60a6fd8b9f..ff208ecc3c89 100644
--- a/flang/test/Lower/global-initialization.f90
+++ b/flang/test/Lower/global-initialization.f90
@@ -4,16 +4,19 @@ program bar
! CHECK: fir.address_of(@[[name1:.*]]my_data)
integer, save :: my_data = 1
print *, my_data
+ call foo()
+ call foo2()
+ call foo3()
contains
-! CHECK-LABEL: func @_QFPfoo
+! CHECK-LABEL: func private @_QFPfoo
subroutine foo()
! CHECK: fir.address_of(@[[name2:.*foo.*my_data]])
integer, save :: my_data = 2
print *, my_data + 1
end subroutine
-! CHECK-LABEL: func @_QFPfoo2
+! CHECK-LABEL: func private @_QFPfoo2
subroutine foo2()
! CHECK: fir.address_of(@[[name3:.*foo2.*my_data]])
integer, save :: my_data
@@ -21,7 +24,7 @@ subroutine foo2()
print *, my_data
end subroutine
-! CHECK-LABEL: func @_QFPfoo3
+! CHECK-LABEL: func private @_QFPfoo3
subroutine foo3()
! CHECK-DAG: fir.address_of(@[[name4:.*foo3.*idata]]){{.*}}fir.array<5xi32>
! CHECK-DAG: fir.address_of(@[[name5:.*foo3.*rdata]]){{.*}}fir.array<3xf16>
diff --git a/flang/test/Lower/host-associated-functions.f90 b/flang/test/Lower/host-associated-functions.f90
index 77a51490950f..78d081748c2f 100644
--- a/flang/test/Lower/host-associated-functions.f90
+++ b/flang/test/Lower/host-associated-functions.f90
@@ -19,8 +19,8 @@ subroutine capture_char_func_dummy(char_func_dummy, n)
! CHECK: fir.call @_QFcapture_char_func_dummyPinternal(%[[VAL_2]]) {{.*}}: (!fir.ref<tuple<tuple<!fir.boxproc<() -> ()>, i64>, !fir.ref<i32>>>) -> ()
call internal()
contains
- ! CHECK-LABEL: func @_QFcapture_char_func_dummyPinternal(
- ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<tuple<tuple<!fir.boxproc<() -> ()>, i64>, !fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+ ! CHECK-LABEL: func private @_QFcapture_char_func_dummyPinternal(
+ ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<tuple<tuple<!fir.boxproc<() -> ()>, i64>, !fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
subroutine internal()
! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32
! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref<tuple<tuple<!fir.boxproc<() -> ()>, i64>, !fir.ref<i32>>>, i32) -> !fir.ref<tuple<!fir.boxproc<() -> ()>, i64>>
@@ -55,8 +55,8 @@ subroutine capture_char_func_assumed_dummy(char_func_dummy)
! CHECK: fir.call @_QFcapture_char_func_assumed_dummyPinternal(%[[VAL_1]]) {{.*}}: (!fir.ref<tuple<tuple<!fir.boxproc<() -> ()>, i64>>>) -> ()
call internal()
contains
-! CHECK-LABEL: func @_QFcapture_char_func_assumed_dummyPinternal(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<tuple<tuple<!fir.boxproc<() -> ()>, i64>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func private @_QFcapture_char_func_assumed_dummyPinternal(
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<tuple<tuple<!fir.boxproc<() -> ()>, i64>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
subroutine internal()
! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32
! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref<tuple<tuple<!fir.boxproc<() -> ()>, i64>>>, i32) -> !fir.ref<tuple<!fir.boxproc<() -> ()>, i64>>
@@ -84,7 +84,7 @@ subroutine capture_char_func(n)
! CHECK: fir.call @_QFcapture_char_funcPinternal(%[[VAL_1]]) {{.*}}: (!fir.ref<tuple<!fir.ref<i32>>>) -> ()
call internal()
contains
-! CHECK-LABEL: func @_QFcapture_char_funcPinternal(
+! CHECK-LABEL: func private @_QFcapture_char_funcPinternal(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc})
subroutine internal()
print *, char_func()
@@ -109,8 +109,8 @@ subroutine capture_array_func(n)
call internal()
contains
subroutine internal()
-! CHECK-LABEL: func @_QFcapture_array_funcPinternal(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func private @_QFcapture_array_funcPinternal(
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32
! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref<tuple<!fir.ref<i32>>>, i32) -> !fir.llvm_ptr<!fir.ref<i32>>
! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.llvm_ptr<!fir.ref<i32>>
@@ -146,7 +146,7 @@ subroutine use_module()
! CHECK: fir.call @_QFuse_modulePinternal() {{.*}}: () -> ()
call internal()
contains
-! CHECK-LABEL: func @_QFuse_modulePinternal() {
+! CHECK-LABEL: func private @_QFuse_modulePinternal() {{.*}} {
subroutine internal()
print *, return_char(42)
end subroutine
diff --git a/flang/test/Lower/host-associated-globals.f90 b/flang/test/Lower/host-associated-globals.f90
index bb22a3277542..fe612e777aea 100644
--- a/flang/test/Lower/host-associated-globals.f90
+++ b/flang/test/Lower/host-associated-globals.f90
@@ -18,7 +18,7 @@ contains
print *, j_in_equiv, not_in_equiv
end subroutine
end subroutine
-! CHECK-LABEL: func.func @_QFmodule_varPbar()
+! CHECK-LABEL: func.func private @_QFmodule_varPbar()
! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QMtest_mod_used_in_hostEi) : !fir.ref<!fir.array<4xi8>>
! CHECK: %[[VAL_1:.*]] = arith.constant 0 : index
! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref<!fir.array<4xi8>>, index) -> !fir.ref<i8>
@@ -37,7 +37,7 @@ contains
print *, j_in_equiv, not_in_equiv
end subroutine
end subroutine
-! CHECK-LABEL: func.func @_QFtest_commonPbar() attributes {fir.internal_proc} {
+! CHECK-LABEL: func.func private @_QFtest_commonPbar() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
! CHECK: %[[VAL_0:.*]] = fir.address_of(@x_) : !fir.ref<!fir.array<12xi8>>
! CHECK: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.array<12xi8>>) -> !fir.ref<!fir.array<?xi8>>
! CHECK: %[[VAL_2:.*]] = arith.constant 4 : index
@@ -59,7 +59,7 @@ contains
print *, j_in_equiv, not_in_equiv
end subroutine
end subroutine
-! CHECK-LABEL: func.func @_QFsaved_equivPbar() attributes {fir.internal_proc} {
+! CHECK-LABEL: func.func private @_QFsaved_equivPbar() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFsaved_equivEi) : !fir.ref<!fir.array<8xi8>>
! CHECK: %[[VAL_1:.*]] = arith.constant 4 : index
! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref<!fir.array<8xi8>>, index) -> !fir.ref<i8>
@@ -79,8 +79,8 @@ contains
call test(saved_j, j)
end subroutine
end subroutine
-! CHECK-LABEL: func.func @_QFmixed_capturePbar(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func.func private @_QFmixed_capturePbar(
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
! CHECK: %[[VAL_1:.*]] = fir.address_of(@_QFmixed_captureEsaved_i) : !fir.ref<!fir.array<4xi8>>
! CHECK: %[[VAL_2:.*]] = arith.constant 0 : index
! CHECK: %[[VAL_3:.*]] = fir.coordinate_of %[[VAL_1]], %[[VAL_2]] : (!fir.ref<!fir.array<4xi8>>, index) -> !fir.ref<i8>
diff --git a/flang/test/Lower/host-associated.f90 b/flang/test/Lower/host-associated.f90
index 25e637805e87..f88903c8af80 100644
--- a/flang/test/Lower/host-associated.f90
+++ b/flang/test/Lower/host-associated.f90
@@ -19,8 +19,8 @@ subroutine test1
call test1_internal
print *, i
contains
- ! CHECK-LABEL: func @_QFtest1Ptest1_internal(
- ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+ ! CHECK-LABEL: func private @_QFtest1Ptest1_internal(
+ ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
! CHECK: %[[iaddr:.*]] = fir.coordinate_of %[[arg]], %c0
! CHECK: %[[i:.*]] = fir.load %[[iaddr]] : !fir.llvm_ptr<!fir.ref<i32>>
! CHECK: %[[val:.*]] = fir.call @_QPifoo() {{.*}}: () -> i32
@@ -46,8 +46,8 @@ subroutine test2
call test2_internal
print *, a, b
contains
- ! CHECK-LABEL: func @_QFtest2Ptest2_internal(
- ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref<tuple<!fir.ref<f32>, !fir.ref<f32>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+ ! CHECK-LABEL: func private @_QFtest2Ptest2_internal(
+ ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref<tuple<!fir.ref<f32>, !fir.ref<f32>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
subroutine test2_internal
! CHECK: %[[a:.*]] = fir.coordinate_of %[[arg]], %c0
! CHECK: %[[aa:.*]] = fir.load %[[a]] : !fir.llvm_ptr<!fir.ref<f32>>
@@ -61,8 +61,8 @@ contains
call test2_inner
end subroutine test2_internal
- ! CHECK-LABEL: func @_QFtest2Ptest2_inner(
- ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref<tuple<!fir.ref<f32>, !fir.ref<f32>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+ ! CHECK-LABEL: func private @_QFtest2Ptest2_inner(
+ ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref<tuple<!fir.ref<f32>, !fir.ref<f32>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
subroutine test2_inner
! CHECK: %[[a:.*]] = fir.coordinate_of %[[arg]], %c0
! CHECK: %[[aa:.*]] = fir.load %[[a]] : !fir.llvm_ptr<!fir.ref<f32>>
@@ -95,8 +95,8 @@ subroutine test6(c)
print *, c
contains
- ! CHECK-LABEL: func @_QFtest6Ptest6_inner(
- ! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.boxchar<1>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+ ! CHECK-LABEL: func private @_QFtest6Ptest6_inner(
+ ! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.boxchar<1>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
subroutine test6_inner
! CHECK: %[[coor:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref<tuple<!fir.boxchar<1>>>, i32) -> !fir.ref<!fir.boxchar<1>>
! CHECK: %[[load:.*]] = fir.load %[[coor]] : !fir.ref<!fir.boxchar<1>>
@@ -137,8 +137,8 @@ subroutine test3(p,q,i)
end if
contains
- ! CHECK-LABEL: func @_QFtest3Ptest3_inner(
- ! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+ ! CHECK-LABEL: func private @_QFtest3Ptest3_inner(
+ ! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
subroutine test3_inner
! CHECK: %[[pcoor:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref<tuple<!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>>>, i32) -> !fir.ref<!fir.box<!fir.array<?xf32>>>
! CHECK: %[[p:.*]] = fir.load %[[pcoor]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
@@ -184,8 +184,8 @@ subroutine test3a(p)
end if
contains
- ! CHECK: func @_QFtest3aPtest3a_inner(
- ! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.box<!fir.array<10xf32>>, !fir.box<!fir.array<10xf32>>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+ ! CHECK: func private @_QFtest3aPtest3a_inner(
+ ! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.box<!fir.array<10xf32>>, !fir.box<!fir.array<10xf32>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
subroutine test3a_inner
! CHECK: %[[pcoor:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref<tuple<!fir.box<!fir.array<10xf32>>, !fir.box<!fir.array<10xf32>>>>, i32) -> !fir.ref<!fir.box<!fir.array<10xf32>>>
! CHECK: %[[p:.*]] = fir.load %[[pcoor]] : !fir.ref<!fir.box<!fir.array<10xf32>>>
@@ -228,8 +228,8 @@ subroutine test4
end if
contains
- ! CHECK-LABEL: func @_QFtest4Ptest4_inner(
- ! CHECK-SAME:%[[tup:.*]]: !fir.ref<tuple<!fir.ref<!fir.box<!fir.ptr<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+ ! CHECK-LABEL: func private @_QFtest4Ptest4_inner(
+ ! CHECK-SAME:%[[tup:.*]]: !fir.ref<tuple<!fir.ref<!fir.box<!fir.ptr<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
subroutine test4_inner
! CHECK: %[[ptup:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref<tuple<!fir.ref<!fir.box<!fir.ptr<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>>>, i32) -> !fir.llvm_ptr<!fir.ref<!fir.box<!fir.ptr<f32>>>>
! CHECK: %[[p:.*]] = fir.load %[[ptup]] : !fir.llvm_ptr<!fir.ref<!fir.box<!fir.ptr<f32>>>>
@@ -270,8 +270,8 @@ subroutine test5
end if
contains
- ! CHECK-LABEL: func @_QFtest5Ptest5_inner(
- ! CHECK-SAME:%[[tup:.*]]: !fir.ref<tuple<!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+ ! CHECK-LABEL: func private @_QFtest5Ptest5_inner(
+ ! CHECK-SAME:%[[tup:.*]]: !fir.ref<tuple<!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
subroutine test5_inner
! CHECK: %[[ptup:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref<tuple<!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>>>, i32) -> !fir.llvm_ptr<!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>>
! CHECK: %[[p:.*]] = fir.load %[[ptup]] : !fir.llvm_ptr<!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>>
@@ -308,8 +308,8 @@ subroutine test7(j, k)
k = test7_inner(k)
contains
-! CHECK-LABEL: func @_QFtest7Ptest7_inner(
-! CHECK-SAME: %[[i:.*]]: !fir.ref<i32>{{.*}}, %[[tup:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) -> i32 attributes {fir.internal_proc} {
+! CHECK-LABEL: func private @_QFtest7Ptest7_inner(
+! CHECK-SAME: %[[i:.*]]: !fir.ref<i32>{{.*}}, %[[tup:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) -> i32 attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
elemental integer function test7_inner(i)
implicit none
integer, intent(in) :: i
@@ -329,8 +329,8 @@ subroutine issue990()
integer :: captured
call bar()
contains
-! CHECK-LABEL: func @_QFissue990Pbar(
-! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func private @_QFissue990Pbar(
+! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
subroutine bar()
integer :: stmt_func, i
stmt_func(i) = i + captured
@@ -351,8 +351,8 @@ subroutine issue990b()
captured_stmt_func(i) = i + captured
call bar()
contains
-! CHECK-LABEL: func @_QFissue990bPbar(
-! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func private @_QFissue990bPbar(
+! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
subroutine bar()
! CHECK: %[[tupAddr:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref<tuple<!fir.ref<i32>>>, i32) -> !fir.llvm_ptr<!fir.ref<i32>>
! CHECK: %[[addr:.*]] = fir.load %[[tupAddr]] : !fir.llvm_ptr<!fir.ref<i32>>
@@ -372,8 +372,8 @@ subroutine test8(dummy_proc)
end interface
call bar()
contains
-! CHECK-LABEL: func @_QFtest8Pbar(
-! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.boxproc<() -> ()>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func private @_QFtest8Pbar(
+! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.boxproc<() -> ()>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
subroutine bar()
! CHECK: %[[tupAddr:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref<tuple<!fir.boxproc<() -> ()>>>, i32) -> !fir.ref<!fir.boxproc<() -> ()>>
! CHECK: %[[dummyProc:.*]] = fir.load %[[tupAddr]] : !fir.ref<!fir.boxproc<() -> ()>>
@@ -392,8 +392,8 @@ subroutine test9(dummy_proc)
end interface
call bar()
contains
-! CHECK-LABEL: func @_QFtest9Pbar(
-! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.boxproc<() -> ()>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func private @_QFtest9Pbar(
+! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.boxproc<() -> ()>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
subroutine bar()
! CHECK: %[[tupAddr:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref<tuple<!fir.boxproc<() -> ()>>>, i32) -> !fir.ref<!fir.boxproc<() -> ()>>
! CHECK: %[[dummyProc:.*]] = fir.load %[[tupAddr]] : !fir.ref<!fir.boxproc<() -> ()>>
@@ -415,8 +415,8 @@ subroutine test10(i)
! CHECK: fir.call @_QFtest10Pbar(%[[tup]]) {{.*}}: (!fir.ref<tuple<!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>>>) -> ()
call bar()
contains
-! CHECK-LABEL: func @_QFtest10Pbar(
-! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func private @_QFtest10Pbar(
+! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
subroutine bar()
! CHECK: %[[tupAddr:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref<tuple<!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>>>, i32) -> !fir.llvm_ptr<!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>>
! CHECK: fir.load %[[tupAddr]] : !fir.llvm_ptr<!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>>
@@ -433,9 +433,9 @@ end subroutine
! CHECK: %[[VAL_8:.*]] = fir.emboxproc %[[VAL_7]], %[[VAL_5]] : ((!fir.ref<i32>, !fir.ref<tuple<!fir.ref<i32>>>) -> (), !fir.ref<tuple<!fir.ref<i32>>>) -> !fir.boxproc<() -> ()>
! CHECK: fir.call @_QPtest_proc_dummy_other(%[[VAL_8]]) {{.*}}: (!fir.boxproc<() -> ()>) -> ()
-! CHECK-LABEL: func @_QFtest_proc_dummyPtest_proc_dummy_a(
+! CHECK-LABEL: func private @_QFtest_proc_dummyPtest_proc_dummy_a(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "j"},
-! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
! CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32
! CHECK: %[[VAL_3:.*]] = fir.coordinate_of %[[VAL_1]], %[[VAL_2]] : (!fir.ref<tuple<!fir.ref<i32>>>, i32) -> !fir.llvm_ptr<!fir.ref<i32>>
! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]] : !fir.llvm_ptr<!fir.ref<i32>>
@@ -525,10 +525,10 @@ end subroutine test_proc_dummy_other
! CHECK: return
! CHECK: }
-! CHECK-LABEL: func @_QFtest_proc_dummy_charPgen_message(
+! CHECK-LABEL: func private @_QFtest_proc_dummy_charPgen_message(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.char<1,10>>,
! CHECK-SAME: %[[VAL_1:.*]]: index,
-! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<tuple<!fir.boxchar<1>>> {fir.host_assoc}) -> !fir.boxchar<1> attributes {fir.internal_proc} {
+! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<tuple<!fir.boxchar<1>>> {fir.host_assoc}) -> !fir.boxchar<1> attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
! CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : i32
! CHECK-DAG: %[[VAL_4:.*]] = arith.constant 10 : index
! CHECK-DAG: %[[VAL_5:.*]] = arith.constant false
diff --git a/flang/test/Lower/module-and-internal-proc.f90 b/flang/test/Lower/module-and-internal-proc.f90
index 1da5ce422939..0f4c6809581c 100644
--- a/flang/test/Lower/module-and-internal-proc.f90
+++ b/flang/test/Lower/module-and-internal-proc.f90
@@ -17,7 +17,7 @@ end subroutine
subroutine test2()
call test2internal()
contains
- ! CHECK-LABEL: func @_QMparentFtest2Ptest2internal()
+ ! CHECK-LABEL: func private @_QMparentFtest2Ptest2internal()
subroutine test2internal()
! CHECK: fir.address_of(@_QMparentEi) : !fir.ref<i32>
print *, i
@@ -31,7 +31,7 @@ subroutine test3()
use parent
call test3internal()
contains
- ! CHECK-LABEL: func @_QFtest3Ptest3internal()
+ ! CHECK-LABEL: func private @_QFtest3Ptest3internal()
subroutine test3internal()
! CHECK: fir.address_of(@_QMparentEi) : !fir.ref<i32>
print *, i
diff --git a/flang/test/Lower/parent-component.f90 b/flang/test/Lower/parent-component.f90
index ed1130a08493..c6bc53340643 100644
--- a/flang/test/Lower/parent-component.f90
+++ b/flang/test/Lower/parent-component.f90
@@ -29,20 +29,20 @@ contains
type(p), intent(in) :: a
print*, a
end subroutine
- ! CHECK-LABEL: func.func @_QFPprint_scalar(%{{.*}}: !fir.ref<!fir.type<_QFTp{a:i32}>> {fir.bindc_name = "a"})
+ ! CHECK-LABEL: func.func private @_QFPprint_scalar(%{{.*}}: !fir.ref<!fir.type<_QFTp{a:i32}>> {fir.bindc_name = "a"})
subroutine print_p(a)
type(p), intent(in) :: a(2)
print*, a
end subroutine
- ! CHECK-LABEL: func.func @_QFPprint_p(%{{.*}}: !fir.ref<!fir.array<2x!fir.type<_QFTp{a:i32}>>> {fir.bindc_name = "a"})
+ ! CHECK-LABEL: func.func private @_QFPprint_p(%{{.*}}: !fir.ref<!fir.array<2x!fir.type<_QFTp{a:i32}>>> {fir.bindc_name = "a"})
subroutine init_with_slice()
type(c) :: y(2) = [ c(11, 21), c(12, 22) ]
call print_p(y(:)%p)
print*,y(:)%p
end subroutine
- ! CHECK-LABEL: func.func @_QFPinit_with_slice()
+ ! CHECK-LABEL: func.func private @_QFPinit_with_slice()
! CHECK: %[[Y:.*]] = fir.address_of(@_QFFinit_with_sliceEy) : !fir.ref<!fir.array<2x!fir.type<_QFTc{a:i32,b:i32}>>>
! CHECK: %[[C2:.*]] = arith.constant 2 : index
! CHECK: %[[C1:.*]] = arith.constant 1 : index
@@ -78,7 +78,7 @@ contains
call print_p(y%p)
print*,y%p
end subroutine
- ! CHECK-LABEL: func.func @_QFPinit_no_slice()
+ ! CHECK-LABEL: func.func private @_QFPinit_no_slice()
! CHECK: %[[Y:.*]] = fir.address_of(@_QFFinit_no_sliceEy) : !fir.ref<!fir.array<2x!fir.type<_QFTc{a:i32,b:i32}>>>
! CHECK: %[[C2:.*]] = arith.constant 2 : index
! CHECK: %[[SHAPE:.*]] = fir.shape %[[C2]] : (index) -> !fir.shape<1>
@@ -106,7 +106,7 @@ contains
print*,y%p
end subroutine
- ! CHECK-LABEL: func.func @_QFPinit_allocatable()
+ ! CHECK-LABEL: func.func private @_QFPinit_allocatable()
! CHECK: %[[ALLOC:.*]] = fir.alloca !fir.heap<!fir.array<?x!fir.type<_QFTc{a:i32,b:i32}>>> {uniq_name = "_QFFinit_allocatableEy.addr"}
! CHECK: %[[LB0:.*]] = fir.alloca index {uniq_name = "_QFFinit_allocatableEy.lb0"}
! CHECK: %[[EXT0:.*]] = fir.alloca index {uniq_name = "_QFFinit_allocatableEy.ext0"}
@@ -139,7 +139,7 @@ contains
print*,s%p
end subroutine
- ! CHECK-LABEL: func.func @_QFPinit_scalar()
+ ! CHECK-LABEL: func.func private @_QFPinit_scalar()
! CHECK: %[[S:.*]] = fir.address_of(@_QFFinit_scalarEs) : !fir.ref<!fir.type<_QFTc{a:i32,b:i32}>>
! CHECK: %[[CAST:.*]] = fir.convert %[[S]] : (!fir.ref<!fir.type<_QFTc{a:i32,b:i32}>>) -> !fir.ref<!fir.type<_QFTp{a:i32}>>
! CHECK: fir.call @_QFPprint_scalar(%[[CAST]]) {{.*}}: (!fir.ref<!fir.type<_QFTp{a:i32}>>) -> ()
@@ -154,7 +154,7 @@ contains
print*,y%p
end subroutine
- ! CHECK-LABEL: func.func @_QFPinit_assumed(
+ ! CHECK-LABEL: func.func private @_QFPinit_assumed(
! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x!fir.type<_QFTc{a:i32,b:i32}>>
! CHECK: %[[BOX:.*]] = fir.rebox %[[ARG0]] : (!fir.box<!fir.array<?x!fir.type<_QFTc{a:i32,b:i32}>>>) -> !fir.box<!fir.array<?x!fir.type<_QFTp{a:i32}>>>
@@ -167,7 +167,7 @@ contains
call print_p(y%c%p)
end subroutine
- ! CHECK-LABEL: func.func @_QFPinit_existing_field
+ ! CHECK-LABEL: func.func private @_QFPinit_existing_field
! CHECK: %[[C2:.*]] = arith.constant 2 : index
! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<2x!fir.type<_QFTz{k:i32,c:!fir.type<_QFTc{a:i32,b:i32}>}>> {bindc_name = "y", uniq_name = "_QFFinit_existing_fieldEy"}
! CHECK: %[[FIELD_C:.*]] = fir.field_index c, !fir.type<_QFTz{k:i32,c:!fir.type<_QFTc{a:i32,b:i32}>}>
@@ -183,7 +183,7 @@ contains
a%p = B
end subroutine
-! CHECK-LABEL: func.func @_QFPparent_comp_lhs()
+! CHECK-LABEL: func.func private @_QFPparent_comp_lhs()
! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.type<_QFTp{a:i32}>>
! CHECK: %[[A:.*]] = fir.alloca !fir.type<_QFTc{a:i32,b:i32}> {bindc_name = "a", uniq_name = "_QFFparent_comp_lhsEa"}
! CHECK: %[[B:.*]] = fir.alloca !fir.type<_QFTp{a:i32}> {bindc_name = "b", uniq_name = "_QFFparent_comp_lhsEb"}
diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90
index a813eff690b7..15d8a86e4ef4 100644
--- a/flang/test/Lower/polymorphic.f90
+++ b/flang/test/Lower/polymorphic.f90
@@ -519,8 +519,8 @@ module polymorphic_test
end subroutine
end subroutine
-! CHECK-LABEL: func.func @_QMpolymorphic_testFhost_assocPinternal(
-! CHECK-SAME: %[[TUPLE:.*]]: !fir.ref<tuple<!fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func.func private @_QMpolymorphic_testFhost_assocPinternal(
+! CHECK-SAME: %[[TUPLE:.*]]: !fir.ref<tuple<!fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
! CHECK: %[[POS_IN_TUPLE:.*]] = arith.constant 0 : i32
! CHECK: %[[COORD_OF_CLASS:.*]] = fir.coordinate_of %[[TUPLE]], %[[POS_IN_TUPLE]] : (!fir.ref<tuple<!fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>>>, i32) -> !fir.ref<!fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>>
! CHECK: %[[CLASS:.*]] = fir.load %[[COORD_OF_CLASS]] : !fir.ref<!fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>>
diff --git a/flang/test/Lower/program-units-fir-mangling.f90 b/flang/test/Lower/program-units-fir-mangling.f90
index 36631979141a..002343c45f6e 100644
--- a/flang/test/Lower/program-units-fir-mangling.f90
+++ b/flang/test/Lower/program-units-fir-mangling.f90
@@ -44,12 +44,12 @@ end module
function foo2()
real(4) :: foo2
contains
- ! CHECK-LABEL: func @_QFfoo2Psub() {
+ ! CHECK-LABEL: func private @_QFfoo2Psub() {{.*}} {
subroutine sub()
! CHECK: }
end subroutine
- ! CHECK-LABEL: func @_QFfoo2Pfoo() {
+ ! CHECK-LABEL: func private @_QFfoo2Pfoo() {{.*}} {
subroutine foo()
! CHECK: }
end subroutine
@@ -58,12 +58,12 @@ end function
! CHECK-LABEL: func @_QPsub2()
subroutine sUb2()
contains
- ! CHECK-LABEL: func @_QFsub2Psub() {
+ ! CHECK-LABEL: func private @_QFsub2Psub() {{.*}} {
subroutine sub()
! CHECK: }
end subroutine
- ! CHECK-LABEL: func @_QFsub2Pfoo() {
+ ! CHECK-LABEL: func private @_QFsub2Pfoo() {{.*}} {
subroutine Foo()
! CHECK: }
end subroutine
@@ -74,7 +74,7 @@ contains
! CHECK-LABEL: func @_QMtestmod2Psub()
subroutine sub()
contains
- ! CHECK-LABEL: func @_QMtestmod2FsubPsubsub() {
+ ! CHECK-LABEL: func private @_QMtestmod2FsubPsubsub() {{.*}} {
subroutine subSub()
! CHECK: }
end subroutine
@@ -105,7 +105,7 @@ contains
! CHECK-LABEL: func @_QMcolor_pointsScolor_points_aSimplPfoo()
subroutine foo
contains
- ! CHECK-LABEL: func @_QMcolor_pointsScolor_points_aSimplFfooPbar() {
+ ! CHECK-LABEL: func private @_QMcolor_pointsScolor_points_aSimplFfooPbar() {{.*}} {
subroutine bar
! CHECK: }
end subroutine
@@ -128,7 +128,7 @@ end subroutine
program test
! CHECK: }
contains
-! CHECK-LABEL: func @_QFPshould_not_collide() {
+! CHECK-LABEL: func private @_QFPshould_not_collide() {{.*}} {
subroutine should_not_collide()
! CHECK: }
end subroutine
@@ -226,7 +226,7 @@ subroutine nest1
! CHECK: fir.call @_QFnest1Pinner()
call inner
contains
- ! CHECK-LABEL: func @_QFnest1Pinner
+ ! CHECK-LABEL: func private @_QFnest1Pinner
subroutine inner
! CHECK: %[[V_0:[0-9]+]] = fir.address_of(@_QFnest1FinnerEkk) : !fir.ref<i32>
integer, save :: kk = 1
@@ -239,7 +239,7 @@ subroutine nest2
! CHECK: fir.call @_QFnest2Pinner()
call inner
contains
- ! CHECK-LABEL: func @_QFnest2Pinner
+ ! CHECK-LABEL: func private @_QFnest2Pinner
subroutine inner
! CHECK: %[[V_0:[0-9]+]] = fir.address_of(@_QFnest2FinnerEkk) : !fir.ref<i32>
integer, save :: kk = 77
diff --git a/flang/test/lit.cfg.py b/flang/test/lit.cfg.py
index 9b8bb83ca23c..65467f2d23f4 100644
--- a/flang/test/lit.cfg.py
+++ b/flang/test/lit.cfg.py
@@ -199,3 +199,17 @@ else:
result = lit_config.params.get("LIBPGMATH")
if result:
config.environment["LIBPGMATH"] = True
+
+# Add features and substitutions to test F128 math support.
+# %f128-lib substitution may be used to generate check prefixes
+# for LIT tests checking for F128 library support.
+if config.flang_runtime_f128_math_lib:
+ config.available_features.add("flang-supports-f128-math")
+ config.available_features.add(
+ "flang-f128-math-lib-" + config.flang_runtime_f128_math_lib
+ )
+ config.substitutions.append(
+ ("%f128-lib", config.flang_runtime_f128_math_lib.upper())
+ )
+else:
+ config.substitutions.append(("%f128-lib", "NONE"))
diff --git a/flang/test/lit.site.cfg.py.in b/flang/test/lit.site.cfg.py.in
index 16411d82908c..e66f1af79452 100644
--- a/flang/test/lit.site.cfg.py.in
+++ b/flang/test/lit.site.cfg.py.in
@@ -25,6 +25,7 @@ config.cc = "@CMAKE_C_COMPILER@"
config.osx_sysroot = path(r"@CMAKE_OSX_SYSROOT@")
config.targets_to_build = "@TARGETS_TO_BUILD@"
config.default_sysroot = "@DEFAULT_SYSROOT@"
+config.flang_runtime_f128_math_lib = "@FLANG_RUNTIME_F128_MATH_LIB@"
import lit.llvm
lit.llvm.initialize(lit_config, config)
diff --git a/libc/benchmarks/automemcpy/lib/CMakeLists.txt b/libc/benchmarks/automemcpy/lib/CMakeLists.txt
index 0c7d399d4023..bb6a5631f2c3 100644
--- a/libc/benchmarks/automemcpy/lib/CMakeLists.txt
+++ b/libc/benchmarks/automemcpy/lib/CMakeLists.txt
@@ -18,7 +18,8 @@ add_custom_command(
add_library(automemcpy_implementations "${Implementations}")
target_link_libraries(automemcpy_implementations PUBLIC LLVMSupport libc-memory-benchmark)
-target_include_directories(automemcpy_implementations PRIVATE ${LIBC_SOURCE_DIR} ${LIBC_AUTOMEMCPY_INCLUDE_DIR})
+target_include_directories(automemcpy_implementations PRIVATE
+ ${LIBC_SOURCE_DIR} ${LIBC_SOURCE_DIR}/include ${LIBC_AUTOMEMCPY_INCLUDE_DIR})
target_compile_options(automemcpy_implementations PRIVATE ${LIBC_COMPILE_OPTIONS_NATIVE} "SHELL:-mllvm -combiner-global-alias-analysis" -fno-builtin)
llvm_update_compile_flags(automemcpy_implementations)
diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index c7ccd392354c..72b04822d8b8 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -87,6 +87,7 @@ function(_get_common_compile_options output_var flags)
list(APPEND compile_options "-fvisibility=hidden")
list(APPEND compile_options "-fconvergent-functions")
list(APPEND compile_options "-flto")
+ list(APPEND compile_options "-Wno-multi-gpu")
if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
list(APPEND compile_options "-Wno-unknown-cuda-version")
diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake
index 8a84c82206ba..5469799f0239 100644
--- a/libc/cmake/modules/LLVMLibCObjectRules.cmake
+++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake
@@ -59,6 +59,7 @@ function(create_object_library fq_target_name)
)
target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+ target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
target_compile_options(${fq_target_name} PRIVATE ${compile_options})
# The NVPTX target is installed as LLVM-IR but the internal testing toolchain
@@ -73,6 +74,7 @@ function(create_object_library fq_target_name)
)
target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+ target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
target_compile_options(${internal_target_name} PRIVATE ${compile_options}
-fno-lto -march=${LIBC_GPU_TARGET_ARCHITECTURE})
endif()
@@ -279,6 +281,7 @@ function(create_entrypoint_object fq_target_name)
target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options})
target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+ target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
add_dependencies(${internal_target_name} ${full_deps_list})
target_link_libraries(${internal_target_name} ${full_deps_list})
@@ -300,6 +303,7 @@ function(create_entrypoint_object fq_target_name)
target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLIBC_COPT_PUBLIC_PACKAGING)
target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+ target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
add_dependencies(${fq_target_name} ${full_deps_list})
target_link_libraries(${fq_target_name} ${full_deps_list})
@@ -307,7 +311,7 @@ function(create_entrypoint_object fq_target_name)
${fq_target_name}
PROPERTIES
ENTRYPOINT_NAME ${ADD_ENTRYPOINT_OBJ_NAME}
- TARGET_TYPE ${ENTRYPOINT_OBJ_TARGET_TYPE}
+ TARGET_TYPE ${entrypoint_target_type}
OBJECT_FILE "$<TARGET_OBJECTS:${fq_target_name}>"
CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
DEPS "${fq_deps_list}"
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index 76ce6754bd73..5981d427b71f 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -184,6 +184,7 @@ function(create_libc_unittest fq_target_name)
)
target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+ target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
target_compile_options(${fq_build_target_name} PRIVATE ${compile_options})
if(NOT LIBC_UNITTEST_CXX_STANDARD)
@@ -317,6 +318,7 @@ function(add_libc_fuzzer target_name)
)
target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+ target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
target_link_libraries(${fq_target_name} PRIVATE
${link_object_files}
@@ -457,13 +459,14 @@ function(add_integration_test test_name)
PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+ target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
_get_hermetic_test_compile_options(compile_options "${INTEGRATION_TEST_COMPILE_OPTIONS}")
target_compile_options(${fq_build_target_name} PRIVATE ${compile_options})
if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
target_link_options(${fq_build_target_name} PRIVATE
- ${LIBC_COMPILE_OPTIONS_DEFAULT}
+ ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
-mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
"-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
"-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
@@ -471,7 +474,7 @@ function(add_integration_test test_name)
# We need to use the internal object versions for NVPTX.
set(internal_suffix ".__internal__")
target_link_options(${fq_build_target_name} PRIVATE
- ${LIBC_COMPILE_OPTIONS_DEFAULT}
+ ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
"-Wl,--suppress-stack-size-warning"
-march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
"--cuda-path=${LIBC_CUDA_ROOT}")
@@ -632,6 +635,7 @@ function(add_libc_hermetic_test test_name)
_get_hermetic_test_compile_options(compile_options "${HERMETIC_TEST_COMPILE_OPTIONS}")
target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+ target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
_get_hermetic_test_compile_options(compile_options "${HERMETIC_TEST_COMPILE_OPTIONS}")
target_compile_options(${fq_build_target_name} PRIVATE ${compile_options})
@@ -647,14 +651,14 @@ function(add_libc_hermetic_test test_name)
if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
target_link_options(${fq_build_target_name} PRIVATE
${LIBC_COMPILE_OPTIONS_DEFAULT}
- -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
+ -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto -Wno-multi-gpu
"-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
"-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
# We need to use the internal object versions for NVPTX.
set(internal_suffix ".__internal__")
target_link_options(${fq_build_target_name} PRIVATE
- ${LIBC_COMPILE_OPTIONS_DEFAULT}
+ ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
"-Wl,--suppress-stack-size-warning"
-march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
"--cuda-path=${LIBC_CUDA_ROOT}")
diff --git a/libc/cmake/modules/compiler_features/check_fixed_point.cpp b/libc/cmake/modules/compiler_features/check_fixed_point.cpp
index a5192697d43f..9199340fe652 100644
--- a/libc/cmake/modules/compiler_features/check_fixed_point.cpp
+++ b/libc/cmake/modules/compiler_features/check_fixed_point.cpp
@@ -1,4 +1,4 @@
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
#ifndef LIBC_COMPILER_HAS_FIXED_POINT
#error unsupported
diff --git a/libc/config/config.json b/libc/config/config.json
index 6a208cc55661..b73c47b1a14b 100644
--- a/libc/config/config.json
+++ b/libc/config/config.json
@@ -15,6 +15,10 @@
"LIBC_CONF_PRINTF_FLOAT_TO_STR_USE_MEGA_LONG_DOUBLE_TABLE": {
"value": false,
"doc": "Use large table for better printf long double performance."
+ },
+ "LIBC_CONF_PRINTF_DISABLE_FIXED_POINT": {
+ "value": false,
+ "doc": "Disable printing fixed point values in printf and friends."
}
},
"string": {
diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index 5224e92bbcc5..fca5315fc4f0 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -239,6 +239,8 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.ilogbf
libc.src.math.ldexp
libc.src.math.ldexpf
+ libc.src.math.llogb
+ libc.src.math.llogbf
libc.src.math.llrint
libc.src.math.llrintf
libc.src.math.llround
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 8a6c160c0993..a6dc74101dbc 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -341,6 +341,9 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.ilogb
libc.src.math.ilogbf
libc.src.math.ilogbl
+ libc.src.math.llogb
+ libc.src.math.llogbf
+ libc.src.math.llogbl
libc.src.math.llrint
libc.src.math.llrintf
libc.src.math.llrintl
@@ -422,7 +425,10 @@ if(LIBC_COMPILER_HAS_FLOAT128)
libc.src.math.fmaxf128
libc.src.math.fminf128
libc.src.math.frexpf128
+ libc.src.math.ilogbf128
libc.src.math.ldexpf128
+ libc.src.math.llogbf128
+ libc.src.math.logbf128
libc.src.math.roundf128
libc.src.math.sqrtf128
libc.src.math.truncf128
diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt
index 7df190490888..2fca96c5601b 100644
--- a/libc/config/linux/arm/entrypoints.txt
+++ b/libc/config/linux/arm/entrypoints.txt
@@ -217,6 +217,9 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.ldexp
libc.src.math.ldexpf
libc.src.math.ldexpl
+ libc.src.math.llogb
+ libc.src.math.llogbf
+ libc.src.math.llogbl
libc.src.math.llrint
libc.src.math.llrintf
libc.src.math.llrintl
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 5c8cc7618a9e..fc4d8828f4c6 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -353,6 +353,9 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.ldexp
libc.src.math.ldexpf
libc.src.math.ldexpl
+ libc.src.math.llogb
+ libc.src.math.llogbf
+ libc.src.math.llogbl
libc.src.math.llrint
libc.src.math.llrintf
libc.src.math.llrintl
@@ -431,7 +434,10 @@ if(LIBC_COMPILER_HAS_FLOAT128)
libc.src.math.fmaxf128
libc.src.math.fminf128
libc.src.math.frexpf128
+ libc.src.math.ilogbf128
libc.src.math.ldexpf128
+ libc.src.math.llogbf128
+ libc.src.math.logbf128
libc.src.math.roundf128
libc.src.math.sqrtf128
libc.src.math.truncf128
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index beb7a5ed448d..27c9a42934c2 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -132,6 +132,21 @@ set(TARGET_LIBC_ENTRYPOINTS
libc.src.stdbit.stdc_first_trailing_one_ui
libc.src.stdbit.stdc_first_trailing_one_ul
libc.src.stdbit.stdc_first_trailing_one_ull
+ libc.src.stdbit.stdc_count_zeros_uc
+ libc.src.stdbit.stdc_count_zeros_us
+ libc.src.stdbit.stdc_count_zeros_ui
+ libc.src.stdbit.stdc_count_zeros_ul
+ libc.src.stdbit.stdc_count_zeros_ull
+ libc.src.stdbit.stdc_count_ones_uc
+ libc.src.stdbit.stdc_count_ones_us
+ libc.src.stdbit.stdc_count_ones_ui
+ libc.src.stdbit.stdc_count_ones_ul
+ libc.src.stdbit.stdc_count_ones_ull
+ libc.src.stdbit.stdc_has_single_bit_uc
+ libc.src.stdbit.stdc_has_single_bit_us
+ libc.src.stdbit.stdc_has_single_bit_ui
+ libc.src.stdbit.stdc_has_single_bit_ul
+ libc.src.stdbit.stdc_has_single_bit_ull
# stdlib.h entrypoints
libc.src.stdlib.abs
@@ -355,6 +370,9 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.ldexp
libc.src.math.ldexpf
libc.src.math.ldexpl
+ libc.src.math.llogb
+ libc.src.math.llogbf
+ libc.src.math.llogbl
libc.src.math.llrint
libc.src.math.llrintf
libc.src.math.llrintl
@@ -435,7 +453,10 @@ if(LIBC_COMPILER_HAS_FLOAT128)
libc.src.math.fmaxf128
libc.src.math.fminf128
libc.src.math.frexpf128
+ libc.src.math.ilogbf128
libc.src.math.ldexpf128
+ libc.src.math.llogbf128
+ libc.src.math.logbf128
libc.src.math.roundf128
libc.src.math.sqrtf128
libc.src.math.truncf128
@@ -463,6 +484,12 @@ if(LIBC_COMPILER_HAS_FIXED_POINT)
libc.src.stdfix.roundur
libc.src.stdfix.roundulk
libc.src.stdfix.roundulr
+ libc.src.stdfix.sqrtuhk
+ libc.src.stdfix.sqrtuhr
+ libc.src.stdfix.sqrtuk
+ libc.src.stdfix.sqrtur
+ # libc.src.stdfix.sqrtulk
+ libc.src.stdfix.sqrtulr
)
endif()
diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst
index 9997dde6cf89..a177550647bd 100644
--- a/libc/docs/configure.rst
+++ b/libc/docs/configure.rst
@@ -26,6 +26,7 @@ overrides in ``config/<platform>/config.json`` and ``config/<platform>/<arch>/co
to learn about the defaults for your platform and target.
* **"printf" options**
+ - ``LIBC_CONF_PRINTF_DISABLE_FIXED_POINT``: Disable printing fixed point values in printf and friends.
- ``LIBC_CONF_PRINTF_DISABLE_FLOAT``: Disable printing floating point values in printf and friends.
- ``LIBC_CONF_PRINTF_DISABLE_INDEX_MODE``: Disable index mode in the printf format string.
- ``LIBC_CONF_PRINTF_DISABLE_WRITE_INT``: Disable handling of %n in printf format string.
diff --git a/libc/docs/dev/printf_behavior.rst b/libc/docs/dev/printf_behavior.rst
index 7128c738d192..00d6c83f4b0d 100644
--- a/libc/docs/dev/printf_behavior.rst
+++ b/libc/docs/dev/printf_behavior.rst
@@ -62,6 +62,13 @@ When set, this flag disables support for floating point numbers and all their
conversions (%a, %f, %e, %g); any floating point number conversion will be
treated as invalid. This reduces code size.
+LIBC_COPT_PRINTF_DISABLE_FIXED_POINT
+------------------------------------
+When set, this flag disables support for fixed point numbers and all their
+conversions (%r, %k); any fixed point number conversion will be treated as
+invalid. This reduces code size. This has no effect if the current compiler does
+not support fixed point numbers.
+
LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS
----------------------------------
When set, this flag disables the nullptr checks in %n and %s.
@@ -191,3 +198,8 @@ original conversion.
The %p conversion will display a null pointer as if it was the string
"(nullptr)" passed to a "%s" conversion, with all other options remaining the
same as the original conversion.
+
+The %r, %R, %k, and %K fixed point number format specifiers are accepted as
+defined in ISO/IEC TR 18037 (the fixed point number extension). These are
+available when the compiler is detected as having support for fixed point
+numbers and the LIBC_COPT_PRINTF_DISABLE_FIXED_POINT flag is not set.
diff --git a/libc/docs/dev/undefined_behavior.rst b/libc/docs/dev/undefined_behavior.rst
index 0cb25c7f2a23..6e73a305e8e0 100644
--- a/libc/docs/dev/undefined_behavior.rst
+++ b/libc/docs/dev/undefined_behavior.rst
@@ -20,6 +20,7 @@ guidelines and the resulting code should behave predictably even in unexpected
situations.
#. Follow the standards.
+ #. If there is no standard, first ask yourself if this implementation is necessary (are there users who need this functionality?). If it truly is, then match existing implementations. Creating competing designs just causes confusion (see the history of qsort_r).
#. Avoid giving an incorrect answer.
#. In general, correct answer > correct answer (wrong format) > no answer > crash the program >>>>>>> incorrect answer.
#. The C library is called frequently in performance critical situations, and so can't afford to do thorough error checking and correction.
@@ -61,7 +62,7 @@ Often the standard will imply an intended behavior through what it states is und
Ignoring Bug-For-Bug Compatibility
----------------------------------
-Any long running implementations will have bugs and deviations from the standard. Hyrum's Law states that “all observable behaviors of your system will be depended on by somebody” which includes these bugs. An example of a long-standing bug is glibc's scanf float parsing behavior. The behavior is specifically defined in the standard, but it isn't adhered to by all libc implementations. There is a longstanding bug in glibc where it incorrectly parses the string 100er and this caused the C standard to add that specific example to the definition for scanf. The intended behavior is for scanf, when parsing a float, to parse the longest possibly valid prefix and then accept it if and only if that complete parsed value is a float. In the case of 100er the longest possibly valid prefix is 100e but the float parsed from that string is only 100. Since there is no number after the e it shouldn't be included in the float, so scanf should return a parsing error. For LLVM's libc it was decided to follow the standard, even though glibc's version is slightly simpler to implement and this edge case is rare. Following the standard must be the first priority, since that's the goal of the library.
+Any long running implementations will have bugs and deviations from the standard. Hyrum's Law states that “all observable behaviors of your system will be depended on by somebody” which includes these bugs. An example of a long-standing bug is glibc's scanf float parsing behavior. The behavior is specifically defined in the standard, but it isn't adhered to by all libc implementations. There is a longstanding bug in glibc where it incorrectly parses the string 100er and this caused the C standard to add that specific example to the definition for scanf. The intended behavior is for scanf, when parsing a float, to parse the longest possibly valid prefix and then accept it if and only if that complete parsed value is a float. In the case of 100er the longest possibly valid prefix is 100e but the float parsed from that string is only 100. Since there is no number after the e it shouldn't be included in the float, so scanf should return a parsing error. For LLVM's libc it was decided to follow the standard, even though glibc's version is slightly simpler to implement and this edge case is rare. Following the standard must be the first priority, since that's the goal of the library. If there is no standard, then matching another implementation (even bug-for-bug) may be necessary, but before you implement an unstandardized function first consider if anyone will actually use it at all.
Design Decisions
================
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index 8d584338c10f..80d12718edcc 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -185,6 +185,8 @@ Basic Operations
+--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
| ilogbl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | |
+--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| ilogbf128 | |check| | |check| | | |check| | | | | | | | | |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
| ldexp | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | |
+--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
| ldexpf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | |
@@ -193,6 +195,14 @@ Basic Operations
+--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
| ldexpf128 | |check| | |check| | | |check| | | | | | | | | |
+--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| llogb | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| llogbf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| llogbl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| llogbf128 | |check| | |check| | | |check| | | | | | | | | |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
| llrint | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | |
+--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
| llrintf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | |
@@ -211,6 +221,8 @@ Basic Operations
+--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
| logbl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | |
+--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| logbf128 | |check| | |check| | | |check| | | | | | | | | |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
| lrint | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | |
+--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
| lrintf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | |
diff --git a/libc/docs/math/stdfix.rst b/libc/docs/math/stdfix.rst
index 080066e53bd2..79f499e61f12 100644
--- a/libc/docs/math/stdfix.rst
+++ b/libc/docs/math/stdfix.rst
@@ -78,7 +78,7 @@ Fixed-point Arithmetics
+---------------+----------------+-------------+---------------+------------+----------------+-------------+----------------+-------------+---------------+------------+----------------+-------------+
| round | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| |
+---------------+----------------+-------------+---------------+------------+----------------+-------------+----------------+-------------+---------------+------------+----------------+-------------+
-| sqrt | | | | | | | | | | | | |
+| sqrt | |check| | | |check| | | |check| | | |check| | | |check| | | | |
+---------------+----------------+-------------+---------------+------------+----------------+-------------+----------------+-------------+---------------+------------+----------------+-------------+
================== =========
diff --git a/libc/docs/stdbit.rst b/libc/docs/stdbit.rst
index 3bd83ff70c89..b579e9dbbc2f 100644
--- a/libc/docs/stdbit.rst
+++ b/libc/docs/stdbit.rst
@@ -71,21 +71,21 @@ stdc_first_trailing_one_us |check|
stdc_first_trailing_one_ui |check|
stdc_first_trailing_one_ul |check|
stdc_first_trailing_one_ull |check|
-stdc_count_zeros_uc
-stdc_count_zeros_us
-stdc_count_zeros_ui
-stdc_count_zeros_ul
-stdc_count_zeros_ull
-stdc_count_ones_uc
-stdc_count_ones_us
-stdc_count_ones_ui
-stdc_count_ones_ul
-stdc_count_ones_ull
-stdc_has_single_bit_uc
-stdc_has_single_bit_us
-stdc_has_single_bit_ui
-stdc_has_single_bit_ul
-stdc_has_single_bit_ull
+stdc_count_zeros_uc |check|
+stdc_count_zeros_us |check|
+stdc_count_zeros_ui |check|
+stdc_count_zeros_ul |check|
+stdc_count_zeros_ull |check|
+stdc_count_ones_uc |check|
+stdc_count_ones_us |check|
+stdc_count_ones_ui |check|
+stdc_count_ones_ul |check|
+stdc_count_ones_ull |check|
+stdc_has_single_bit_uc |check|
+stdc_has_single_bit_us |check|
+stdc_has_single_bit_ui |check|
+stdc_has_single_bit_ul |check|
+stdc_has_single_bit_ull |check|
stdc_bit_width_uc
stdc_bit_width_us
stdc_bit_width_ui
@@ -122,9 +122,9 @@ stdc_first_leading_zero |check|
stdc_first_leading_one |check|
stdc_first_trailing_zero |check|
stdc_first_trailing_one |check|
-stdc_count_zeros
-stdc_count_ones
-stdc_has_single_bit
+stdc_count_zeros |check|
+stdc_count_ones |check|
+stdc_has_single_bit |check|
stdc_bit_width
stdc_bit_floor
stdc_bit_ceil
diff --git a/libc/fuzzing/stdio/CMakeLists.txt b/libc/fuzzing/stdio/CMakeLists.txt
index 22de67d42747..8f89baa70200 100644
--- a/libc/fuzzing/stdio/CMakeLists.txt
+++ b/libc/fuzzing/stdio/CMakeLists.txt
@@ -15,3 +15,17 @@ add_libc_fuzzer(
libc.src.stdio.snprintf
libc.src.__support.FPUtil.fp_bits
)
+
+if(LIBC_COMPILER_HAS_FIXED_POINT)
+ add_libc_fuzzer(
+ printf_fixed_conv_fuzz
+ NEED_MPFR
+ SRCS
+ printf_fixed_conv_fuzz.cpp
+ DEPENDS
+ libc.src.stdio.snprintf
+ libc.src.__support.fixed_point.fx_bits
+ COMPILE_OPTIONS
+ -ffixed-point # TODO: add -ffixed-point to fuzz tests automatically
+ )
+endif()
diff --git a/libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp b/libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp
new file mode 100644
index 000000000000..c385c3a8f3e4
--- /dev/null
+++ b/libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp
@@ -0,0 +1,133 @@
+//===-- printf_fixed_conv_fuzz.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Fuzzing test for llvm-libc printf fixed point conversion (%k/%K) implementations.
+///
+//===----------------------------------------------------------------------===//
+#include "src/stdio/snprintf.h"
+
+#include "llvm-libc-macros/stdfix-macros.h"
+#include "src/__support/fixed_point/fx_bits.h"
+#include "src/__support/fixed_point/fx_rep.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "utils/MPFRWrapper/mpfr_inc.h"
+
+constexpr int MAX_SIZE = 10000;
+
+inline bool simple_streq(char *first, char *second, int length) {
+ for (int i = 0; i < length; ++i)
+ if (first[i] != second[i])
+ return false;
+
+ return true;
+}
+
+inline int clamp(int num, int max) {
+ if (num > max)
+ return max;
+ if (num < -max)
+ return -max;
+ return num;
+}
+
+enum class TestResult {
+ Success,
+ BufferSizeFailed,
+ LengthsDiffer,
+ StringsNotEqual,
+};
+
+template <typename F>
+inline TestResult test_vals(const char *fmt, uint64_t num, int prec,
+ int width) {
+ typename LIBC_NAMESPACE::fixed_point::FXRep<F>::StorageType raw_num = num;
+
+ auto raw_num_bits = LIBC_NAMESPACE::fixed_point::FXBits<F>(raw_num);
+
+ // This needs to be a float with enough bits of precision to hold the fixed
+ // point number.
+ static_assert(sizeof(long double) > sizeof(long accum));
+
+ // build a long double that is equivalent to the fixed point number.
+ long double ld_num =
+ static_cast<long double>(raw_num_bits.get_integral()) +
+ (static_cast<long double>(raw_num_bits.get_fraction()) /
+ static_cast<long double>(1ll << raw_num_bits.get_exponent()));
+
+ if (raw_num_bits.get_sign())
+ ld_num = -ld_num;
+
+ // Call snprintf on a nullptr to get the buffer size.
+ int buffer_size = LIBC_NAMESPACE::snprintf(nullptr, 0, fmt, width, prec, num);
+
+ if (buffer_size < 0)
+ return TestResult::BufferSizeFailed;
+
+ char *test_buff = new char[buffer_size + 1];
+ char *reference_buff = new char[buffer_size + 1];
+
+ int test_result = 0;
+ int reference_result = 0;
+
+ test_result = LIBC_NAMESPACE::snprintf(test_buff, buffer_size + 1, fmt, width,
+ prec, num);
+
+ // The fixed point format is defined to be %f equivalent.
+ reference_result = mpfr_snprintf(reference_buff, buffer_size + 1, "%*.*Lf",
+ width, prec, ld_num);
+
+ // All of these calls should return that they wrote the same amount.
+ if (test_result != reference_result || test_result != buffer_size)
+ return TestResult::LengthsDiffer;
+
+ if (!simple_streq(test_buff, reference_buff, buffer_size))
+ return TestResult::StringsNotEqual;
+
+ delete[] test_buff;
+ delete[] reference_buff;
+ return TestResult::Success;
+}
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+ // const uint8_t raw_data[] = {0x8d,0x43,0x40,0x0,0x0,0x0,};
+ // data = raw_data;
+ // size = sizeof(raw_data);
+ int prec = 0;
+ int width = 0;
+
+ LIBC_NAMESPACE::fixed_point::FXRep<long accum>::StorageType raw_num = 0;
+
+ // Copy as many bytes of data as will fit into num, prec, and width. Any extras
+ // are ignored.
+ for (size_t cur = 0; cur < size; ++cur) {
+ if (cur < sizeof(raw_num)) {
+ raw_num = (raw_num << 8) + data[cur];
+ } else if (cur < sizeof(raw_num) + sizeof(prec)) {
+ prec = (prec << 8) + data[cur];
+ } else if (cur < sizeof(raw_num) + sizeof(prec) + sizeof(width)) {
+ width = (width << 8) + data[cur];
+ }
+ }
+
+ width = clamp(width, MAX_SIZE);
+ prec = clamp(prec, MAX_SIZE);
+
+ TestResult result;
+ result = test_vals<long accum>("%*.*lk", raw_num, prec, width);
+ if (result != TestResult::Success)
+ __builtin_trap();
+
+ result = test_vals<unsigned long accum>("%*.*lK", raw_num, prec, width);
+ if (result != TestResult::Success)
+ __builtin_trap();
+
+ return 0;
+}
diff --git a/libc/include/llvm-libc-macros/CMakeLists.txt b/libc/include/llvm-libc-macros/CMakeLists.txt
index 6e0875829127..157b786aa7e8 100644
--- a/libc/include/llvm-libc-macros/CMakeLists.txt
+++ b/libc/include/llvm-libc-macros/CMakeLists.txt
@@ -83,6 +83,8 @@ add_macro_header(
math_macros
HDR
math-macros.h
+ DEPENDS
+ .limits_macros
)
add_macro_header(
diff --git a/libc/include/llvm-libc-macros/math-macros.h b/libc/include/llvm-libc-macros/math-macros.h
index 136670e665e6..9f8edd954b7e 100644
--- a/libc/include/llvm-libc-macros/math-macros.h
+++ b/libc/include/llvm-libc-macros/math-macros.h
@@ -9,6 +9,8 @@
#ifndef __LLVM_LIBC_MACROS_MATH_MACROS_H
#define __LLVM_LIBC_MACROS_MATH_MACROS_H
+#include "limits-macros.h"
+
#define MATH_ERRNO 1
#define MATH_ERREXCEPT 2
@@ -16,8 +18,11 @@
#define INFINITY __builtin_inf()
#define NAN __builtin_nanf("")
-#define FP_ILOGB0 (-__INT_MAX__ - 1)
-#define FP_ILOGBNAN __INT_MAX__
+#define FP_ILOGB0 (-INT_MAX - 1)
+#define FP_ILOGBNAN INT_MAX
+
+#define FP_LLOGB0 (-LONG_MAX - 1)
+#define FP_LLOGBNAN LONG_MAX
#define isfinite(x) __builtin_isfinite(x)
#define isinf(x) __builtin_isinf(x)
@@ -25,8 +30,10 @@
#ifdef __FAST_MATH__
#define math_errhandling 0
-#elif defined __NO_MATH_ERRNO__
+#elif defined(__NO_MATH_ERRNO__)
#define math_errhandling (MATH_ERREXCEPT)
+#elif defined(__NVPTX__) || defined(__AMDGPU__)
+#define math_errhandling (MATH_ERRNO)
#else
#define math_errhandling (MATH_ERRNO | MATH_ERREXCEPT)
#endif
diff --git a/libc/include/llvm-libc-macros/stdbit-macros.h b/libc/include/llvm-libc-macros/stdbit-macros.h
index 0c97da96ebba..e3a36d10ed92 100644
--- a/libc/include/llvm-libc-macros/stdbit-macros.h
+++ b/libc/include/llvm-libc-macros/stdbit-macros.h
@@ -131,6 +131,47 @@ inline unsigned stdc_first_trailing_one(unsigned long x) {
inline unsigned stdc_first_trailing_one(unsigned long long x) {
return stdc_first_trailing_one_ull(x);
}
+inline unsigned stdc_count_zeros(unsigned char x) {
+ return stdc_count_zeros_uc(x);
+}
+inline unsigned stdc_count_zeros(unsigned short x) {
+ return stdc_count_zeros_us(x);
+}
+inline unsigned stdc_count_zeros(unsigned x) { return stdc_count_zeros_ui(x); }
+inline unsigned stdc_count_zeros(unsigned long x) {
+ return stdc_count_zeros_ul(x);
+}
+inline unsigned stdc_count_zeros(unsigned long long x) {
+ return stdc_count_zeros_ull(x);
+}
+inline unsigned stdc_count_ones(unsigned char x) {
+ return stdc_count_ones_uc(x);
+}
+inline unsigned stdc_count_ones(unsigned short x) {
+ return stdc_count_ones_us(x);
+}
+inline unsigned stdc_count_ones(unsigned x) { return stdc_count_ones_ui(x); }
+inline unsigned stdc_count_ones(unsigned long x) {
+ return stdc_count_ones_ul(x);
+}
+inline unsigned stdc_count_ones(unsigned long long x) {
+ return stdc_count_ones_ull(x);
+}
+inline bool stdc_has_single_bit(unsigned char x) {
+ return stdc_has_single_bit_uc(x);
+}
+inline bool stdc_has_single_bit(unsigned short x) {
+ return stdc_has_single_bit_us(x);
+}
+inline bool stdc_has_single_bit(unsigned x) {
+ return stdc_has_single_bit_ui(x);
+}
+inline bool stdc_has_single_bit(unsigned long x) {
+ return stdc_has_single_bit_ul(x);
+}
+inline bool stdc_has_single_bit(unsigned long long x) {
+ return stdc_has_single_bit_ull(x);
+}
#else
#define stdc_leading_zeros(x) \
_Generic((x), \
@@ -188,6 +229,27 @@ inline unsigned stdc_first_trailing_one(unsigned long long x) {
unsigned: stdc_first_trailing_one_ui, \
unsigned long: stdc_first_trailing_one_ul, \
unsigned long long: stdc_first_trailing_one_ull)(x)
+#define stdc_count_zeros(x) \
+ _Generic((x), \
+ unsigned char: stdc_count_zeros_uc, \
+ unsigned short: stdc_count_zeros_us, \
+ unsigned: stdc_count_zeros_ui, \
+ unsigned long: stdc_count_zeros_ul, \
+ unsigned long long: stdc_count_zeros_ull)(x)
+#define stdc_count_ones(x) \
+ _Generic((x), \
+ unsigned char: stdc_count_ones_uc, \
+ unsigned short: stdc_count_ones_us, \
+ unsigned: stdc_count_ones_ui, \
+ unsigned long: stdc_count_ones_ul, \
+ unsigned long long: stdc_count_ones_ull)(x)
+#define stdc_has_single_bit(x) \
+ _Generic((x), \
+ unsigned char: stdc_has_single_bit_uc, \
+ unsigned short: stdc_has_single_bit_us, \
+ unsigned: stdc_has_single_bit_ui, \
+ unsigned long: stdc_has_single_bit_ul, \
+ unsigned long long: stdc_has_single_bit_ull)(x)
#endif // __cplusplus
#endif // __LLVM_LIBC_MACROS_STDBIT_MACROS_H
diff --git a/libc/include/llvm-libc-types/float128.h b/libc/include/llvm-libc-types/float128.h
index 61a094fdb96b..1907a5e3ece7 100644
--- a/libc/include/llvm-libc-types/float128.h
+++ b/libc/include/llvm-libc-types/float128.h
@@ -9,7 +9,7 @@
#ifndef __LLVM_LIBC_TYPES_FLOAT128_H__
#define __LLVM_LIBC_TYPES_FLOAT128_H__
-#include <include/llvm-libc-macros/float-macros.h> // LDBL_MANT_DIG
+#include "llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG
// Currently, C23 `_Float128` type is only defined as a built-in type in GCC 7
// or later, and only for C. For C++, or for clang, `__float128` is defined
diff --git a/libc/spec/spec.td b/libc/spec/spec.td
index 90c076580be1..998f37fb26de 100644
--- a/libc/spec/spec.td
+++ b/libc/spec/spec.td
@@ -51,6 +51,7 @@ def LongDoubleType : NamedType<"long double">;
def CharType : NamedType<"char">;
def UnsignedCharType : NamedType<"unsigned char">;
def UnsignedShortType : NamedType<"unsigned short">;
+def BoolType : NamedType<"bool">;
def Float128Type : NamedType<"float128">;
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index 3fe19fae4c2e..5b97255b8997 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -414,6 +414,12 @@ def StdC : StandardSpec<"stdc"> {
FunctionSpec<"ilogb", RetValSpec<IntType>, [ArgSpec<DoubleType>]>,
FunctionSpec<"ilogbf", RetValSpec<IntType>, [ArgSpec<FloatType>]>,
FunctionSpec<"ilogbl", RetValSpec<IntType>, [ArgSpec<LongDoubleType>]>,
+ GuardedFunctionSpec<"ilogbf128", RetValSpec<IntType>, [ArgSpec<Float128Type>], "LIBC_COMPILER_HAS_FLOAT128">,
+
+ FunctionSpec<"llogb", RetValSpec<LongType>, [ArgSpec<DoubleType>]>,
+ FunctionSpec<"llogbf", RetValSpec<LongType>, [ArgSpec<FloatType>]>,
+ FunctionSpec<"llogbl", RetValSpec<LongType>, [ArgSpec<LongDoubleType>]>,
+ GuardedFunctionSpec<"llogbf128", RetValSpec<LongType>, [ArgSpec<Float128Type>], "LIBC_COMPILER_HAS_FLOAT128">,
FunctionSpec<"ldexp", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<IntType>]>,
FunctionSpec<"ldexpf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<IntType>]>,
@@ -435,6 +441,7 @@ def StdC : StandardSpec<"stdc"> {
FunctionSpec<"logb", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
FunctionSpec<"logbf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
FunctionSpec<"logbl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>]>,
+ GuardedFunctionSpec<"logbf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>], "LIBC_COMPILER_HAS_FLOAT128">,
FunctionSpec<"modf", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoublePtr>]>,
FunctionSpec<"modff", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatPtr>]>,
@@ -790,7 +797,10 @@ def StdC : StandardSpec<"stdc"> {
Macro<"stdc_first_leading_zero">,
Macro<"stdc_first_leading_one">,
Macro<"stdc_first_trailing_zero">,
- Macro<"stdc_first_trailing_one">
+ Macro<"stdc_first_trailing_one">,
+ Macro<"stdc_count_zeros">,
+ Macro<"stdc_count_ones">,
+ Macro<"stdc_has_single_bit">
], // Macros
[], // Types
[], // Enumerations
@@ -829,7 +839,22 @@ def StdC : StandardSpec<"stdc"> {
FunctionSpec<"stdc_first_trailing_one_us", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedShortType>]>,
FunctionSpec<"stdc_first_trailing_one_ui", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedIntType>]>,
FunctionSpec<"stdc_first_trailing_one_ul", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedLongType>]>,
- FunctionSpec<"stdc_first_trailing_one_ull", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedLongLongType>]>
+ FunctionSpec<"stdc_first_trailing_one_ull", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedLongLongType>]>,
+ FunctionSpec<"stdc_count_zeros_uc", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedCharType>]>,
+ FunctionSpec<"stdc_count_zeros_us", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedShortType>]>,
+ FunctionSpec<"stdc_count_zeros_ui", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedIntType>]>,
+ FunctionSpec<"stdc_count_zeros_ul", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedLongType>]>,
+ FunctionSpec<"stdc_count_zeros_ull", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedLongLongType>]>,
+ FunctionSpec<"stdc_count_ones_uc", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedCharType>]>,
+ FunctionSpec<"stdc_count_ones_us", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedShortType>]>,
+ FunctionSpec<"stdc_count_ones_ui", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedIntType>]>,
+ FunctionSpec<"stdc_count_ones_ul", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedLongType>]>,
+ FunctionSpec<"stdc_count_ones_ull", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedLongLongType>]>,
+ FunctionSpec<"stdc_has_single_bit_uc", RetValSpec<BoolType>, [ArgSpec<UnsignedCharType>]>,
+ FunctionSpec<"stdc_has_single_bit_us", RetValSpec<BoolType>, [ArgSpec<UnsignedShortType>]>,
+ FunctionSpec<"stdc_has_single_bit_ui", RetValSpec<BoolType>, [ArgSpec<UnsignedIntType>]>,
+ FunctionSpec<"stdc_has_single_bit_ul", RetValSpec<BoolType>, [ArgSpec<UnsignedLongType>]>,
+ FunctionSpec<"stdc_has_single_bit_ull", RetValSpec<BoolType>, [ArgSpec<UnsignedLongLongType>]>
] // Functions
>;
diff --git a/libc/spec/stdc_ext.td b/libc/spec/stdc_ext.td
index 6620142146c4..be1e6d4ba2fc 100644
--- a/libc/spec/stdc_ext.td
+++ b/libc/spec/stdc_ext.td
@@ -47,6 +47,14 @@ def StdcExt : StandardSpec<"stdc_ext"> {
GuardedFunctionSpec<"rounduhk", RetValSpec<UnsignedShortAccumType>, [ArgSpec<UnsignedShortAccumType>, ArgSpec<IntType>], "LIBC_COMPILER_HAS_FIXED_POINT">,
GuardedFunctionSpec<"rounduk", RetValSpec<UnsignedAccumType>, [ArgSpec<UnsignedAccumType>, ArgSpec<IntType>], "LIBC_COMPILER_HAS_FIXED_POINT">,
GuardedFunctionSpec<"roundulk", RetValSpec<UnsignedLongAccumType>, [ArgSpec<UnsignedLongAccumType>, ArgSpec<IntType>], "LIBC_COMPILER_HAS_FIXED_POINT">,
+
+ GuardedFunctionSpec<"sqrtuhr", RetValSpec<UnsignedShortFractType>, [ArgSpec<UnsignedShortFractType>], "LIBC_COMPILER_HAS_FIXED_POINT">,
+ GuardedFunctionSpec<"sqrtur", RetValSpec<UnsignedFractType>, [ArgSpec<UnsignedFractType>], "LIBC_COMPILER_HAS_FIXED_POINT">,
+ GuardedFunctionSpec<"sqrtulr", RetValSpec<UnsignedLongFractType>, [ArgSpec<UnsignedLongFractType>], "LIBC_COMPILER_HAS_FIXED_POINT">,
+
+ GuardedFunctionSpec<"sqrtuhk", RetValSpec<UnsignedShortAccumType>, [ArgSpec<UnsignedShortAccumType>], "LIBC_COMPILER_HAS_FIXED_POINT">,
+ GuardedFunctionSpec<"sqrtuk", RetValSpec<UnsignedAccumType>, [ArgSpec<UnsignedAccumType>], "LIBC_COMPILER_HAS_FIXED_POINT">,
+ GuardedFunctionSpec<"sqrtulk", RetValSpec<UnsignedLongAccumType>, [ArgSpec<UnsignedLongAccumType>], "LIBC_COMPILER_HAS_FIXED_POINT">,
]
>;
diff --git a/libc/src/__support/CPP/limits.h b/libc/src/__support/CPP/limits.h
index 1ffde5f9556f..6440e8cb358f 100644
--- a/libc/src/__support/CPP/limits.h
+++ b/libc/src/__support/CPP/limits.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC___SUPPORT_CPP_LIMITS_H
#define LLVM_LIBC_SRC___SUPPORT_CPP_LIMITS_H
-#include "include/llvm-libc-macros/limits-macros.h" // CHAR_BIT
+#include "llvm-libc-macros/limits-macros.h" // CHAR_BIT
#include "src/__support/CPP/type_traits/is_integral.h"
#include "src/__support/CPP/type_traits/is_signed.h"
#include "src/__support/macros/attributes.h" // LIBC_INLINE
diff --git a/libc/src/__support/CPP/type_traits/is_fixed_point.h b/libc/src/__support/CPP/type_traits/is_fixed_point.h
index 317ba39748b7..e139e6477e2e 100644
--- a/libc/src/__support/CPP/type_traits/is_fixed_point.h
+++ b/libc/src/__support/CPP/type_traits/is_fixed_point.h
@@ -12,7 +12,7 @@
#include "src/__support/CPP/type_traits/remove_cv.h"
#include "src/__support/macros/attributes.h"
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
namespace LIBC_NAMESPACE::cpp {
diff --git a/libc/src/__support/FPUtil/ManipulationFunctions.h b/libc/src/__support/FPUtil/ManipulationFunctions.h
index 9e760a28f42d..c1d57bd37c19 100644
--- a/libc/src/__support/FPUtil/ManipulationFunctions.h
+++ b/libc/src/__support/FPUtil/ManipulationFunctions.h
@@ -71,54 +71,80 @@ LIBC_INLINE T copysign(T x, T y) {
return xbits.get_val();
}
-template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
-LIBC_INLINE int ilogb(T x) {
- // TODO: Raise appropriate floating point exceptions and set errno to the
- // an appropriate error value wherever relevant.
- FPBits<T> bits(x);
- if (bits.is_zero()) {
- return FP_ILOGB0;
- } else if (bits.is_nan()) {
- return FP_ILOGBNAN;
- } else if (bits.is_inf()) {
- return INT_MAX;
+template <typename T> struct IntLogbConstants;
+
+template <> struct IntLogbConstants<int> {
+ LIBC_INLINE_VAR static constexpr int FP_LOGB0 = FP_ILOGB0;
+ LIBC_INLINE_VAR static constexpr int FP_LOGBNAN = FP_ILOGBNAN;
+ LIBC_INLINE_VAR static constexpr int T_MAX = INT_MAX;
+ LIBC_INLINE_VAR static constexpr int T_MIN = INT_MIN;
+};
+
+template <> struct IntLogbConstants<long> {
+ LIBC_INLINE_VAR static constexpr long FP_LOGB0 = FP_ILOGB0;
+ LIBC_INLINE_VAR static constexpr long FP_LOGBNAN = FP_ILOGBNAN;
+ LIBC_INLINE_VAR static constexpr long T_MAX = LONG_MAX;
+ LIBC_INLINE_VAR static constexpr long T_MIN = LONG_MIN;
+};
+
+template <typename T, typename U>
+LIBC_INLINE constexpr cpp::enable_if_t<cpp::is_floating_point_v<U>, T>
+intlogb(U x) {
+ FPBits<U> bits(x);
+ if (LIBC_UNLIKELY(bits.is_zero() || bits.is_inf_or_nan())) {
+ set_errno_if_required(EDOM);
+ raise_except_if_required(FE_INVALID);
+
+ if (bits.is_zero())
+ return IntLogbConstants<T>::FP_LOGB0;
+ if (bits.is_nan())
+ return IntLogbConstants<T>::FP_LOGBNAN;
+ // bits is inf.
+ return IntLogbConstants<T>::T_MAX;
}
- NormalFloat<T> normal(bits);
+ DyadicFloat<FPBits<U>::STORAGE_LEN> normal(bits.get_val());
+ int exponent = normal.get_unbiased_exponent();
// The C standard does not specify the return value when an exponent is
// out of int range. However, XSI conformance required that INT_MAX or
// INT_MIN are returned.
// NOTE: It is highly unlikely that exponent will be out of int range as
// the exponent is only 15 bits wide even for the 128-bit floating point
// format.
- if (normal.exponent > INT_MAX)
- return INT_MAX;
- else if (normal.exponent < INT_MIN)
- return INT_MIN;
- else
- return normal.exponent;
+ if (LIBC_UNLIKELY(exponent > IntLogbConstants<T>::T_MAX ||
+ exponent < IntLogbConstants<T>::T_MIN)) {
+ set_errno_if_required(ERANGE);
+ raise_except_if_required(FE_INVALID);
+ return exponent > 0 ? IntLogbConstants<T>::T_MAX
+ : IntLogbConstants<T>::T_MIN;
+ }
+
+ return static_cast<T>(exponent);
}
template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
-LIBC_INLINE T logb(T x) {
+LIBC_INLINE constexpr T logb(T x) {
FPBits<T> bits(x);
- if (bits.is_zero()) {
- // TODO(Floating point exception): Raise div-by-zero exception.
- // TODO(errno): POSIX requires setting errno to ERANGE.
- return FPBits<T>::inf(Sign::NEG).get_val();
- } else if (bits.is_nan()) {
- return x;
- } else if (bits.is_inf()) {
- // Return positive infinity.
+ if (LIBC_UNLIKELY(bits.is_zero() || bits.is_inf_or_nan())) {
+ if (bits.is_nan())
+ return x;
+
+ raise_except_if_required(FE_DIVBYZERO);
+
+ if (bits.is_zero()) {
+ set_errno_if_required(ERANGE);
+ return FPBits<T>::inf(Sign::NEG).get_val();
+ }
+ // bits is inf.
return FPBits<T>::inf().get_val();
}
- NormalFloat<T> normal(bits);
- return static_cast<T>(normal.exponent);
+ DyadicFloat<FPBits<T>::STORAGE_LEN> normal(bits.get_val());
+ return static_cast<T>(normal.get_unbiased_exponent());
}
template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
-LIBC_INLINE T ldexp(T x, int exp) {
+LIBC_INLINE constexpr T ldexp(T x, int exp) {
FPBits<T> bits(x);
if (LIBC_UNLIKELY((exp == 0) || bits.is_zero() || bits.is_inf_or_nan()))
return x;
diff --git a/libc/src/__support/FPUtil/dyadic_float.h b/libc/src/__support/FPUtil/dyadic_float.h
index 7797c57b96fd..14bc73433097 100644
--- a/libc/src/__support/FPUtil/dyadic_float.h
+++ b/libc/src/__support/FPUtil/dyadic_float.h
@@ -79,6 +79,11 @@ template <size_t Bits> struct DyadicFloat {
return *this;
}
+ // Assume that it is already normalized. Output the unbiased exponent.
+ LIBC_INLINE constexpr int get_unbiased_exponent() const {
+ return exponent + (Bits - 1);
+ }
+
// Assume that it is already normalized.
// Output is rounded correctly with respect to the current rounding mode.
template <typename T,
diff --git a/libc/src/__support/HashTable/table.h b/libc/src/__support/HashTable/table.h
index 5b4697e5245b..e2a26d0e2b5f 100644
--- a/libc/src/__support/HashTable/table.h
+++ b/libc/src/__support/HashTable/table.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC___SUPPORT_HASHTABLE_table_H
#define LLVM_LIBC_SRC___SUPPORT_HASHTABLE_table_H
-#include "include/llvm-libc-types/ENTRY.h"
+#include "llvm-libc-types/ENTRY.h"
#include "src/__support/CPP/bit.h" // bit_ceil
#include "src/__support/CPP/new.h"
#include "src/__support/HashTable/bitmask.h"
diff --git a/libc/src/__support/RPC/rpc_util.h b/libc/src/__support/RPC/rpc_util.h
index cc2a11a1108e..11d2f751355d 100644
--- a/libc/src/__support/RPC/rpc_util.h
+++ b/libc/src/__support/RPC/rpc_util.h
@@ -11,28 +11,15 @@
#include "src/__support/CPP/type_traits.h"
#include "src/__support/GPU/utils.h"
-#include "src/__support/macros/attributes.h" // LIBC_INLINE
+#include "src/__support/macros/attributes.h"
#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/threads/sleep.h"
#include "src/string/memory_utils/generic/byte_per_byte.h"
#include "src/string/memory_utils/inline_memcpy.h"
namespace LIBC_NAMESPACE {
namespace rpc {
-/// Suspend the thread briefly to assist the thread scheduler during busy loops.
-LIBC_INLINE void sleep_briefly() {
-#if defined(LIBC_TARGET_ARCH_IS_NVPTX)
- if (__nvvm_reflect("__CUDA_ARCH") >= 700)
- LIBC_INLINE_ASM("nanosleep.u32 64;" ::: "memory");
-#elif defined(LIBC_TARGET_ARCH_IS_AMDGPU)
- __builtin_amdgcn_s_sleep(2);
-#elif defined(LIBC_TARGET_ARCH_IS_X86)
- __builtin_ia32_pause();
-#else
- // Simply do nothing if sleeping isn't supported on this platform.
-#endif
-}
-
/// Conditional to indicate if this process is running on the GPU.
LIBC_INLINE constexpr bool is_process_gpu() {
#if defined(LIBC_TARGET_ARCH_IS_GPU)
diff --git a/libc/src/__support/fixed_point/CMakeLists.txt b/libc/src/__support/fixed_point/CMakeLists.txt
index 64f9dacc7ba5..0ed118f24088 100644
--- a/libc/src/__support/fixed_point/CMakeLists.txt
+++ b/libc/src/__support/fixed_point/CMakeLists.txt
@@ -21,3 +21,16 @@ add_header_library(
libc.src.__support.CPP.bit
libc.src.__support.math_extras
)
+
+add_header_library(
+ sqrt
+ HDRS
+ sqrt.h
+ DEPENDS
+ .fx_rep
+ libc.include.llvm-libc-macros.stdfix_macros
+ libc.src.__support.macros.attributes
+ libc.src.__support.macros.optimization
+ libc.src.__support.CPP.bit
+ libc.src.__support.CPP.type_traits
+)
diff --git a/libc/src/__support/fixed_point/fx_bits.h b/libc/src/__support/fixed_point/fx_bits.h
index fcd47cd72cbb..0c8d03beb84a 100644
--- a/libc/src/__support/fixed_point/fx_bits.h
+++ b/libc/src/__support/fixed_point/fx_bits.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_FXBITS_H
#define LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_FXBITS_H
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
#include "src/__support/CPP/bit.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/macros/attributes.h" // LIBC_INLINE
diff --git a/libc/src/__support/fixed_point/fx_rep.h b/libc/src/__support/fixed_point/fx_rep.h
index fcd7554c4d85..7d18f14a8c48 100644
--- a/libc/src/__support/fixed_point/fx_rep.h
+++ b/libc/src/__support/fixed_point/fx_rep.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_FXREP_H
#define LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_FXREP_H
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/macros/attributes.h" // LIBC_INLINE, LIBC_INLINE_VAR
diff --git a/libc/src/__support/fixed_point/sqrt.h b/libc/src/__support/fixed_point/sqrt.h
new file mode 100644
index 000000000000..236ebb285703
--- /dev/null
+++ b/libc/src/__support/fixed_point/sqrt.h
@@ -0,0 +1,129 @@
+//===-- Calculate square root of fixed point numbers. ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_SQRT_H
+#define LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_SQRT_H
+
+#include "llvm-libc-macros/stdfix-macros.h"
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/type_traits.h"
+#include "src/__support/macros/attributes.h" // LIBC_INLINE
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+
+#include "fx_rep.h"
+
+#ifdef LIBC_COMPILER_HAS_FIXED_POINT
+
+namespace LIBC_NAMESPACE::fixed_point {
+
+namespace internal {
+
+template <typename T> struct SqrtConfig;
+
+template <> struct SqrtConfig<unsigned short fract> {
+ using Type = unsigned short fract;
+ static constexpr int EXTRA_STEPS = 0;
+};
+
+template <> struct SqrtConfig<unsigned fract> {
+ using Type = unsigned fract;
+ static constexpr int EXTRA_STEPS = 1;
+};
+
+template <> struct SqrtConfig<unsigned long fract> {
+ using Type = unsigned long fract;
+ static constexpr int EXTRA_STEPS = 2;
+};
+
+template <>
+struct SqrtConfig<unsigned short accum> : SqrtConfig<unsigned fract> {};
+
+template <>
+struct SqrtConfig<unsigned accum> : SqrtConfig<unsigned long fract> {};
+
+// TODO: unsigned long accum type is 64-bit, and will need 64-bit fract type.
+// Probably we will use DyadicFloat<64> for intermediate computations instead.
+
+// Linear approximation for the initial values, with errors bounded by:
+// max(1.5 * 2^-11, eps)
+// Generated with Sollya:
+// > for i from 4 to 15 do {
+// P = fpminimax(sqrt(x), 1, [|8, 8|], [i * 2^-4, (i + 1)*2^-4],
+// fixed, absolute);
+// print("{", coeff(P, 1), "uhr,", coeff(P, 0), "uhr},");
+// };
+static constexpr unsigned short fract SQRT_FIRST_APPROX[12][2] = {
+ {0x1.e8p-1uhr, 0x1.0cp-2uhr}, {0x1.bap-1uhr, 0x1.28p-2uhr},
+ {0x1.94p-1uhr, 0x1.44p-2uhr}, {0x1.74p-1uhr, 0x1.6p-2uhr},
+ {0x1.6p-1uhr, 0x1.74p-2uhr}, {0x1.4ep-1uhr, 0x1.88p-2uhr},
+ {0x1.3ep-1uhr, 0x1.9cp-2uhr}, {0x1.32p-1uhr, 0x1.acp-2uhr},
+ {0x1.22p-1uhr, 0x1.c4p-2uhr}, {0x1.18p-1uhr, 0x1.d4p-2uhr},
+ {0x1.08p-1uhr, 0x1.fp-2uhr}, {0x1.04p-1uhr, 0x1.f8p-2uhr},
+};
+
+} // namespace internal
+
+template <typename T>
+LIBC_INLINE constexpr cpp::enable_if_t<cpp::is_fixed_point_v<T>, T> sqrt(T x) {
+ using BitType = typename FXRep<T>::StorageType;
+ BitType x_bit = cpp::bit_cast<BitType>(x);
+
+ if (LIBC_UNLIKELY(x_bit == 0))
+ return FXRep<T>::ZERO();
+
+ int leading_zeros = cpp::countl_zero(x_bit);
+ constexpr int STORAGE_LENGTH = sizeof(BitType) * CHAR_BIT;
+ constexpr int EXP_ADJUSTMENT = STORAGE_LENGTH - FXRep<T>::FRACTION_LEN - 1;
+ // x_exp is the real exponent of the leading bit of x.
+ int x_exp = EXP_ADJUSTMENT - leading_zeros;
+ int shift = EXP_ADJUSTMENT - 1 - (x_exp & (~1));
+ // Normalize.
+ x_bit <<= shift;
+ using FracType = typename internal::SqrtConfig<T>::Type;
+ FracType x_frac = cpp::bit_cast<FracType>(x_bit);
+
+ // Use the Newton method to approximate sqrt(a):
+ // x_{n + 1} = 1/2 (x_n + a / x_n)
+ // For the initial value x_0, we use a linear approximation from a lookup table.
+
+ // Use the leading 4 bits to do look up for sqrt(x).
+ // After normalization, 0.25 <= x_frac < 1, so the leading 4 bits of x_frac
+ // are between 0b0100 and 0b1111. Hence the lookup table only needs 12
+ // entries, and we can get the index by subtracting the leading 4 bits of
+ // x_frac by 4 = 0b0100.
+ int index = (x_bit >> (STORAGE_LENGTH - 4)) - 4;
+ FracType a = static_cast<FracType>(internal::SQRT_FIRST_APPROX[index][0]);
+ FracType b = static_cast<FracType>(internal::SQRT_FIRST_APPROX[index][1]);
+
+ // Initial approximation step.
+ // Estimated error bounds: | r - sqrt(x_frac) | < max(1.5 * 2^-11, eps).
+ FracType r = a * x_frac + b;
+
+ // Further Newton-method iterations for square-root:
+ // x_{n + 1} = 0.5 * (x_n + a / x_n)
+ // We distribute and do the multiplication by 0.5 first to avoid overflow.
+ // TODO: Investigate the performance and accuracy of using division-free
+ // iterations from:
+ // Blanchard, J. D. and Chamberland, M., "Newton's Method Without Division",
+ // The American Mathematical Monthly (2023).
+ // https://chamberland.math.grinnell.edu/papers/newton.pdf
+ for (int i = 0; i < internal::SqrtConfig<T>::EXTRA_STEPS; ++i)
+ r = (r >> 1) + (x_frac >> 1) / r;
+
+ // Re-scaling
+ r >>= EXP_ADJUSTMENT - (x_exp >> 1);
+
+ // Return result.
+ return cpp::bit_cast<T>(r);
+}
+
+} // namespace LIBC_NAMESPACE::fixed_point
+
+#endif // LIBC_COMPILER_HAS_FIXED_POINT
+
+#endif // LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_SQRT_H
diff --git a/libc/src/__support/macros/properties/float.h b/libc/src/__support/macros/properties/float.h
index 08a1ab726cbd..510f39237493 100644
--- a/libc/src/__support/macros/properties/float.h
+++ b/libc/src/__support/macros/properties/float.h
@@ -11,8 +11,8 @@
#ifndef LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_FLOAT_H
#define LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_FLOAT_H
-#include "include/llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG
-#include "include/llvm-libc-types/float128.h" // float128
+#include "llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG
+#include "llvm-libc-types/float128.h" // float128
#include "src/__support/macros/properties/architectures.h"
#include "src/__support/macros/properties/compiler.h"
#include "src/__support/macros/properties/cpu_features.h"
diff --git a/libc/src/__support/memory_size.h b/libc/src/__support/memory_size.h
index 4c7d2079553e..94aee2520afa 100644
--- a/libc/src/__support/memory_size.h
+++ b/libc/src/__support/memory_size.h
@@ -52,9 +52,11 @@ public:
LIBC_INLINE SafeMemSize operator+(const SafeMemSize &other) {
type result;
- if (LIBC_UNLIKELY((value | other.value) < 0))
+ if (LIBC_UNLIKELY((value | other.value) < 0)) {
result = -1;
- result = value + other.value;
+ } else {
+ result = value + other.value;
+ }
return SafeMemSize{result};
}
diff --git a/libc/src/__support/threads/CMakeLists.txt b/libc/src/__support/threads/CMakeLists.txt
index 0feeda0c179b..731adf6f9c8e 100644
--- a/libc/src/__support/threads/CMakeLists.txt
+++ b/libc/src/__support/threads/CMakeLists.txt
@@ -4,6 +4,12 @@ add_header_library(
mutex_common.h
)
+add_header_library(
+ sleep
+ HDRS
+ sleep.h
+)
+
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
add_subdirectory(${LIBC_TARGET_OS})
endif()
diff --git a/libc/src/__support/threads/sleep.h b/libc/src/__support/threads/sleep.h
new file mode 100644
index 000000000000..9a2dff598ece
--- /dev/null
+++ b/libc/src/__support/threads/sleep.h
@@ -0,0 +1,34 @@
+//===-- Utilities for suspending threads ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_THREADS_SLEEP_H
+#define LLVM_LIBC_SRC___SUPPORT_THREADS_SLEEP_H
+
+#include "src/__support/macros/attributes.h"
+
+namespace LIBC_NAMESPACE {
+
+/// Suspend the thread briefly to assist the thread scheduler during busy loops.
+LIBC_INLINE void sleep_briefly() {
+#if defined(LIBC_TARGET_ARCH_IS_NVPTX)
+ if (__nvvm_reflect("__CUDA_ARCH") >= 700)
+ LIBC_INLINE_ASM("nanosleep.u32 64;" ::: "memory");
+#elif defined(LIBC_TARGET_ARCH_IS_AMDGPU)
+ __builtin_amdgcn_s_sleep(2);
+#elif defined(LIBC_TARGET_ARCH_IS_X86)
+ __builtin_ia32_pause();
+#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
+ __builtin_arm_isb(0xf);
+#else
+ // Simply do nothing if sleeping isn't supported on this platform.
+#endif
+}
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC___SUPPORT_THREADS_SLEEP_H
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 33dc1fc97c56..882befd9f7e7 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -1,9 +1,6 @@
add_subdirectory(generic)
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE})
add_subdirectory(${LIBC_TARGET_ARCHITECTURE})
-elseif(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
- # TODO: We should split this into 'nvptx' and 'amdgpu' for the GPU build.
- add_subdirectory(${LIBC_TARGET_OS})
endif()
function(add_math_entrypoint_object name)
@@ -11,7 +8,6 @@ function(add_math_entrypoint_object name)
# that first and return early if we are able to add an alias target for the
# machine specific implementation.
get_fq_target_name("${LIBC_TARGET_ARCHITECTURE}.${name}" fq_machine_specific_target_name)
- get_fq_target_name("${LIBC_TARGET_OS}.${name}" fq_os_specific_target_name)
if(TARGET ${fq_machine_specific_target_name})
add_entrypoint_object(
${name}
@@ -20,28 +16,6 @@ function(add_math_entrypoint_object name)
.${LIBC_TARGET_ARCHITECTURE}.${name}
)
return()
- elseif(TARGET ${fq_os_specific_target_name})
- add_entrypoint_object(
- ${name}
- ALIAS
- DEPENDS
- .${LIBC_TARGET_OS}.${name}
- )
- return()
- endif()
-
- # The GPU optionally depends on vendor libraries. If we emitted one of these
- # entrypoints it means the user requested it and we should use it instead.
- get_fq_target_name("${LIBC_TARGET_OS}.vendor.${name}" fq_vendor_specific_target_name)
- if(TARGET ${fq_vendor_specific_target_name})
- add_entrypoint_object(
- ${name}
- ALIAS
- DEPENDS
- .${LIBC_TARGET_OS}.vendor.${name}
- VENDOR
- )
- return()
endif()
get_fq_target_name("generic.${name}" fq_generic_target_name)
@@ -157,6 +131,12 @@ add_math_entrypoint_object(hypotf)
add_math_entrypoint_object(ilogb)
add_math_entrypoint_object(ilogbf)
add_math_entrypoint_object(ilogbl)
+add_math_entrypoint_object(ilogbf128)
+
+add_math_entrypoint_object(llogb)
+add_math_entrypoint_object(llogbf)
+add_math_entrypoint_object(llogbl)
+add_math_entrypoint_object(llogbf128)
add_math_entrypoint_object(ldexp)
add_math_entrypoint_object(ldexpf)
@@ -178,6 +158,7 @@ add_math_entrypoint_object(logf)
add_math_entrypoint_object(logb)
add_math_entrypoint_object(logbf)
add_math_entrypoint_object(logbl)
+add_math_entrypoint_object(logbf128)
add_math_entrypoint_object(llrint)
add_math_entrypoint_object(llrintf)
diff --git a/libc/src/math/gpu/vendor/CMakeLists.txt b/libc/src/math/amdgpu/CMakeLists.txt
index 36087ade63bf..cb77341aa505 100644
--- a/libc/src/math/gpu/vendor/CMakeLists.txt
+++ b/libc/src/math/amdgpu/CMakeLists.txt
@@ -1,39 +1,360 @@
+# Math functions not yet available in the libc project, or those not yet tuned
+# for GPU workloads are provided as wrappers over vendor libraries. If we find
+# them ahead of time we will import them statically. Otherwise, we will keep
+# them as external references and expect them to be resolved by the user when
+ they compile. In the future, we will use implementations from the 'libc'
+# project and not provide these wrappers.
find_package(AMDDeviceLibs QUIET HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
if(AMDDeviceLibs_FOUND)
message(STATUS "Found the ROCm device library. Implementations falling back "
"to the vendor libraries will be resolved statically.")
get_target_property(ocml_path ocml IMPORTED_LOCATION)
- list(APPEND bitcode_link_flags
- "SHELL:-Xclang -mlink-builtin-bitcode -Xclang ${ocml_path}")
+ set(bitcode_link_flags
+ "SHELL:-Xclang -mlink-builtin-bitcode -Xclang ${ocml_path}")
else()
message(STATUS "Could not find the ROCm device library. Unimplemented "
"functions will be an external reference to the vendor libraries.")
endif()
-if(CUDAToolkit_FOUND)
- set(libdevice_path ${CUDAToolkit_BIN_DIR}/../nvvm/libdevice/libdevice.10.bc)
- if (EXISTS ${libdevice_path})
- message(STATUS "Found the CUDA device library. Implementations falling back "
- "to the vendor libraries will be resolved statically.")
- list(APPEND bitcode_link_flags
- "SHELL:-Xclang -mlink-builtin-bitcode -Xclang ${libdevice_path}")
- endif()
-else()
- message(STATUS "Could not find the CUDA device library. Unimplemented "
- "functions will be an external reference to the vendor libraries.")
-endif()
+add_entrypoint_object(
+ ceil
+ SRCS
+ ceil.cpp
+ HDRS
+ ../ceil.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ ceilf
+ SRCS
+ ceilf.cpp
+ HDRS
+ ../ceilf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ copysign
+ SRCS
+ copysign.cpp
+ HDRS
+ ../copysign.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ copysignf
+ SRCS
+ copysignf.cpp
+ HDRS
+ ../copysignf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ fabs
+ SRCS
+ fabs.cpp
+ HDRS
+ ../fabs.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ fabsf
+ SRCS
+ fabsf.cpp
+ HDRS
+ ../fabsf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ floor
+ SRCS
+ floor.cpp
+ HDRS
+ ../floor.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ floorf
+ SRCS
+ floorf.cpp
+ HDRS
+ ../floorf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ fma
+ SRCS
+ fma.cpp
+ HDRS
+ ../fma.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ fmaf
+ SRCS
+ fmaf.cpp
+ HDRS
+ ../fmaf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ fmax
+ SRCS
+ fmax.cpp
+ HDRS
+ ../fmax.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ fmaxf
+ SRCS
+ fmaxf.cpp
+ HDRS
+ ../fmaxf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ fmin
+ SRCS
+ fmin.cpp
+ HDRS
+ ../fmin.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ fminf
+ SRCS
+ fminf.cpp
+ HDRS
+ ../fminf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ fmod
+ SRCS
+ fmod.cpp
+ HDRS
+ ../fmod.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ fmodf
+ SRCS
+ fmodf.cpp
+ HDRS
+ ../fmodf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ lround
+ SRCS
+ lround.cpp
+ HDRS
+ ../lround.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ lroundf
+ SRCS
+ lroundf.cpp
+ HDRS
+ ../lroundf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ llround
+ SRCS
+ llround.cpp
+ HDRS
+ ../llround.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ llroundf
+ SRCS
+ llroundf.cpp
+ HDRS
+ ../llroundf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ modf
+ SRCS
+ modf.cpp
+ HDRS
+ ../modf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ modff
+ SRCS
+ modff.cpp
+ HDRS
+ ../modff.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ nearbyint
+ SRCS
+ nearbyint.cpp
+ HDRS
+ ../nearbyint.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ nearbyintf
+ SRCS
+ nearbyintf.cpp
+ HDRS
+ ../nearbyintf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ remainder
+ SRCS
+ remainder.cpp
+ HDRS
+ ../remainder.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ remainderf
+ SRCS
+ remainderf.cpp
+ HDRS
+ ../remainderf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ rint
+ SRCS
+ rint.cpp
+ HDRS
+ ../rint.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ rintf
+ SRCS
+ rintf.cpp
+ HDRS
+ ../rintf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ round
+ SRCS
+ round.cpp
+ HDRS
+ ../round.h
+ COMPILE_OPTIONS
+ -O2
+)
-# FIXME: We need a way to pass the library to only the NVTPX / AMDGPU build.
-# This shouldn't cause issues because we only link in needed symbols, but it
-# will link in identity metadata from both libraries. This silences the warning.
-list(APPEND bitcode_link_flags "-Wno-linker-warnings")
+add_entrypoint_object(
+ sqrt
+ SRCS
+ sqrt.cpp
+ HDRS
+ ../sqrt.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ sqrtf
+ SRCS
+ sqrtf.cpp
+ HDRS
+ ../sqrtf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ trunc
+ SRCS
+ trunc.cpp
+ HDRS
+ ../trunc.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ truncf
+ SRCS
+ truncf.cpp
+ HDRS
+ ../truncf.h
+ COMPILE_OPTIONS
+ -O2
+)
+# The following functions currently are not implemented natively and borrow from
+# existing implementations. This will be removed in the future.
add_entrypoint_object(
acos
SRCS
acos.cpp
HDRS
- ../../acos.h
+ ../acos.h
+ VENDOR
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
@@ -44,10 +365,11 @@ add_entrypoint_object(
SRCS
acosf.cpp
HDRS
- ../../acosf.h
+ ../acosf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -55,10 +377,11 @@ add_entrypoint_object(
SRCS
acosh.cpp
HDRS
- ../../acosh.h
+ ../acosh.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -66,10 +389,11 @@ add_entrypoint_object(
SRCS
acoshf.cpp
HDRS
- ../../acoshf.h
+ ../acoshf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -77,10 +401,11 @@ add_entrypoint_object(
SRCS
asin.cpp
HDRS
- ../../asin.h
+ ../asin.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -88,10 +413,11 @@ add_entrypoint_object(
SRCS
asinf.cpp
HDRS
- ../../asinf.h
+ ../asinf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -99,10 +425,11 @@ add_entrypoint_object(
SRCS
asinh.cpp
HDRS
- ../../asinh.h
+ ../asinh.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -110,10 +437,11 @@ add_entrypoint_object(
SRCS
atan.cpp
HDRS
- ../../atan.h
+ ../atan.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -121,10 +449,11 @@ add_entrypoint_object(
SRCS
atanf.cpp
HDRS
- ../../atanf.h
+ ../atanf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -132,10 +461,11 @@ add_entrypoint_object(
SRCS
atan2.cpp
HDRS
- ../../atan2.h
+ ../atan2.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -143,10 +473,11 @@ add_entrypoint_object(
SRCS
atan2f.cpp
HDRS
- ../../atan2f.h
+ ../atan2f.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -154,10 +485,11 @@ add_entrypoint_object(
SRCS
atanh.cpp
HDRS
- ../../atanh.h
+ ../atanh.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -165,10 +497,11 @@ add_entrypoint_object(
SRCS
atanhf.cpp
HDRS
- ../../atanhf.h
+ ../atanhf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -176,10 +509,11 @@ add_entrypoint_object(
SRCS
cos.cpp
HDRS
- ../../cos.h
+ ../cos.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -187,10 +521,11 @@ add_entrypoint_object(
SRCS
cosf.cpp
HDRS
- ../../cosf.h
+ ../cosf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -198,10 +533,11 @@ add_entrypoint_object(
SRCS
cosh.cpp
HDRS
- ../../cosh.h
+ ../cosh.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -209,10 +545,11 @@ add_entrypoint_object(
SRCS
coshf.cpp
HDRS
- ../../coshf.h
+ ../coshf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -220,10 +557,11 @@ add_entrypoint_object(
SRCS
erf.cpp
HDRS
- ../../erf.h
+ ../erf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -231,10 +569,11 @@ add_entrypoint_object(
SRCS
erff.cpp
HDRS
- ../../erff.h
+ ../erff.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -242,10 +581,11 @@ add_entrypoint_object(
SRCS
exp.cpp
HDRS
- ../../exp.h
+ ../exp.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -253,10 +593,11 @@ add_entrypoint_object(
SRCS
exp10.cpp
HDRS
- ../../exp10.h
+ ../exp10.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -264,10 +605,11 @@ add_entrypoint_object(
SRCS
exp10f.cpp
HDRS
- ../../exp10f.h
+ ../exp10f.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -275,10 +617,11 @@ add_entrypoint_object(
SRCS
exp2.cpp
HDRS
- ../../exp2.h
+ ../exp2.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -286,10 +629,11 @@ add_entrypoint_object(
SRCS
exp2f.cpp
HDRS
- ../../exp2f.h
+ ../exp2f.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -297,10 +641,11 @@ add_entrypoint_object(
SRCS
expf.cpp
HDRS
- ../../expf.h
+ ../expf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -308,10 +653,11 @@ add_entrypoint_object(
SRCS
expm1.cpp
HDRS
- ../../expm1.h
+ ../expm1.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -319,10 +665,11 @@ add_entrypoint_object(
SRCS
expm1f.cpp
HDRS
- ../../expm1f.h
+ ../expm1f.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -330,10 +677,11 @@ add_entrypoint_object(
SRCS
fdim.cpp
HDRS
- ../../fdim.h
+ ../fdim.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -341,10 +689,11 @@ add_entrypoint_object(
SRCS
fdimf.cpp
HDRS
- ../../fdimf.h
+ ../fdimf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -352,10 +701,11 @@ add_entrypoint_object(
SRCS
hypot.cpp
HDRS
- ../../hypot.h
+ ../hypot.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -363,10 +713,11 @@ add_entrypoint_object(
SRCS
hypotf.cpp
HDRS
- ../../hypotf.h
+ ../hypotf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -374,10 +725,11 @@ add_entrypoint_object(
SRCS
ilogb.cpp
HDRS
- ../../ilogb.h
+ ../ilogb.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -385,10 +737,11 @@ add_entrypoint_object(
SRCS
ilogbf.cpp
HDRS
- ../../ilogbf.h
+ ../ilogbf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -396,10 +749,11 @@ add_entrypoint_object(
SRCS
log10.cpp
HDRS
- ../../log10.h
+ ../log10.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -407,10 +761,11 @@ add_entrypoint_object(
SRCS
log10f.cpp
HDRS
- ../../log10f.h
+ ../log10f.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -418,10 +773,11 @@ add_entrypoint_object(
SRCS
log2.cpp
HDRS
- ../../log2.h
+ ../log2.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -429,10 +785,11 @@ add_entrypoint_object(
SRCS
log2f.cpp
HDRS
- ../../log2f.h
+ ../log2f.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -440,10 +797,11 @@ add_entrypoint_object(
SRCS
log.cpp
HDRS
- ../../log.h
+ ../log.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -451,10 +809,11 @@ add_entrypoint_object(
SRCS
logf.cpp
HDRS
- ../../logf.h
+ ../logf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -462,10 +821,11 @@ add_entrypoint_object(
SRCS
lrint.cpp
HDRS
- ../../lrint.h
+ ../lrint.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -473,10 +833,11 @@ add_entrypoint_object(
SRCS
lrintf.cpp
HDRS
- ../../lrintf.h
+ ../lrintf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -484,10 +845,11 @@ add_entrypoint_object(
SRCS
ldexp.cpp
HDRS
- ../../ldexp.h
+ ../ldexp.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -495,10 +857,11 @@ add_entrypoint_object(
SRCS
ldexpf.cpp
HDRS
- ../../ldexpf.h
+ ../ldexpf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -506,10 +869,11 @@ add_entrypoint_object(
SRCS
log1p.cpp
HDRS
- ../../log1p.h
+ ../log1p.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -517,10 +881,11 @@ add_entrypoint_object(
SRCS
log1pf.cpp
HDRS
- ../../log1pf.h
+ ../log1pf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -528,10 +893,11 @@ add_entrypoint_object(
SRCS
llrint.cpp
HDRS
- ../../llrint.h
+ ../llrint.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -539,10 +905,11 @@ add_entrypoint_object(
SRCS
llrintf.cpp
HDRS
- ../../llrintf.h
+ ../llrintf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -550,10 +917,11 @@ add_entrypoint_object(
SRCS
remquo.cpp
HDRS
- ../../remquo.h
+ ../remquo.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -561,10 +929,11 @@ add_entrypoint_object(
SRCS
remquof.cpp
HDRS
- ../../remquof.h
+ ../remquof.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -572,10 +941,11 @@ add_entrypoint_object(
SRCS
scalbn.cpp
HDRS
- ../../scalbn.h
+ ../scalbn.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -583,10 +953,11 @@ add_entrypoint_object(
SRCS
scalbnf.cpp
HDRS
- ../../scalbnf.h
+ ../scalbnf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
@@ -595,10 +966,11 @@ add_entrypoint_object(
SRCS
nextafter.cpp
HDRS
- ../../nextafter.h
+ ../nextafter.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -606,10 +978,11 @@ add_entrypoint_object(
SRCS
nextafterf.cpp
HDRS
- ../../nextafterf.h
+ ../nextafterf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -617,10 +990,11 @@ add_entrypoint_object(
SRCS
pow.cpp
HDRS
- ../../pow.h
+ ../pow.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -628,10 +1002,11 @@ add_entrypoint_object(
SRCS
powf.cpp
HDRS
- ../../powf.h
+ ../powf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -639,10 +1014,11 @@ add_entrypoint_object(
SRCS
sin.cpp
HDRS
- ../../sin.h
+ ../sin.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -650,10 +1026,11 @@ add_entrypoint_object(
SRCS
sinf.cpp
HDRS
- ../../sinf.h
+ ../sinf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -661,10 +1038,11 @@ add_entrypoint_object(
SRCS
sincos.cpp
HDRS
- ../../sincos.h
+ ../sincos.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -672,10 +1050,11 @@ add_entrypoint_object(
SRCS
sincosf.cpp
HDRS
- ../../sincosf.h
+ ../sincosf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -683,10 +1062,11 @@ add_entrypoint_object(
SRCS
sinh.cpp
HDRS
- ../../sinh.h
+ ../sinh.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -694,10 +1074,11 @@ add_entrypoint_object(
SRCS
sinhf.cpp
HDRS
- ../../sinhf.h
+ ../sinhf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -705,10 +1086,11 @@ add_entrypoint_object(
SRCS
tan.cpp
HDRS
- ../../tan.h
+ ../tan.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -716,10 +1098,11 @@ add_entrypoint_object(
SRCS
tanf.cpp
HDRS
- ../../tanf.h
+ ../tanf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -727,10 +1110,11 @@ add_entrypoint_object(
SRCS
tanh.cpp
HDRS
- ../../tanh.h
+ ../tanh.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -738,10 +1122,11 @@ add_entrypoint_object(
SRCS
tanhf.cpp
HDRS
- ../../tanhf.h
+ ../tanhf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -749,10 +1134,11 @@ add_entrypoint_object(
SRCS
tgamma.cpp
HDRS
- ../../tgamma.h
+ ../tgamma.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -760,10 +1146,11 @@ add_entrypoint_object(
SRCS
tgammaf.cpp
HDRS
- ../../tgammaf.h
+ ../tgammaf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -771,10 +1158,11 @@ add_entrypoint_object(
SRCS
frexp.cpp
HDRS
- ../../frexp.h
+ ../frexp.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
add_entrypoint_object(
@@ -782,8 +1170,9 @@ add_entrypoint_object(
SRCS
frexpf.cpp
HDRS
- ../../frexpf.h
+ ../frexpf.h
COMPILE_OPTIONS
${bitcode_link_flags}
-O2
+ VENDOR
)
diff --git a/libc/src/math/amdgpu/acos.cpp b/libc/src/math/amdgpu/acos.cpp
new file mode 100644
index 000000000000..b1e30fef82de
--- /dev/null
+++ b/libc/src/math/amdgpu/acos.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU acos function ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/acos.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, acos, (double x)) { return __ocml_acos_f64(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/acosf.cpp b/libc/src/math/amdgpu/acosf.cpp
new file mode 100644
index 000000000000..4c2dd4bcf435
--- /dev/null
+++ b/libc/src/math/amdgpu/acosf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the acosf function for GPU ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/acosf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, acosf, (float x)) { return __ocml_acos_f32(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/acosh.cpp b/libc/src/math/amdgpu/acosh.cpp
new file mode 100644
index 000000000000..dcdeeab29454
--- /dev/null
+++ b/libc/src/math/amdgpu/acosh.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU acosh function --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/acosh.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, acosh, (double x)) { return __ocml_acosh_f64(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/acoshf.cpp b/libc/src/math/amdgpu/acoshf.cpp
new file mode 100644
index 000000000000..52baa2eaecc7
--- /dev/null
+++ b/libc/src/math/amdgpu/acoshf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the acoshf function for GPU ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/acoshf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, acoshf, (float x)) { return __ocml_acosh_f32(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/asin.cpp b/libc/src/math/amdgpu/asin.cpp
new file mode 100644
index 000000000000..835c317112e2
--- /dev/null
+++ b/libc/src/math/amdgpu/asin.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU asin function ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/asin.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, asin, (double x)) { return __ocml_asin_f64(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/asinf.cpp b/libc/src/math/amdgpu/asinf.cpp
new file mode 100644
index 000000000000..72c45d5cf172
--- /dev/null
+++ b/libc/src/math/amdgpu/asinf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the asinf function for GPU ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/asinf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, asinf, (float x)) { return __ocml_asin_f32(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/asinh.cpp b/libc/src/math/amdgpu/asinh.cpp
new file mode 100644
index 000000000000..7a9f7ea4e988
--- /dev/null
+++ b/libc/src/math/amdgpu/asinh.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU asinh function --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/asinh.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, asinh, (double x)) { return __ocml_asinh_f64(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/asinhf.cpp b/libc/src/math/amdgpu/asinhf.cpp
new file mode 100644
index 000000000000..28d6bde5c918
--- /dev/null
+++ b/libc/src/math/amdgpu/asinhf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the asinhf function for GPU ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/asinhf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, asinhf, (float x)) { return __ocml_asinh_f32(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/atan.cpp b/libc/src/math/amdgpu/atan.cpp
new file mode 100644
index 000000000000..a1fa38ba451d
--- /dev/null
+++ b/libc/src/math/amdgpu/atan.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU atan function ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/atan.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, atan, (double x)) { return __ocml_atan_f64(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/atan2.cpp b/libc/src/math/amdgpu/atan2.cpp
new file mode 100644
index 000000000000..9cfdba75eb8d
--- /dev/null
+++ b/libc/src/math/amdgpu/atan2.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of the GPU atan2 function --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/atan2.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, atan2, (double x, double y)) {
+ return __ocml_atan2_f64(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/atan2f.cpp b/libc/src/math/amdgpu/atan2f.cpp
new file mode 100644
index 000000000000..ef56293b4caf
--- /dev/null
+++ b/libc/src/math/amdgpu/atan2f.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of the GPU atan2f function -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/atan2f.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, atan2f, (float x, float y)) {
+ return __ocml_atan2_f32(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/atanf.cpp b/libc/src/math/amdgpu/atanf.cpp
new file mode 100644
index 000000000000..bbcceca3ed09
--- /dev/null
+++ b/libc/src/math/amdgpu/atanf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the atanf function for GPU ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/atanf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, atanf, (float x)) { return __ocml_atan_f32(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/atanh.cpp b/libc/src/math/amdgpu/atanh.cpp
new file mode 100644
index 000000000000..ec462586450b
--- /dev/null
+++ b/libc/src/math/amdgpu/atanh.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU atanh function --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/atanh.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, atanh, (double x)) { return __ocml_atanh_f64(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/atanhf.cpp b/libc/src/math/amdgpu/atanhf.cpp
new file mode 100644
index 000000000000..227269369ab6
--- /dev/null
+++ b/libc/src/math/amdgpu/atanhf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the atanhf function for GPU ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/atanhf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, atanhf, (float x)) { return __ocml_atanh_f32(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/ceil.cpp b/libc/src/math/amdgpu/ceil.cpp
index ad1407d61f62..ad1407d61f62 100644
--- a/libc/src/math/gpu/ceil.cpp
+++ b/libc/src/math/amdgpu/ceil.cpp
diff --git a/libc/src/math/gpu/ceilf.cpp b/libc/src/math/amdgpu/ceilf.cpp
index c4fc58d93603..c4fc58d93603 100644
--- a/libc/src/math/gpu/ceilf.cpp
+++ b/libc/src/math/amdgpu/ceilf.cpp
diff --git a/libc/src/math/gpu/copysign.cpp b/libc/src/math/amdgpu/copysign.cpp
index 6f804bdb90a1..6f804bdb90a1 100644
--- a/libc/src/math/gpu/copysign.cpp
+++ b/libc/src/math/amdgpu/copysign.cpp
diff --git a/libc/src/math/gpu/copysignf.cpp b/libc/src/math/amdgpu/copysignf.cpp
index 4d7e132462ac..4d7e132462ac 100644
--- a/libc/src/math/gpu/copysignf.cpp
+++ b/libc/src/math/amdgpu/copysignf.cpp
diff --git a/libc/src/math/amdgpu/cos.cpp b/libc/src/math/amdgpu/cos.cpp
new file mode 100644
index 000000000000..68239d933780
--- /dev/null
+++ b/libc/src/math/amdgpu/cos.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the cos function for GPU ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/cos.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, cos, (double x)) { return __ocml_cos_f64(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/cosf.cpp b/libc/src/math/amdgpu/cosf.cpp
new file mode 100644
index 000000000000..a60e9ea28907
--- /dev/null
+++ b/libc/src/math/amdgpu/cosf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the cosf function for GPU -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/cosf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, cosf, (float x)) { return __ocml_cos_f32(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/cosh.cpp b/libc/src/math/amdgpu/cosh.cpp
new file mode 100644
index 000000000000..b71df0c0170c
--- /dev/null
+++ b/libc/src/math/amdgpu/cosh.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the cosh function for GPU -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/cosh.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, cosh, (double x)) { return __ocml_cosh_f64(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/coshf.cpp b/libc/src/math/amdgpu/coshf.cpp
new file mode 100644
index 000000000000..699fb0478aee
--- /dev/null
+++ b/libc/src/math/amdgpu/coshf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the coshf function for GPU ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/coshf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, coshf, (float x)) { return __ocml_cosh_f32(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/amdgpu/declarations.h b/libc/src/math/amdgpu/declarations.h
index 7a01cbc6ab19..780d5f0a1140 100644
--- a/libc/src/math/gpu/vendor/amdgpu/declarations.h
+++ b/libc/src/math/amdgpu/declarations.h
@@ -9,6 +9,8 @@
#ifndef LLVM_LIBC_SRC_MATH_GPU_AMDGPU_DECLARATIONS_H
#define LLVM_LIBC_SRC_MATH_GPU_AMDGPU_DECLARATIONS_H
+#include "platform.h"
+
#include "src/__support/GPU/utils.h"
namespace LIBC_NAMESPACE {
diff --git a/libc/src/math/amdgpu/erf.cpp b/libc/src/math/amdgpu/erf.cpp
new file mode 100644
index 000000000000..7a464550c7e0
--- /dev/null
+++ b/libc/src/math/amdgpu/erf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU erf function ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/erf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, erf, (double x)) { return __ocml_erf_f64(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/erff.cpp b/libc/src/math/amdgpu/erff.cpp
new file mode 100644
index 000000000000..1f77d08585a3
--- /dev/null
+++ b/libc/src/math/amdgpu/erff.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU erff function ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/erff.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, erff, (float x)) { return __ocml_erf_f32(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/exp.cpp b/libc/src/math/amdgpu/exp.cpp
index ee5a22019f6a..8590ac759019 100644
--- a/libc/src/math/gpu/vendor/exp.cpp
+++ b/libc/src/math/amdgpu/exp.cpp
@@ -9,10 +9,10 @@
#include "src/math/exp.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, exp, (double x)) { return internal::exp(x); }
+LLVM_LIBC_FUNCTION(double, exp, (double x)) { return __builtin_exp(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/exp10.cpp b/libc/src/math/amdgpu/exp10.cpp
new file mode 100644
index 000000000000..17d8f3350ac2
--- /dev/null
+++ b/libc/src/math/amdgpu/exp10.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU exp10 function --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/exp10.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, exp10, (double x)) { return __ocml_exp10_f64(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/exp10f.cpp b/libc/src/math/amdgpu/exp10f.cpp
new file mode 100644
index 000000000000..ddab555a8fbd
--- /dev/null
+++ b/libc/src/math/amdgpu/exp10f.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the exp10f function for GPU ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/exp10f.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, exp10f, (float x)) { return __ocml_exp10_f32(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/exp2.cpp b/libc/src/math/amdgpu/exp2.cpp
new file mode 100644
index 000000000000..dfbb1f80d129
--- /dev/null
+++ b/libc/src/math/amdgpu/exp2.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU exp2 function ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/exp2.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, exp2, (double x)) { return __ocml_exp2_f64(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/exp2f.cpp b/libc/src/math/amdgpu/exp2f.cpp
new file mode 100644
index 000000000000..016dfe3a6222
--- /dev/null
+++ b/libc/src/math/amdgpu/exp2f.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the exp2f function for GPU ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/exp2f.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, exp2f, (float x)) { return __ocml_exp2_f32(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/expf.cpp b/libc/src/math/amdgpu/expf.cpp
index 89c194e4bc29..d682f6293a6c 100644
--- a/libc/src/math/gpu/vendor/expf.cpp
+++ b/libc/src/math/amdgpu/expf.cpp
@@ -9,10 +9,10 @@
#include "src/math/expf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, expf, (float x)) { return internal::expf(x); }
+LLVM_LIBC_FUNCTION(float, expf, (float x)) { return __builtin_expf(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/expm1.cpp b/libc/src/math/amdgpu/expm1.cpp
new file mode 100644
index 000000000000..d2ac28ae6a3e
--- /dev/null
+++ b/libc/src/math/amdgpu/expm1.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU expm1 function --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/expm1.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, expm1, (double x)) { return __ocml_expm1_f64(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/expm1f.cpp b/libc/src/math/amdgpu/expm1f.cpp
new file mode 100644
index 000000000000..0ffe1a362af1
--- /dev/null
+++ b/libc/src/math/amdgpu/expm1f.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the expm1f function for GPU ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/expm1f.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, expm1f, (float x)) { return __ocml_expm1_f32(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/fabs.cpp b/libc/src/math/amdgpu/fabs.cpp
index c0d063d50ae5..c0d063d50ae5 100644
--- a/libc/src/math/gpu/fabs.cpp
+++ b/libc/src/math/amdgpu/fabs.cpp
diff --git a/libc/src/math/gpu/fabsf.cpp b/libc/src/math/amdgpu/fabsf.cpp
index 398ffd0c74c0..398ffd0c74c0 100644
--- a/libc/src/math/gpu/fabsf.cpp
+++ b/libc/src/math/amdgpu/fabsf.cpp
diff --git a/libc/src/math/amdgpu/fdim.cpp b/libc/src/math/amdgpu/fdim.cpp
new file mode 100644
index 000000000000..f16942dc6193
--- /dev/null
+++ b/libc/src/math/amdgpu/fdim.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of the fdim function for GPU -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fdim.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, fdim, (double x, double y)) {
+ return __ocml_fdim_f64(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/fdimf.cpp b/libc/src/math/amdgpu/fdimf.cpp
new file mode 100644
index 000000000000..eccb441f1455
--- /dev/null
+++ b/libc/src/math/amdgpu/fdimf.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of the fdimf function for GPU ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fdimf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, fdimf, (float x, float y)) {
+ return __ocml_fdim_f32(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/floor.cpp b/libc/src/math/amdgpu/floor.cpp
index eada89c178d7..eada89c178d7 100644
--- a/libc/src/math/gpu/floor.cpp
+++ b/libc/src/math/amdgpu/floor.cpp
diff --git a/libc/src/math/gpu/floorf.cpp b/libc/src/math/amdgpu/floorf.cpp
index a5611c515a88..a5611c515a88 100644
--- a/libc/src/math/gpu/floorf.cpp
+++ b/libc/src/math/amdgpu/floorf.cpp
diff --git a/libc/src/math/gpu/fma.cpp b/libc/src/math/amdgpu/fma.cpp
index 41a6ddf60dbc..41a6ddf60dbc 100644
--- a/libc/src/math/gpu/fma.cpp
+++ b/libc/src/math/amdgpu/fma.cpp
diff --git a/libc/src/math/gpu/fmaf.cpp b/libc/src/math/amdgpu/fmaf.cpp
index c948e32f77eb..c948e32f77eb 100644
--- a/libc/src/math/gpu/fmaf.cpp
+++ b/libc/src/math/amdgpu/fmaf.cpp
diff --git a/libc/src/math/amdgpu/fmax.cpp b/libc/src/math/amdgpu/fmax.cpp
new file mode 100644
index 000000000000..09624cc6f092
--- /dev/null
+++ b/libc/src/math/amdgpu/fmax.cpp
@@ -0,0 +1,25 @@
+//===-- Implementation of the fmax function for GPU -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fmax.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, fmax, (double x, double y)) {
+ // FIXME: The builtin function does not correctly handle the +/-0.0 case.
+ if (LIBC_UNLIKELY(x == y))
+ return cpp::bit_cast<double>(cpp::bit_cast<uint64_t>(x) &
+ cpp::bit_cast<uint64_t>(y));
+ return __builtin_fmax(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/fmaxf.cpp b/libc/src/math/amdgpu/fmaxf.cpp
new file mode 100644
index 000000000000..f6ed46699a04
--- /dev/null
+++ b/libc/src/math/amdgpu/fmaxf.cpp
@@ -0,0 +1,25 @@
+//===-- Implementation of the fmaxf function for GPU ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fmaxf.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, fmaxf, (float x, float y)) {
+ // FIXME: The builtin function does not correctly handle the +/-0.0 case.
+ if (LIBC_UNLIKELY(x == y))
+ return cpp::bit_cast<float>(cpp::bit_cast<uint32_t>(x) &
+ cpp::bit_cast<uint32_t>(y));
+ return __builtin_fmaxf(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/fmin.cpp b/libc/src/math/amdgpu/fmin.cpp
new file mode 100644
index 000000000000..8977ff7a066c
--- /dev/null
+++ b/libc/src/math/amdgpu/fmin.cpp
@@ -0,0 +1,25 @@
+//===-- Implementation of the fmin function for GPU -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fmin.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, fmin, (double x, double y)) {
+ // FIXME: The builtin function does not correctly handle the +/-0.0 case.
+ if (LIBC_UNLIKELY(x == y))
+ return cpp::bit_cast<double>(cpp::bit_cast<uint64_t>(x) |
+ cpp::bit_cast<uint64_t>(y));
+ return __builtin_fmin(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/fminf.cpp b/libc/src/math/amdgpu/fminf.cpp
new file mode 100644
index 000000000000..3be55257f616
--- /dev/null
+++ b/libc/src/math/amdgpu/fminf.cpp
@@ -0,0 +1,25 @@
+//===-- Implementation of the fminf function for GPU ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fminf.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, fminf, (float x, float y)) {
+ // FIXME: The builtin function does not correctly handle the +/-0.0 case.
+ if (LIBC_UNLIKELY(x == y))
+ return cpp::bit_cast<float>(cpp::bit_cast<uint32_t>(x) |
+ cpp::bit_cast<uint32_t>(y));
+ return __builtin_fminf(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/fmod.cpp b/libc/src/math/amdgpu/fmod.cpp
index 0654cdd2abe0..0654cdd2abe0 100644
--- a/libc/src/math/gpu/fmod.cpp
+++ b/libc/src/math/amdgpu/fmod.cpp
diff --git a/libc/src/math/gpu/fmodf.cpp b/libc/src/math/amdgpu/fmodf.cpp
index b689046468fb..b689046468fb 100644
--- a/libc/src/math/gpu/fmodf.cpp
+++ b/libc/src/math/amdgpu/fmodf.cpp
diff --git a/libc/src/math/amdgpu/frexp.cpp b/libc/src/math/amdgpu/frexp.cpp
new file mode 100644
index 000000000000..0acf97342fa0
--- /dev/null
+++ b/libc/src/math/amdgpu/frexp.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of the frexp function for GPU ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/frexp.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, frexp, (double x, int *p)) {
+ return __builtin_frexp(x, p);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/frexpf.cpp b/libc/src/math/amdgpu/frexpf.cpp
new file mode 100644
index 000000000000..d870bf3095b2
--- /dev/null
+++ b/libc/src/math/amdgpu/frexpf.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of the frexpf function for GPU ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/frexpf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, frexpf, (float x, int *p)) {
+ return __builtin_frexpf(x, p);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/hypot.cpp b/libc/src/math/amdgpu/hypot.cpp
new file mode 100644
index 000000000000..ffc13504c8f4
--- /dev/null
+++ b/libc/src/math/amdgpu/hypot.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of the hypot function for GPU ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/hypot.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, hypot, (double x, double y)) {
+ return __ocml_hypot_f64(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/hypotf.cpp b/libc/src/math/amdgpu/hypotf.cpp
new file mode 100644
index 000000000000..811fc540488d
--- /dev/null
+++ b/libc/src/math/amdgpu/hypotf.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of the hypotf function for GPU ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/hypotf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, hypotf, (float x, float y)) {
+ return __ocml_hypot_f32(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/ilogb.cpp b/libc/src/math/amdgpu/ilogb.cpp
new file mode 100644
index 000000000000..4479908d3856
--- /dev/null
+++ b/libc/src/math/amdgpu/ilogb.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the ilogb function for GPU ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/ilogb.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, ilogb, (double x)) { return __ocml_ilogb_f64(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/ilogbf.cpp b/libc/src/math/amdgpu/ilogbf.cpp
new file mode 100644
index 000000000000..cded285c72f3
--- /dev/null
+++ b/libc/src/math/amdgpu/ilogbf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the ilogbf function for GPU ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/ilogbf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, ilogbf, (float x)) { return __ocml_ilogb_f32(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/ldexp.cpp b/libc/src/math/amdgpu/ldexp.cpp
new file mode 100644
index 000000000000..70c5b0d6e555
--- /dev/null
+++ b/libc/src/math/amdgpu/ldexp.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of the ldexp function for GPU ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/ldexp.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, ldexp, (double x, int y)) {
+ return __builtin_ldexp(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/ldexpf.cpp b/libc/src/math/amdgpu/ldexpf.cpp
new file mode 100644
index 000000000000..8dc7c132ee21
--- /dev/null
+++ b/libc/src/math/amdgpu/ldexpf.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of the ldexpf function for GPU ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/ldexpf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, ldexpf, (float x, int y)) {
+ return __builtin_ldexpf(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/llrint.cpp b/libc/src/math/amdgpu/llrint.cpp
index aafd1609002b..307420a9b8b2 100644
--- a/libc/src/math/gpu/vendor/llrint.cpp
+++ b/libc/src/math/amdgpu/llrint.cpp
@@ -9,12 +9,12 @@
#include "src/math/llrint.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
LLVM_LIBC_FUNCTION(long long, llrint, (double x)) {
- return internal::llrint(x);
+ return static_cast<long long>(__builtin_rint(x));
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/llrintf.cpp b/libc/src/math/amdgpu/llrintf.cpp
index 39cd3ad0c021..23404990fb1b 100644
--- a/libc/src/math/gpu/vendor/llrintf.cpp
+++ b/libc/src/math/amdgpu/llrintf.cpp
@@ -9,12 +9,12 @@
#include "src/math/llrintf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
LLVM_LIBC_FUNCTION(long long, llrintf, (float x)) {
- return internal::llrintf(x);
+ return static_cast<long long>(__builtin_rintf(x));
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/llround.cpp b/libc/src/math/amdgpu/llround.cpp
index afd98308730a..afd98308730a 100644
--- a/libc/src/math/gpu/llround.cpp
+++ b/libc/src/math/amdgpu/llround.cpp
diff --git a/libc/src/math/gpu/llroundf.cpp b/libc/src/math/amdgpu/llroundf.cpp
index 897ed15b6928..897ed15b6928 100644
--- a/libc/src/math/gpu/llroundf.cpp
+++ b/libc/src/math/amdgpu/llroundf.cpp
diff --git a/libc/src/math/amdgpu/log.cpp b/libc/src/math/amdgpu/log.cpp
new file mode 100644
index 000000000000..3f2489580356
--- /dev/null
+++ b/libc/src/math/amdgpu/log.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU log function ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/log.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, log, (double x)) { return __ocml_log_f64(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/log10.cpp b/libc/src/math/amdgpu/log10.cpp
new file mode 100644
index 000000000000..d522d5e84291
--- /dev/null
+++ b/libc/src/math/amdgpu/log10.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU log10 function --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/log10.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, log10, (double x)) { return __ocml_log10_f64(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/log10f.cpp b/libc/src/math/amdgpu/log10f.cpp
new file mode 100644
index 000000000000..47b9b162ad9d
--- /dev/null
+++ b/libc/src/math/amdgpu/log10f.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU log10f function -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/log10f.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, log10f, (float x)) { return __ocml_log10_f32(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/log1p.cpp b/libc/src/math/amdgpu/log1p.cpp
new file mode 100644
index 000000000000..fae60e448586
--- /dev/null
+++ b/libc/src/math/amdgpu/log1p.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU log1p function --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/log1p.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, log1p, (double x)) { return __ocml_log1p_f64(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/log1pf.cpp b/libc/src/math/amdgpu/log1pf.cpp
new file mode 100644
index 000000000000..e7b1772158fc
--- /dev/null
+++ b/libc/src/math/amdgpu/log1pf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU log1pf function -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/log1pf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, log1pf, (float x)) { return __ocml_log1p_f32(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/log2.cpp b/libc/src/math/amdgpu/log2.cpp
new file mode 100644
index 000000000000..9d84f62dff6f
--- /dev/null
+++ b/libc/src/math/amdgpu/log2.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU log2 function ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/log2.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, log2, (double x)) { return __ocml_log2_f64(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/log2f.cpp b/libc/src/math/amdgpu/log2f.cpp
new file mode 100644
index 000000000000..7742a6141c37
--- /dev/null
+++ b/libc/src/math/amdgpu/log2f.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU log2f function --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/log2f.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, log2f, (float x)) { return __ocml_log2_f32(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/logb.cpp b/libc/src/math/amdgpu/logb.cpp
new file mode 100644
index 000000000000..1344fbb182af
--- /dev/null
+++ b/libc/src/math/amdgpu/logb.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU logb function ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/logb.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, logb, (double x)) { return __ocml_logb_f64(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/logbf.cpp b/libc/src/math/amdgpu/logbf.cpp
new file mode 100644
index 000000000000..fdb493fe28da
--- /dev/null
+++ b/libc/src/math/amdgpu/logbf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU logbf function --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/logbf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, logbf, (float x)) { return __ocml_logb_f32(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/logf.cpp b/libc/src/math/amdgpu/logf.cpp
new file mode 100644
index 000000000000..d4d4b265ee5c
--- /dev/null
+++ b/libc/src/math/amdgpu/logf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU logf function ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/logf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, logf, (float x)) { return __ocml_log_f32(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/lrint.cpp b/libc/src/math/amdgpu/lrint.cpp
new file mode 100644
index 000000000000..b335b4f06393
--- /dev/null
+++ b/libc/src/math/amdgpu/lrint.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of the lrint function for GPU ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/lrint.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(long, lrint, (double x)) {
+ return static_cast<long>(__builtin_rint(x));
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/lrintf.cpp b/libc/src/math/amdgpu/lrintf.cpp
new file mode 100644
index 000000000000..7959e76728a4
--- /dev/null
+++ b/libc/src/math/amdgpu/lrintf.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of the lrintf function for GPU ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/lrintf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(long, lrintf, (float x)) {
+ return static_cast<long>(__builtin_rintf(x));
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/lround.cpp b/libc/src/math/amdgpu/lround.cpp
index 51e8f2245af8..51e8f2245af8 100644
--- a/libc/src/math/gpu/lround.cpp
+++ b/libc/src/math/amdgpu/lround.cpp
diff --git a/libc/src/math/gpu/lroundf.cpp b/libc/src/math/amdgpu/lroundf.cpp
index 2a6fe7200d8c..2a6fe7200d8c 100644
--- a/libc/src/math/gpu/lroundf.cpp
+++ b/libc/src/math/amdgpu/lroundf.cpp
diff --git a/libc/src/math/gpu/modf.cpp b/libc/src/math/amdgpu/modf.cpp
index 07dbbd6059c3..07dbbd6059c3 100644
--- a/libc/src/math/gpu/modf.cpp
+++ b/libc/src/math/amdgpu/modf.cpp
diff --git a/libc/src/math/gpu/modff.cpp b/libc/src/math/amdgpu/modff.cpp
index ad35f9006b51..ad35f9006b51 100644
--- a/libc/src/math/gpu/modff.cpp
+++ b/libc/src/math/amdgpu/modff.cpp
diff --git a/libc/src/math/gpu/nearbyint.cpp b/libc/src/math/amdgpu/nearbyint.cpp
index 9c7b600df708..9c7b600df708 100644
--- a/libc/src/math/gpu/nearbyint.cpp
+++ b/libc/src/math/amdgpu/nearbyint.cpp
diff --git a/libc/src/math/gpu/nearbyintf.cpp b/libc/src/math/amdgpu/nearbyintf.cpp
index 7fbe9f4f0e0b..7fbe9f4f0e0b 100644
--- a/libc/src/math/gpu/nearbyintf.cpp
+++ b/libc/src/math/amdgpu/nearbyintf.cpp
diff --git a/libc/src/math/amdgpu/nextafter.cpp b/libc/src/math/amdgpu/nextafter.cpp
new file mode 100644
index 000000000000..5c74ef165268
--- /dev/null
+++ b/libc/src/math/amdgpu/nextafter.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of the nextafter function for GPU ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nextafter.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, nextafter, (double x, double y)) {
+ return __ocml_nextafter_f64(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/nextafterf.cpp b/libc/src/math/amdgpu/nextafterf.cpp
new file mode 100644
index 000000000000..a97b990a61fc
--- /dev/null
+++ b/libc/src/math/amdgpu/nextafterf.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of the nextafterf function for GPU -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nextafterf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, nextafterf, (float x, float y)) {
+ return __ocml_nextafter_f32(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/amdgpu/platform.h b/libc/src/math/amdgpu/platform.h
index e5a9f810cd10..e5a9f810cd10 100644
--- a/libc/src/math/gpu/vendor/amdgpu/platform.h
+++ b/libc/src/math/amdgpu/platform.h
diff --git a/libc/src/math/gpu/vendor/pow.cpp b/libc/src/math/amdgpu/pow.cpp
index d49f2610a691..e5056f67292d 100644
--- a/libc/src/math/gpu/vendor/pow.cpp
+++ b/libc/src/math/amdgpu/pow.cpp
@@ -9,12 +9,12 @@
#include "src/math/pow.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
LLVM_LIBC_FUNCTION(double, pow, (double x, double y)) {
- return internal::pow(x, y);
+ return __ocml_pow_f64(x, y);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/powf.cpp b/libc/src/math/amdgpu/powf.cpp
index 37e02d252b74..6114bcc642e1 100644
--- a/libc/src/math/gpu/vendor/powf.cpp
+++ b/libc/src/math/amdgpu/powf.cpp
@@ -9,12 +9,12 @@
#include "src/math/powf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) {
- return internal::powf(x, y);
+ return __ocml_pow_f32(x, y);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/remainder.cpp b/libc/src/math/amdgpu/remainder.cpp
index 89b235f9c22a..89b235f9c22a 100644
--- a/libc/src/math/gpu/remainder.cpp
+++ b/libc/src/math/amdgpu/remainder.cpp
diff --git a/libc/src/math/gpu/remainderf.cpp b/libc/src/math/amdgpu/remainderf.cpp
index 9fee6f856dc8..9fee6f856dc8 100644
--- a/libc/src/math/gpu/remainderf.cpp
+++ b/libc/src/math/amdgpu/remainderf.cpp
diff --git a/libc/src/math/amdgpu/remquo.cpp b/libc/src/math/amdgpu/remquo.cpp
new file mode 100644
index 000000000000..d8074a9626ec
--- /dev/null
+++ b/libc/src/math/amdgpu/remquo.cpp
@@ -0,0 +1,23 @@
+//===-- Implementation of the GPU remquo function -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/remquo.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, remquo, (double x, double y, int *quo)) {
+ int tmp;
+ double r = __ocml_remquo_f64(x, y, (gpu::Private<int> *)&tmp);
+ *quo = tmp;
+ return r;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/remquof.cpp b/libc/src/math/amdgpu/remquof.cpp
new file mode 100644
index 000000000000..b6584dfb97c3
--- /dev/null
+++ b/libc/src/math/amdgpu/remquof.cpp
@@ -0,0 +1,23 @@
+//===-- Implementation of the GPU remquof function ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/remquof.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, remquof, (float x, float y, int *quo)) {
+ int tmp;
+ float r = __ocml_remquo_f32(x, y, (gpu::Private<int> *)&tmp);
+ *quo = tmp;
+ return r;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/rint.cpp b/libc/src/math/amdgpu/rint.cpp
index 44d494a8ed57..44d494a8ed57 100644
--- a/libc/src/math/gpu/rint.cpp
+++ b/libc/src/math/amdgpu/rint.cpp
diff --git a/libc/src/math/gpu/rintf.cpp b/libc/src/math/amdgpu/rintf.cpp
index daf98d943605..daf98d943605 100644
--- a/libc/src/math/gpu/rintf.cpp
+++ b/libc/src/math/amdgpu/rintf.cpp
diff --git a/libc/src/math/gpu/round.cpp b/libc/src/math/amdgpu/round.cpp
index 9d8b5582f040..9d8b5582f040 100644
--- a/libc/src/math/gpu/round.cpp
+++ b/libc/src/math/amdgpu/round.cpp
diff --git a/libc/src/math/gpu/roundf.cpp b/libc/src/math/amdgpu/roundf.cpp
index 8743e4eb7fb8..8743e4eb7fb8 100644
--- a/libc/src/math/gpu/roundf.cpp
+++ b/libc/src/math/amdgpu/roundf.cpp
diff --git a/libc/src/math/amdgpu/scalbn.cpp b/libc/src/math/amdgpu/scalbn.cpp
new file mode 100644
index 000000000000..c2a43e03a7bc
--- /dev/null
+++ b/libc/src/math/amdgpu/scalbn.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of the GPU scalbn function -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/scalbn.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, scalbn, (double x, int y)) {
+ return __builtin_amdgcn_ldexp(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/scalbnf.cpp b/libc/src/math/amdgpu/scalbnf.cpp
new file mode 100644
index 000000000000..63de26ccbc47
--- /dev/null
+++ b/libc/src/math/amdgpu/scalbnf.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of the GPU scalbnf function ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/scalbnf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, scalbnf, (float x, int y)) {
+ return __builtin_amdgcn_ldexpf(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/sin.cpp b/libc/src/math/amdgpu/sin.cpp
new file mode 100644
index 000000000000..dbc29a725db0
--- /dev/null
+++ b/libc/src/math/amdgpu/sin.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the sin function for GPU ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/sin.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, sin, (double x)) { return __ocml_sin_f64(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/sincos.cpp b/libc/src/math/amdgpu/sincos.cpp
new file mode 100644
index 000000000000..7cdd0d1f9769
--- /dev/null
+++ b/libc/src/math/amdgpu/sincos.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of the sincos function for GPU ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/sincos.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(void, sincos, (double x, double *sinptr, double *cosptr)) {
+ *sinptr = __ocml_sincos_f64(x, cosptr);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/sincosf.cpp b/libc/src/math/amdgpu/sincosf.cpp
index 17892ceb107d..37a5e2a6d11c 100644
--- a/libc/src/math/gpu/vendor/sincosf.cpp
+++ b/libc/src/math/amdgpu/sincosf.cpp
@@ -9,12 +9,12 @@
#include "src/math/sincosf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
LLVM_LIBC_FUNCTION(void, sincosf, (float x, float *sinptr, float *cosptr)) {
- return internal::sincosf(x, sinptr, cosptr);
+ *sinptr = __ocml_sincos_f32(x, cosptr);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/sinf.cpp b/libc/src/math/amdgpu/sinf.cpp
new file mode 100644
index 000000000000..cda2c626f8ac
--- /dev/null
+++ b/libc/src/math/amdgpu/sinf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the sinf function for GPU -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/sinf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, sinf, (float x)) { return __ocml_sin_f32(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/sinh.cpp b/libc/src/math/amdgpu/sinh.cpp
index 054e046f2abd..66cacd19e463 100644
--- a/libc/src/math/gpu/sinh.cpp
+++ b/libc/src/math/amdgpu/sinh.cpp
@@ -1,4 +1,4 @@
-//===-- Implementation of the GPU sinh function ---------------------------===//
+//===-- Implementation of the sinh function for GPU -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -9,8 +9,10 @@
#include "src/math/sinh.h"
#include "src/__support/common.h"
+#include "declarations.h"
+
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, sinh, (double x)) { return __builtin_sinh(x); }
+LLVM_LIBC_FUNCTION(double, sinh, (double x)) { return __ocml_sinh_f64(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/sinhf.cpp b/libc/src/math/amdgpu/sinhf.cpp
new file mode 100644
index 000000000000..5d3f5ea6b36a
--- /dev/null
+++ b/libc/src/math/amdgpu/sinhf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the sinhf function for GPU ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/sinhf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, sinhf, (float x)) { return __ocml_sinh_f32(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/sqrt.cpp b/libc/src/math/amdgpu/sqrt.cpp
index 60ca5af4987b..60ca5af4987b 100644
--- a/libc/src/math/gpu/sqrt.cpp
+++ b/libc/src/math/amdgpu/sqrt.cpp
diff --git a/libc/src/math/gpu/sqrtf.cpp b/libc/src/math/amdgpu/sqrtf.cpp
index e17f942a4d5f..e17f942a4d5f 100644
--- a/libc/src/math/gpu/sqrtf.cpp
+++ b/libc/src/math/amdgpu/sqrtf.cpp
diff --git a/libc/src/math/gpu/tan.cpp b/libc/src/math/amdgpu/tan.cpp
index d02b10635612..6121a9319a2a 100644
--- a/libc/src/math/gpu/tan.cpp
+++ b/libc/src/math/amdgpu/tan.cpp
@@ -1,4 +1,4 @@
-//===-- Implementation of the GPU tan function ----------------------------===//
+//===-- Implementation of the tan function for GPU ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -9,8 +9,10 @@
#include "src/math/tan.h"
#include "src/__support/common.h"
+#include "declarations.h"
+
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, tan, (double x)) { return __builtin_tan(x); }
+LLVM_LIBC_FUNCTION(double, tan, (double x)) { return __ocml_tan_f64(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/tanf.cpp b/libc/src/math/amdgpu/tanf.cpp
new file mode 100644
index 000000000000..fdd83ee7aeb9
--- /dev/null
+++ b/libc/src/math/amdgpu/tanf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the tanf function for GPU -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/tanf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, tanf, (float x)) { return __ocml_tan_f32(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/tanh.cpp b/libc/src/math/amdgpu/tanh.cpp
index 778e883acba0..25a9c2954bf3 100644
--- a/libc/src/math/gpu/tanh.cpp
+++ b/libc/src/math/amdgpu/tanh.cpp
@@ -1,4 +1,4 @@
-//===-- Implementation of the GPU tanh function ---------------------------===//
+//===-- Implementation of the tanh function for GPU -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -9,8 +9,10 @@
#include "src/math/tanh.h"
#include "src/__support/common.h"
+#include "declarations.h"
+
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, tanh, (double x)) { return __builtin_tanh(x); }
+LLVM_LIBC_FUNCTION(double, tanh, (double x)) { return __ocml_tanh_f64(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/tanhf.cpp b/libc/src/math/amdgpu/tanhf.cpp
new file mode 100644
index 000000000000..a4bfd2095eb3
--- /dev/null
+++ b/libc/src/math/amdgpu/tanhf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the tanhf function for GPU ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/tanhf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, tanhf, (float x)) { return __ocml_tanh_f32(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/tgamma.cpp b/libc/src/math/amdgpu/tgamma.cpp
new file mode 100644
index 000000000000..10f58d566f88
--- /dev/null
+++ b/libc/src/math/amdgpu/tgamma.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU tgamma function -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/tgamma.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, tgamma, (double x)) { return __ocml_tgamma_f64(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/amdgpu/tgammaf.cpp b/libc/src/math/amdgpu/tgammaf.cpp
new file mode 100644
index 000000000000..e7d22059a7c4
--- /dev/null
+++ b/libc/src/math/amdgpu/tgammaf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU tgammaf function ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/tgammaf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, tgammaf, (float x)) { return __ocml_tgamma_f32(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/trunc.cpp b/libc/src/math/amdgpu/trunc.cpp
index 773600f0f250..773600f0f250 100644
--- a/libc/src/math/gpu/trunc.cpp
+++ b/libc/src/math/amdgpu/trunc.cpp
diff --git a/libc/src/math/gpu/truncf.cpp b/libc/src/math/amdgpu/truncf.cpp
index 534797a3e586..534797a3e586 100644
--- a/libc/src/math/gpu/truncf.cpp
+++ b/libc/src/math/amdgpu/truncf.cpp
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 2ef13168bce8..120ada8202ab 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -969,10 +969,10 @@ add_entrypoint_object(
ilogb.cpp
HDRS
../ilogb.h
+ COMPILE_OPTIONS
+ -O3
DEPENDS
libc.src.__support.FPUtil.manipulation_functions
- COMPILE_OPTIONS
- -O2
)
add_entrypoint_object(
@@ -981,10 +981,10 @@ add_entrypoint_object(
ilogbf.cpp
HDRS
../ilogbf.h
+ COMPILE_OPTIONS
+ -O3
DEPENDS
libc.src.__support.FPUtil.manipulation_functions
- COMPILE_OPTIONS
- -O2
)
add_entrypoint_object(
@@ -993,10 +993,72 @@ add_entrypoint_object(
ilogbl.cpp
HDRS
../ilogbl.h
+ COMPILE_OPTIONS
+ -O3
DEPENDS
libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_entrypoint_object(
+ ilogbf128
+ SRCS
+ ilogbf128.cpp
+ HDRS
+ ../ilogbf128.h
COMPILE_OPTIONS
- -O2
+ -O3
+ DEPENDS
+ libc.src.__support.macros.properties.float
+ libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_entrypoint_object(
+ llogb
+ SRCS
+ llogb.cpp
+ HDRS
+ ../llogb.h
+ COMPILE_OPTIONS
+ -O3
+ DEPENDS
+ libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_entrypoint_object(
+ llogbf
+ SRCS
+ llogbf.cpp
+ HDRS
+ ../llogbf.h
+ COMPILE_OPTIONS
+ -O3
+ DEPENDS
+ libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_entrypoint_object(
+ llogbl
+ SRCS
+ llogbl.cpp
+ HDRS
+ ../llogbl.h
+ COMPILE_OPTIONS
+ -O3
+ DEPENDS
+ libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_entrypoint_object(
+ llogbf128
+ SRCS
+ llogbf128.cpp
+ HDRS
+ ../llogbf128.h
+ COMPILE_OPTIONS
+ -O3
+ DEPENDS
+ libc.src.__support.macros.properties.float
+ libc.src.__support.FPUtil.manipulation_functions
)
add_entrypoint_object(
@@ -1044,8 +1106,8 @@ add_entrypoint_object(
COMPILE_OPTIONS
-O3
DEPENDS
- libc.src.__support.macros.properties.float
- libc.src.__support.FPUtil.manipulation_functions
+ libc.src.__support.macros.properties.float
+ libc.src.__support.FPUtil.manipulation_functions
)
add_object_library(
@@ -1229,10 +1291,10 @@ add_entrypoint_object(
logb.cpp
HDRS
../logb.h
+ COMPILE_OPTIONS
+ -O3
DEPENDS
libc.src.__support.FPUtil.manipulation_functions
- COMPILE_OPTIONS
- -O2
)
add_entrypoint_object(
@@ -1241,10 +1303,10 @@ add_entrypoint_object(
logbf.cpp
HDRS
../logbf.h
+ COMPILE_OPTIONS
+ -O3
DEPENDS
libc.src.__support.FPUtil.manipulation_functions
- COMPILE_OPTIONS
- -O2
)
add_entrypoint_object(
@@ -1253,10 +1315,22 @@ add_entrypoint_object(
logbl.cpp
HDRS
../logbl.h
+ COMPILE_OPTIONS
+ -O3
DEPENDS
libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_entrypoint_object(
+ logbf128
+ SRCS
+ logbf128.cpp
+ HDRS
+ ../logbf128.h
COMPILE_OPTIONS
- -O2
+ -O3
+ DEPENDS
+ libc.src.__support.FPUtil.manipulation_functions
)
add_entrypoint_object(
diff --git a/libc/src/math/generic/ilogb.cpp b/libc/src/math/generic/ilogb.cpp
index 4e5f7d9642b4..7e4f66970c5d 100644
--- a/libc/src/math/generic/ilogb.cpp
+++ b/libc/src/math/generic/ilogb.cpp
@@ -12,6 +12,6 @@
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(int, ilogb, (double x)) { return fputil::ilogb(x); }
+LLVM_LIBC_FUNCTION(int, ilogb, (double x)) { return fputil::intlogb<int>(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/ilogbf.cpp b/libc/src/math/generic/ilogbf.cpp
index ca15879bc25f..422788cec9e0 100644
--- a/libc/src/math/generic/ilogbf.cpp
+++ b/libc/src/math/generic/ilogbf.cpp
@@ -12,6 +12,6 @@
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(int, ilogbf, (float x)) { return fputil::ilogb(x); }
+LLVM_LIBC_FUNCTION(int, ilogbf, (float x)) { return fputil::intlogb<int>(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/ilogbf128.cpp b/libc/src/math/generic/ilogbf128.cpp
new file mode 100644
index 000000000000..4049eccc5f36
--- /dev/null
+++ b/libc/src/math/generic/ilogbf128.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of ilogbf128 function ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/ilogbf128.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, ilogbf128, (float128 x)) {
+ return fputil::intlogb<int>(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/ilogbl.cpp b/libc/src/math/generic/ilogbl.cpp
index 4c18daab1a53..b7f7eb40c441 100644
--- a/libc/src/math/generic/ilogbl.cpp
+++ b/libc/src/math/generic/ilogbl.cpp
@@ -12,6 +12,8 @@
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(int, ilogbl, (long double x)) { return fputil::ilogb(x); }
+LLVM_LIBC_FUNCTION(int, ilogbl, (long double x)) {
+ return fputil::intlogb<int>(x);
+}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/llogb.cpp b/libc/src/math/generic/llogb.cpp
new file mode 100644
index 000000000000..917bc38c0379
--- /dev/null
+++ b/libc/src/math/generic/llogb.cpp
@@ -0,0 +1,17 @@
+//===-- Implementation of llogb function ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/llogb.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(long, llogb, (double x)) { return fputil::intlogb<long>(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/llogbf.cpp b/libc/src/math/generic/llogbf.cpp
new file mode 100644
index 000000000000..ca1c03db5c2e
--- /dev/null
+++ b/libc/src/math/generic/llogbf.cpp
@@ -0,0 +1,17 @@
+//===-- Implementation of llogbf function ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/llogbf.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(long, llogbf, (float x)) { return fputil::intlogb<long>(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/llogbf128.cpp b/libc/src/math/generic/llogbf128.cpp
new file mode 100644
index 000000000000..5ae4af302110
--- /dev/null
+++ b/libc/src/math/generic/llogbf128.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of llogbf128 function ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/llogbf128.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(long, llogbf128, (float128 x)) {
+ return fputil::intlogb<long>(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/llogbl.cpp b/libc/src/math/generic/llogbl.cpp
new file mode 100644
index 000000000000..a092997b9244
--- /dev/null
+++ b/libc/src/math/generic/llogbl.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of llogbl function ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/llogbl.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(long, llogbl, (long double x)) {
+ return fputil::intlogb<long>(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/logbf.cpp b/libc/src/math/generic/logbf.cpp
index 78aa33ebbf4a..9f9f7fbcfbb8 100644
--- a/libc/src/math/generic/logbf.cpp
+++ b/libc/src/math/generic/logbf.cpp
@@ -1,4 +1,4 @@
-//===-- Implementation of logbf function ---------------------------------===//
+//===-- Implementation of logbf function ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/libc/src/math/generic/logbf128.cpp b/libc/src/math/generic/logbf128.cpp
new file mode 100644
index 000000000000..090433d1fb86
--- /dev/null
+++ b/libc/src/math/generic/logbf128.cpp
@@ -0,0 +1,17 @@
+//===-- Implementation of logbf128 function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/logbf128.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float128, logbf128, (float128 x)) { return fputil::logb(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/CMakeLists.txt b/libc/src/math/gpu/CMakeLists.txt
deleted file mode 100644
index 75a916e2a011..000000000000
--- a/libc/src/math/gpu/CMakeLists.txt
+++ /dev/null
@@ -1,384 +0,0 @@
-# Math functions not yet available in the libc project, or those not yet tuned
-# for GPU workloads are provided as wrappers over vendor libraries. If we find
-# them ahead of time we will import them statically. Otherwise, we will keep
-# them as external references and expect them to be resolved by the user when
-# they compile. In the future,we will use implementations from the 'libc'
-# project and not provide these wrappers.
-add_subdirectory(vendor)
-
-# For the GPU we want to be able to optionally depend on the vendor libraries
-# until we have a suitable replacement inside `libc`.
-# TODO: We should have an option to enable or disable these on a per-function
-# basis.
-option(LIBC_GPU_VENDOR_MATH "Use vendor wrappers for GPU math" ON)
-function(add_math_entrypoint_gpu_object name)
- get_fq_target_name("vendor.${name}" fq_vendor_specific_target_name)
- if(TARGET ${fq_vendor_specific_target_name} AND ${LIBC_GPU_VENDOR_MATH})
- return()
- endif()
-
- add_entrypoint_object(
- ${name}
- ${ARGN}
- )
-endfunction()
-
-add_math_entrypoint_gpu_object(
- ceil
- SRCS
- ceil.cpp
- HDRS
- ../ceil.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- ceilf
- SRCS
- ceilf.cpp
- HDRS
- ../ceilf.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- copysign
- SRCS
- copysign.cpp
- HDRS
- ../copysign.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- copysignf
- SRCS
- copysignf.cpp
- HDRS
- ../copysignf.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- fabs
- SRCS
- fabs.cpp
- HDRS
- ../fabs.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- fabsf
- SRCS
- fabsf.cpp
- HDRS
- ../fabsf.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- floor
- SRCS
- floor.cpp
- HDRS
- ../floor.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- floorf
- SRCS
- floorf.cpp
- HDRS
- ../floorf.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- fma
- SRCS
- fma.cpp
- HDRS
- ../fma.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- fmaf
- SRCS
- fmaf.cpp
- HDRS
- ../fmaf.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- fmax
- SRCS
- fmax.cpp
- HDRS
- ../fmax.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- fmaxf
- SRCS
- fmaxf.cpp
- HDRS
- ../fmaxf.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- fmin
- SRCS
- fmin.cpp
- HDRS
- ../fmin.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- fminf
- SRCS
- fminf.cpp
- HDRS
- ../fminf.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- fmod
- SRCS
- fmod.cpp
- HDRS
- ../fmod.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- fmodf
- SRCS
- fmodf.cpp
- HDRS
- ../fmodf.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- lround
- SRCS
- lround.cpp
- HDRS
- ../lround.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- lroundf
- SRCS
- lroundf.cpp
- HDRS
- ../lroundf.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- llround
- SRCS
- llround.cpp
- HDRS
- ../llround.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- llroundf
- SRCS
- llroundf.cpp
- HDRS
- ../llroundf.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- modf
- SRCS
- modf.cpp
- HDRS
- ../modf.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- modff
- SRCS
- modff.cpp
- HDRS
- ../modff.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- nearbyint
- SRCS
- nearbyint.cpp
- HDRS
- ../nearbyint.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- nearbyintf
- SRCS
- nearbyintf.cpp
- HDRS
- ../nearbyintf.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- remainder
- SRCS
- remainder.cpp
- HDRS
- ../remainder.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- remainderf
- SRCS
- remainderf.cpp
- HDRS
- ../remainderf.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- rint
- SRCS
- rint.cpp
- HDRS
- ../rint.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- rintf
- SRCS
- rintf.cpp
- HDRS
- ../rintf.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- round
- SRCS
- round.cpp
- HDRS
- ../round.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- sinh
- SRCS
- sinh.cpp
- HDRS
- ../sinh.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- sqrt
- SRCS
- sqrt.cpp
- HDRS
- ../sqrt.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- sqrtf
- SRCS
- sqrtf.cpp
- HDRS
- ../sqrtf.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- tan
- SRCS
- tan.cpp
- HDRS
- ../tan.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- tanh
- SRCS
- tanh.cpp
- HDRS
- ../tanh.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- trunc
- SRCS
- trunc.cpp
- HDRS
- ../trunc.h
- COMPILE_OPTIONS
- -O2
-)
-
-add_math_entrypoint_gpu_object(
- truncf
- SRCS
- truncf.cpp
- HDRS
- ../truncf.h
- COMPILE_OPTIONS
- -O2
-)
diff --git a/libc/src/math/gpu/vendor/amdgpu/amdgpu.h b/libc/src/math/gpu/vendor/amdgpu/amdgpu.h
deleted file mode 100644
index 43961fc75982..000000000000
--- a/libc/src/math/gpu/vendor/amdgpu/amdgpu.h
+++ /dev/null
@@ -1,127 +0,0 @@
-//===-- AMDGPU specific definitions for math support ----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC_MATH_GPU_AMDGPU_H
-#define LLVM_LIBC_SRC_MATH_GPU_AMDGPU_H
-
-#include "declarations.h"
-#include "platform.h"
-
-#include "src/__support/macros/attributes.h"
-
-namespace LIBC_NAMESPACE {
-namespace internal {
-LIBC_INLINE double acos(double x) { return __ocml_acos_f64(x); }
-LIBC_INLINE float acosf(float x) { return __ocml_acos_f32(x); }
-LIBC_INLINE double acosh(double x) { return __ocml_acosh_f64(x); }
-LIBC_INLINE float acoshf(float x) { return __ocml_acosh_f32(x); }
-LIBC_INLINE double asin(double x) { return __ocml_asin_f64(x); }
-LIBC_INLINE float asinf(float x) { return __ocml_asin_f32(x); }
-LIBC_INLINE double asinh(double x) { return __ocml_asinh_f64(x); }
-LIBC_INLINE float asinhf(float x) { return __ocml_asinh_f32(x); }
-LIBC_INLINE double atan(double x) { return __ocml_atan_f64(x); }
-LIBC_INLINE float atanf(float x) { return __ocml_atan_f32(x); }
-LIBC_INLINE double atan2(double x, double y) { return __ocml_atan2_f64(x, y); }
-LIBC_INLINE float atan2f(float x, float y) { return __ocml_atan2_f32(x, y); }
-LIBC_INLINE double atanh(double x) { return __ocml_atanh_f64(x); }
-LIBC_INLINE float atanhf(float x) { return __ocml_atanh_f32(x); }
-LIBC_INLINE double cos(double x) { return __ocml_cos_f64(x); }
-LIBC_INLINE float cosf(float x) { return __ocml_cos_f32(x); }
-LIBC_INLINE double cosh(double x) { return __ocml_cosh_f64(x); }
-LIBC_INLINE float coshf(float x) { return __ocml_cosh_f32(x); }
-LIBC_INLINE double erf(double x) { return __ocml_erf_f64(x); }
-LIBC_INLINE float erff(float x) { return __ocml_erf_f32(x); }
-LIBC_INLINE double exp(double x) { return __builtin_exp(x); }
-LIBC_INLINE float expf(float x) { return __builtin_expf(x); }
-LIBC_INLINE double exp2(double x) { return __ocml_exp2_f64(x); }
-LIBC_INLINE float exp2f(float x) { return __ocml_exp2_f32(x); }
-LIBC_INLINE double exp10(double x) { return __ocml_exp10_f64(x); }
-LIBC_INLINE float exp10f(float x) { return __ocml_exp10_f32(x); }
-LIBC_INLINE double expm1(double x) { return __ocml_expm1_f64(x); }
-LIBC_INLINE float expm1f(float x) { return __ocml_expm1_f32(x); }
-LIBC_INLINE double fdim(double x, double y) { return __ocml_fdim_f64(x, y); }
-LIBC_INLINE float fdimf(float x, float y) { return __ocml_fdim_f32(x, y); }
-LIBC_INLINE double hypot(double x, double y) { return __ocml_hypot_f64(x, y); }
-LIBC_INLINE float hypotf(float x, float y) { return __ocml_hypot_f32(x, y); }
-LIBC_INLINE int ilogb(double x) { return __ocml_ilogb_f64(x); }
-LIBC_INLINE int ilogbf(float x) { return __ocml_ilogb_f32(x); }
-LIBC_INLINE double ldexp(double x, int i) { return __builtin_ldexp(x, i); }
-LIBC_INLINE float ldexpf(float x, int i) { return __builtin_ldexpf(x, i); }
-LIBC_INLINE long long llrint(double x) {
- return static_cast<long long>(__builtin_rint(x));
-}
-LIBC_INLINE long long llrintf(float x) {
- return static_cast<long long>(__builtin_rintf(x));
-}
-LIBC_INLINE double log10(double x) { return __ocml_log10_f64(x); }
-LIBC_INLINE float log10f(float x) { return __ocml_log10_f32(x); }
-LIBC_INLINE double log1p(double x) { return __ocml_log1p_f64(x); }
-LIBC_INLINE float log1pf(float x) { return __ocml_log1p_f32(x); }
-LIBC_INLINE double log2(double x) { return __ocml_log2_f64(x); }
-LIBC_INLINE float log2f(float x) { return __ocml_log2_f32(x); }
-LIBC_INLINE double log(double x) { return __ocml_log_f64(x); }
-LIBC_INLINE float logf(float x) { return __ocml_log_f32(x); }
-LIBC_INLINE long lrint(double x) {
- return static_cast<long>(__builtin_rint(x));
-}
-LIBC_INLINE long lrintf(float x) {
- return static_cast<long>(__builtin_rintf(x));
-}
-LIBC_INLINE double nextafter(double x, double y) {
- return __ocml_nextafter_f64(x, y);
-}
-LIBC_INLINE float nextafterf(float x, float y) {
- return __ocml_nextafter_f32(x, y);
-}
-LIBC_INLINE double pow(double x, double y) { return __ocml_pow_f64(x, y); }
-LIBC_INLINE float powf(float x, float y) { return __ocml_pow_f32(x, y); }
-LIBC_INLINE double sin(double x) { return __ocml_sin_f64(x); }
-LIBC_INLINE float sinf(float x) { return __ocml_sin_f32(x); }
-LIBC_INLINE void sincos(double x, double *sinptr, double *cosptr) {
- *sinptr = __ocml_sincos_f64(x, cosptr);
-}
-LIBC_INLINE void sincosf(float x, float *sinptr, float *cosptr) {
- *sinptr = __ocml_sincos_f32(x, cosptr);
-}
-LIBC_INLINE double sinh(double x) { return __ocml_sinh_f64(x); }
-LIBC_INLINE float sinhf(float x) { return __ocml_sinh_f32(x); }
-LIBC_INLINE double tan(double x) { return __ocml_tan_f64(x); }
-LIBC_INLINE float tanf(float x) { return __ocml_tan_f32(x); }
-LIBC_INLINE double tanh(double x) { return __ocml_tanh_f64(x); }
-LIBC_INLINE float tanhf(float x) { return __ocml_tanh_f32(x); }
-LIBC_INLINE double scalbn(double x, int i) {
- return __builtin_amdgcn_ldexp(x, i);
-}
-LIBC_INLINE float scalbnf(float x, int i) {
- return __builtin_amdgcn_ldexpf(x, i);
-}
-LIBC_INLINE double frexp(double x, int *nptr) {
- return __builtin_frexp(x, nptr);
-}
-LIBC_INLINE float frexpf(float x, int *nptr) {
- return __builtin_frexpf(x, nptr);
-}
-LIBC_INLINE double remquo(double x, double y, int *q) {
- int tmp;
- double r = __ocml_remquo_f64(x, y, (gpu::Private<int> *)&tmp);
- *q = tmp;
- return r;
-}
-LIBC_INLINE float remquof(float x, float y, int *q) {
- int tmp;
- float r = __ocml_remquo_f32(x, y, (gpu::Private<int> *)&tmp);
- *q = tmp;
- return r;
-}
-LIBC_INLINE double tgamma(double x) { return __ocml_tgamma_f64(x); }
-LIBC_INLINE float tgammaf(float x) { return __ocml_tgamma_f32(x); }
-
-} // namespace internal
-} // namespace LIBC_NAMESPACE
-
-#endif // LLVM_LIBC_SRC_MATH_GPU_AMDGPU_H
diff --git a/libc/src/math/gpu/vendor/common.h b/libc/src/math/gpu/vendor/common.h
deleted file mode 100644
index 041a9a01c30e..000000000000
--- a/libc/src/math/gpu/vendor/common.h
+++ /dev/null
@@ -1,22 +0,0 @@
-//===-- Common interface for compiling the GPU math -----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC_MATH_GPU_COMMON_H
-#define LLVM_LIBC_SRC_MATH_GPU_COMMON_H
-
-#include "src/__support/macros/properties/architectures.h"
-
-#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
-#include "amdgpu/amdgpu.h"
-#elif defined(LIBC_TARGET_ARCH_IS_NVPTX)
-#include "nvptx/nvptx.h"
-#else
-#error "Unsupported platform"
-#endif
-
-#endif // LLVM_LIBC_SRC_MATH_GPU_COMMON_H
diff --git a/libc/src/math/ilogbf128.h b/libc/src/math/ilogbf128.h
new file mode 100644
index 000000000000..df1145ffc0f8
--- /dev/null
+++ b/libc/src/math/ilogbf128.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for ilogbf128 ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_ILOGBF128_H
+#define LLVM_LIBC_SRC_MATH_ILOGBF128_H
+
+#include "src/__support/macros/properties/float.h"
+
+namespace LIBC_NAMESPACE {
+
+int ilogbf128(float128 x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_ILOGBF128_H
diff --git a/libc/src/math/llogb.h b/libc/src/math/llogb.h
new file mode 100644
index 000000000000..2d95877425e5
--- /dev/null
+++ b/libc/src/math/llogb.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for llogb -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_LLOGB_H
+#define LLVM_LIBC_SRC_MATH_LLOGB_H
+
+#include "src/__support/macros/properties/float.h"
+
+namespace LIBC_NAMESPACE {
+
+long llogb(double x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_LLOGB_H
diff --git a/libc/src/math/llogbf.h b/libc/src/math/llogbf.h
new file mode 100644
index 000000000000..512e174b66ee
--- /dev/null
+++ b/libc/src/math/llogbf.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for llogbf ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_LLOGBF_H
+#define LLVM_LIBC_SRC_MATH_LLOGBF_H
+
+#include "src/__support/macros/properties/float.h"
+
+namespace LIBC_NAMESPACE {
+
+long llogbf(float x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_LLOGBF_H
diff --git a/libc/src/math/llogbf128.h b/libc/src/math/llogbf128.h
new file mode 100644
index 000000000000..7fb74d4bbe73
--- /dev/null
+++ b/libc/src/math/llogbf128.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for llogbf128 ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_LLOGBF128_H
+#define LLVM_LIBC_SRC_MATH_LLOGBF128_H
+
+#include "src/__support/macros/properties/float.h"
+
+namespace LIBC_NAMESPACE {
+
+long llogbf128(float128 x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_LLOGBF128_H
diff --git a/libc/src/math/llogbl.h b/libc/src/math/llogbl.h
new file mode 100644
index 000000000000..4033100fbe3d
--- /dev/null
+++ b/libc/src/math/llogbl.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for llogbl ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_LLOGBL_H
+#define LLVM_LIBC_SRC_MATH_LLOGBL_H
+
+#include "src/__support/macros/properties/float.h"
+
+namespace LIBC_NAMESPACE {
+
+long llogbl(long double x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_LLOGBL_H
diff --git a/libc/src/math/logbf128.h b/libc/src/math/logbf128.h
new file mode 100644
index 000000000000..8baa076af1bf
--- /dev/null
+++ b/libc/src/math/logbf128.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for logbf128 ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_LOGBF128_H
+#define LLVM_LIBC_SRC_MATH_LOGBF128_H
+
+#include "src/__support/macros/properties/float.h"
+
+namespace LIBC_NAMESPACE {
+
+float128 logbf128(float128 x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_LOGBF128_H
diff --git a/libc/src/math/nvptx/CMakeLists.txt b/libc/src/math/nvptx/CMakeLists.txt
new file mode 100644
index 000000000000..194e1fa7af49
--- /dev/null
+++ b/libc/src/math/nvptx/CMakeLists.txt
@@ -0,0 +1,1179 @@
+# Math functions not yet available in the libc project, or those not yet tuned
+# for GPU workloads are provided as wrappers over vendor libraries. If we find
+# them ahead of time we will import them statically. Otherwise, we will keep
+# them as external references and expect them to be resolved by the user when
+# they compile. In the future,we will use implementations from the 'libc'
+# project and not provide these wrappers.
+if(CUDAToolkit_FOUND)
+ set(libdevice_path ${CUDAToolkit_BIN_DIR}/../nvvm/libdevice/libdevice.10.bc)
+ if (EXISTS ${libdevice_path})
+ message(STATUS "Found the CUDA device library. Implementations falling back "
+ "to the vendor libraries will be resolved statically.")
+ set(bitcode_link_flags
+ "SHELL:-Xclang -mlink-builtin-bitcode -Xclang ${libdevice_path}")
+ endif()
+else()
+ message(STATUS "Could not find the CUDA device library. Unimplemented "
+ "functions will be an external reference to the vendor libraries.")
+endif()
+
+add_entrypoint_object(
+ ceil
+ SRCS
+ ceil.cpp
+ HDRS
+ ../ceil.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ ceilf
+ SRCS
+ ceilf.cpp
+ HDRS
+ ../ceilf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ copysign
+ SRCS
+ copysign.cpp
+ HDRS
+ ../copysign.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ copysignf
+ SRCS
+ copysignf.cpp
+ HDRS
+ ../copysignf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ fabs
+ SRCS
+ fabs.cpp
+ HDRS
+ ../fabs.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ fabsf
+ SRCS
+ fabsf.cpp
+ HDRS
+ ../fabsf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ floor
+ SRCS
+ floor.cpp
+ HDRS
+ ../floor.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ floorf
+ SRCS
+ floorf.cpp
+ HDRS
+ ../floorf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ fma
+ SRCS
+ fma.cpp
+ HDRS
+ ../fma.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ fmaf
+ SRCS
+ fmaf.cpp
+ HDRS
+ ../fmaf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ fmax
+ SRCS
+ fmax.cpp
+ HDRS
+ ../fmax.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ fmaxf
+ SRCS
+ fmaxf.cpp
+ HDRS
+ ../fmaxf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ fmin
+ SRCS
+ fmin.cpp
+ HDRS
+ ../fmin.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ fminf
+ SRCS
+ fminf.cpp
+ HDRS
+ ../fminf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ fmod
+ SRCS
+ fmod.cpp
+ HDRS
+ ../fmod.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ fmodf
+ SRCS
+ fmodf.cpp
+ HDRS
+ ../fmodf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ lround
+ SRCS
+ lround.cpp
+ HDRS
+ ../lround.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ lroundf
+ SRCS
+ lroundf.cpp
+ HDRS
+ ../lroundf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ llround
+ SRCS
+ llround.cpp
+ HDRS
+ ../llround.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ llroundf
+ SRCS
+ llroundf.cpp
+ HDRS
+ ../llroundf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ modf
+ SRCS
+ modf.cpp
+ HDRS
+ ../modf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ modff
+ SRCS
+ modff.cpp
+ HDRS
+ ../modff.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ nearbyint
+ SRCS
+ nearbyint.cpp
+ HDRS
+ ../nearbyint.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ nearbyintf
+ SRCS
+ nearbyintf.cpp
+ HDRS
+ ../nearbyintf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ remainder
+ SRCS
+ remainder.cpp
+ HDRS
+ ../remainder.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ remainderf
+ SRCS
+ remainderf.cpp
+ HDRS
+ ../remainderf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ rint
+ SRCS
+ rint.cpp
+ HDRS
+ ../rint.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ rintf
+ SRCS
+ rintf.cpp
+ HDRS
+ ../rintf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ round
+ SRCS
+ round.cpp
+ HDRS
+ ../round.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ sqrt
+ SRCS
+ sqrt.cpp
+ HDRS
+ ../sqrt.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ sqrtf
+ SRCS
+ sqrtf.cpp
+ HDRS
+ ../sqrtf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ trunc
+ SRCS
+ trunc.cpp
+ HDRS
+ ../trunc.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+add_entrypoint_object(
+ truncf
+ SRCS
+ truncf.cpp
+ HDRS
+ ../truncf.h
+ COMPILE_OPTIONS
+ -O2
+)
+
+# The following functions currently are not implemented natively and borrow from
+# existing implementations. This will be removed in the future.
+add_entrypoint_object(
+ acos
+ SRCS
+ acos.cpp
+ HDRS
+ ../acos.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ acosf
+ SRCS
+ acosf.cpp
+ HDRS
+ ../acosf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ acosh
+ SRCS
+ acosh.cpp
+ HDRS
+ ../acosh.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ acoshf
+ SRCS
+ acoshf.cpp
+ HDRS
+ ../acoshf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ asin
+ SRCS
+ asin.cpp
+ HDRS
+ ../asin.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ asinf
+ SRCS
+ asinf.cpp
+ HDRS
+ ../asinf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ asinh
+ SRCS
+ asinh.cpp
+ HDRS
+ ../asinh.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ atan
+ SRCS
+ atan.cpp
+ HDRS
+ ../atan.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ atanf
+ SRCS
+ atanf.cpp
+ HDRS
+ ../atanf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ atan2
+ SRCS
+ atan2.cpp
+ HDRS
+ ../atan2.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ atan2f
+ SRCS
+ atan2f.cpp
+ HDRS
+ ../atan2f.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ atanh
+ SRCS
+ atanh.cpp
+ HDRS
+ ../atanh.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ atanhf
+ SRCS
+ atanhf.cpp
+ HDRS
+ ../atanhf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ cos
+ SRCS
+ cos.cpp
+ HDRS
+ ../cos.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ cosf
+ SRCS
+ cosf.cpp
+ HDRS
+ ../cosf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ cosh
+ SRCS
+ cosh.cpp
+ HDRS
+ ../cosh.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ coshf
+ SRCS
+ coshf.cpp
+ HDRS
+ ../coshf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ erf
+ SRCS
+ erf.cpp
+ HDRS
+ ../erf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ erff
+ SRCS
+ erff.cpp
+ HDRS
+ ../erff.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ exp
+ SRCS
+ exp.cpp
+ HDRS
+ ../exp.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ exp10
+ SRCS
+ exp10.cpp
+ HDRS
+ ../exp10.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ exp10f
+ SRCS
+ exp10f.cpp
+ HDRS
+ ../exp10f.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ exp2
+ SRCS
+ exp2.cpp
+ HDRS
+ ../exp2.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ exp2f
+ SRCS
+ exp2f.cpp
+ HDRS
+ ../exp2f.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ expf
+ SRCS
+ expf.cpp
+ HDRS
+ ../expf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ expm1
+ SRCS
+ expm1.cpp
+ HDRS
+ ../expm1.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ expm1f
+ SRCS
+ expm1f.cpp
+ HDRS
+ ../expm1f.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ fdim
+ SRCS
+ fdim.cpp
+ HDRS
+ ../fdim.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ fdimf
+ SRCS
+ fdimf.cpp
+ HDRS
+ ../fdimf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ hypot
+ SRCS
+ hypot.cpp
+ HDRS
+ ../hypot.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ hypotf
+ SRCS
+ hypotf.cpp
+ HDRS
+ ../hypotf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ ilogb
+ SRCS
+ ilogb.cpp
+ HDRS
+ ../ilogb.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ ilogbf
+ SRCS
+ ilogbf.cpp
+ HDRS
+ ../ilogbf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ log10
+ SRCS
+ log10.cpp
+ HDRS
+ ../log10.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ log10f
+ SRCS
+ log10f.cpp
+ HDRS
+ ../log10f.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ log2
+ SRCS
+ log2.cpp
+ HDRS
+ ../log2.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ log2f
+ SRCS
+ log2f.cpp
+ HDRS
+ ../log2f.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ log
+ SRCS
+ log.cpp
+ HDRS
+ ../log.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ logf
+ SRCS
+ logf.cpp
+ HDRS
+ ../logf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ lrint
+ SRCS
+ lrint.cpp
+ HDRS
+ ../lrint.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ lrintf
+ SRCS
+ lrintf.cpp
+ HDRS
+ ../lrintf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ ldexp
+ SRCS
+ ldexp.cpp
+ HDRS
+ ../ldexp.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ ldexpf
+ SRCS
+ ldexpf.cpp
+ HDRS
+ ../ldexpf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ log1p
+ SRCS
+ log1p.cpp
+ HDRS
+ ../log1p.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ log1pf
+ SRCS
+ log1pf.cpp
+ HDRS
+ ../log1pf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ llrint
+ SRCS
+ llrint.cpp
+ HDRS
+ ../llrint.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ llrintf
+ SRCS
+ llrintf.cpp
+ HDRS
+ ../llrintf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ remquo
+ SRCS
+ remquo.cpp
+ HDRS
+ ../remquo.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ remquof
+ SRCS
+ remquof.cpp
+ HDRS
+ ../remquof.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ scalbn
+ SRCS
+ scalbn.cpp
+ HDRS
+ ../scalbn.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ scalbnf
+ SRCS
+ scalbnf.cpp
+ HDRS
+ ../scalbnf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+
+add_entrypoint_object(
+ nextafter
+ SRCS
+ nextafter.cpp
+ HDRS
+ ../nextafter.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ nextafterf
+ SRCS
+ nextafterf.cpp
+ HDRS
+ ../nextafterf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ pow
+ SRCS
+ pow.cpp
+ HDRS
+ ../pow.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ powf
+ SRCS
+ powf.cpp
+ HDRS
+ ../powf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ sin
+ SRCS
+ sin.cpp
+ HDRS
+ ../sin.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ sinf
+ SRCS
+ sinf.cpp
+ HDRS
+ ../sinf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ sincos
+ SRCS
+ sincos.cpp
+ HDRS
+ ../sincos.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ sincosf
+ SRCS
+ sincosf.cpp
+ HDRS
+ ../sincosf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ sinh
+ SRCS
+ sinh.cpp
+ HDRS
+ ../sinh.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ sinhf
+ SRCS
+ sinhf.cpp
+ HDRS
+ ../sinhf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ tan
+ SRCS
+ tan.cpp
+ HDRS
+ ../tan.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ tanf
+ SRCS
+ tanf.cpp
+ HDRS
+ ../tanf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ tanh
+ SRCS
+ tanh.cpp
+ HDRS
+ ../tanh.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ tanhf
+ SRCS
+ tanhf.cpp
+ HDRS
+ ../tanhf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ tgamma
+ SRCS
+ tgamma.cpp
+ HDRS
+ ../tgamma.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ tgammaf
+ SRCS
+ tgammaf.cpp
+ HDRS
+ ../tgammaf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ frexp
+ SRCS
+ frexp.cpp
+ HDRS
+ ../frexp.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
+
+add_entrypoint_object(
+ frexpf
+ SRCS
+ frexpf.cpp
+ HDRS
+ ../frexpf.h
+ COMPILE_OPTIONS
+ ${bitcode_link_flags}
+ -O2
+ VENDOR
+)
diff --git a/libc/src/math/gpu/vendor/acos.cpp b/libc/src/math/nvptx/acos.cpp
index 83b674fa6ae3..da2c7952feba 100644
--- a/libc/src/math/gpu/vendor/acos.cpp
+++ b/libc/src/math/nvptx/acos.cpp
@@ -9,10 +9,10 @@
#include "src/math/acos.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, acos, (double x)) { return internal::acos(x); }
+LLVM_LIBC_FUNCTION(double, acos, (double x)) { return __nv_acos(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/acosf.cpp b/libc/src/math/nvptx/acosf.cpp
index ac629761a439..8a4125f03b8c 100644
--- a/libc/src/math/gpu/vendor/acosf.cpp
+++ b/libc/src/math/nvptx/acosf.cpp
@@ -9,10 +9,10 @@
#include "src/math/acosf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, acosf, (float x)) { return internal::acosf(x); }
+LLVM_LIBC_FUNCTION(float, acosf, (float x)) { return __nv_acosf(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/acosh.cpp b/libc/src/math/nvptx/acosh.cpp
index cc1b8b572b3d..06f6e2922f56 100644
--- a/libc/src/math/gpu/vendor/acosh.cpp
+++ b/libc/src/math/nvptx/acosh.cpp
@@ -9,10 +9,10 @@
#include "src/math/acosh.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, acosh, (double x)) { return internal::acosh(x); }
+LLVM_LIBC_FUNCTION(double, acosh, (double x)) { return __nv_acosh(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/acoshf.cpp b/libc/src/math/nvptx/acoshf.cpp
index a0384f89eed3..00e8053a5078 100644
--- a/libc/src/math/gpu/vendor/acoshf.cpp
+++ b/libc/src/math/nvptx/acoshf.cpp
@@ -9,10 +9,10 @@
#include "src/math/acoshf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, acoshf, (float x)) { return internal::acoshf(x); }
+LLVM_LIBC_FUNCTION(float, acoshf, (float x)) { return __nv_acoshf(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/asin.cpp b/libc/src/math/nvptx/asin.cpp
index 24a8a136e88e..74d92fded72b 100644
--- a/libc/src/math/gpu/vendor/asin.cpp
+++ b/libc/src/math/nvptx/asin.cpp
@@ -9,10 +9,10 @@
#include "src/math/asin.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, asin, (double x)) { return internal::asin(x); }
+LLVM_LIBC_FUNCTION(double, asin, (double x)) { return __nv_asin(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/asinf.cpp b/libc/src/math/nvptx/asinf.cpp
index 595a48f82744..30544bc1313d 100644
--- a/libc/src/math/gpu/vendor/asinf.cpp
+++ b/libc/src/math/nvptx/asinf.cpp
@@ -9,10 +9,10 @@
#include "src/math/asinf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, asinf, (float x)) { return internal::asinf(x); }
+LLVM_LIBC_FUNCTION(float, asinf, (float x)) { return __nv_asinf(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/asinh.cpp b/libc/src/math/nvptx/asinh.cpp
index f417d9fd8c1f..0e5dbb47e667 100644
--- a/libc/src/math/gpu/vendor/asinh.cpp
+++ b/libc/src/math/nvptx/asinh.cpp
@@ -9,10 +9,10 @@
#include "src/math/asinh.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, asinh, (double x)) { return internal::asinh(x); }
+LLVM_LIBC_FUNCTION(double, asinh, (double x)) { return __nv_asinh(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/asinhf.cpp b/libc/src/math/nvptx/asinhf.cpp
index 78e5543cf366..6648108646cd 100644
--- a/libc/src/math/gpu/vendor/asinhf.cpp
+++ b/libc/src/math/nvptx/asinhf.cpp
@@ -9,10 +9,10 @@
#include "src/math/asinhf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, asinhf, (float x)) { return internal::asinhf(x); }
+LLVM_LIBC_FUNCTION(float, asinhf, (float x)) { return __nv_asinhf(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/atan.cpp b/libc/src/math/nvptx/atan.cpp
index 45d7f02c02cd..3af793a53ae5 100644
--- a/libc/src/math/gpu/vendor/atan.cpp
+++ b/libc/src/math/nvptx/atan.cpp
@@ -9,10 +9,10 @@
#include "src/math/atan.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, atan, (double x)) { return internal::atan(x); }
+LLVM_LIBC_FUNCTION(double, atan, (double x)) { return __nv_atan(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/atan2.cpp b/libc/src/math/nvptx/atan2.cpp
index 94e215e52ca6..0c54e0e04899 100644
--- a/libc/src/math/gpu/vendor/atan2.cpp
+++ b/libc/src/math/nvptx/atan2.cpp
@@ -9,12 +9,12 @@
#include "src/math/atan2.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
LLVM_LIBC_FUNCTION(double, atan2, (double x, double y)) {
- return internal::atan2(x, y);
+ return __nv_atan2(x, y);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/atan2f.cpp b/libc/src/math/nvptx/atan2f.cpp
index 70caa568e32d..c3327d92c97e 100644
--- a/libc/src/math/gpu/vendor/atan2f.cpp
+++ b/libc/src/math/nvptx/atan2f.cpp
@@ -9,12 +9,12 @@
#include "src/math/atan2f.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
LLVM_LIBC_FUNCTION(float, atan2f, (float x, float y)) {
- return internal::atan2f(x, y);
+ return __nv_atan2f(x, y);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/atanf.cpp b/libc/src/math/nvptx/atanf.cpp
index 132c43d9e3af..559526297732 100644
--- a/libc/src/math/gpu/vendor/atanf.cpp
+++ b/libc/src/math/nvptx/atanf.cpp
@@ -9,10 +9,10 @@
#include "src/math/atanf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, atanf, (float x)) { return internal::atanf(x); }
+LLVM_LIBC_FUNCTION(float, atanf, (float x)) { return __nv_atanf(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/atanh.cpp b/libc/src/math/nvptx/atanh.cpp
index 07a75fcbbfc7..6699d959df18 100644
--- a/libc/src/math/gpu/vendor/atanh.cpp
+++ b/libc/src/math/nvptx/atanh.cpp
@@ -9,10 +9,10 @@
#include "src/math/atanh.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, atanh, (double x)) { return internal::atanh(x); }
+LLVM_LIBC_FUNCTION(double, atanh, (double x)) { return __nv_atanh(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/atanhf.cpp b/libc/src/math/nvptx/atanhf.cpp
index 521c4133243d..526b7b3e3712 100644
--- a/libc/src/math/gpu/vendor/atanhf.cpp
+++ b/libc/src/math/nvptx/atanhf.cpp
@@ -9,10 +9,10 @@
#include "src/math/atanhf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, atanhf, (float x)) { return internal::atanhf(x); }
+LLVM_LIBC_FUNCTION(float, atanhf, (float x)) { return __nv_atanhf(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/ceil.cpp b/libc/src/math/nvptx/ceil.cpp
new file mode 100644
index 000000000000..ad1407d61f62
--- /dev/null
+++ b/libc/src/math/nvptx/ceil.cpp
@@ -0,0 +1,16 @@
+//===-- Implementation of the ceil function for GPU -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/ceil.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, ceil, (double x)) { return __builtin_ceil(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/ceilf.cpp b/libc/src/math/nvptx/ceilf.cpp
new file mode 100644
index 000000000000..c4fc58d93603
--- /dev/null
+++ b/libc/src/math/nvptx/ceilf.cpp
@@ -0,0 +1,16 @@
+//===-- Implementation of the ceilf function for GPU ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/ceilf.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, ceilf, (float x)) { return __builtin_ceilf(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/copysign.cpp b/libc/src/math/nvptx/copysign.cpp
new file mode 100644
index 000000000000..6f804bdb90a1
--- /dev/null
+++ b/libc/src/math/nvptx/copysign.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the copysign function for GPU -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/copysign.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, copysign, (double x, double y)) {
+ return __builtin_copysign(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/copysignf.cpp b/libc/src/math/nvptx/copysignf.cpp
new file mode 100644
index 000000000000..4d7e132462ac
--- /dev/null
+++ b/libc/src/math/nvptx/copysignf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the copysignf function for GPU ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/copysignf.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, copysignf, (float x, float y)) {
+ return __builtin_copysignf(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/cos.cpp b/libc/src/math/nvptx/cos.cpp
index 37c7507911ec..185ad3cf9215 100644
--- a/libc/src/math/gpu/vendor/cos.cpp
+++ b/libc/src/math/nvptx/cos.cpp
@@ -9,10 +9,10 @@
#include "src/math/cos.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, cos, (double x)) { return internal::cos(x); }
+LLVM_LIBC_FUNCTION(double, cos, (double x)) { return __nv_cos(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/cosf.cpp b/libc/src/math/nvptx/cosf.cpp
index 1bd42ba37e92..3d34de4be51b 100644
--- a/libc/src/math/gpu/vendor/cosf.cpp
+++ b/libc/src/math/nvptx/cosf.cpp
@@ -9,10 +9,10 @@
#include "src/math/cosf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, cosf, (float x)) { return internal::cosf(x); }
+LLVM_LIBC_FUNCTION(float, cosf, (float x)) { return __nv_cosf(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/cosh.cpp b/libc/src/math/nvptx/cosh.cpp
index 3be05e58b51d..179864c5f910 100644
--- a/libc/src/math/gpu/vendor/cosh.cpp
+++ b/libc/src/math/nvptx/cosh.cpp
@@ -9,10 +9,10 @@
#include "src/math/cosh.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, cosh, (double x)) { return internal::cosh(x); }
+LLVM_LIBC_FUNCTION(double, cosh, (double x)) { return __nv_cosh(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/coshf.cpp b/libc/src/math/nvptx/coshf.cpp
index 1b945bbd7c47..9147499db97c 100644
--- a/libc/src/math/gpu/vendor/coshf.cpp
+++ b/libc/src/math/nvptx/coshf.cpp
@@ -9,10 +9,10 @@
#include "src/math/coshf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, coshf, (float x)) { return internal::coshf(x); }
+LLVM_LIBC_FUNCTION(float, coshf, (float x)) { return __nv_coshf(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/nvptx/declarations.h b/libc/src/math/nvptx/declarations.h
index 9cb2be67b85b..9cb2be67b85b 100644
--- a/libc/src/math/gpu/vendor/nvptx/declarations.h
+++ b/libc/src/math/nvptx/declarations.h
diff --git a/libc/src/math/gpu/vendor/erf.cpp b/libc/src/math/nvptx/erf.cpp
index 190321ca2599..5ea0177d5cd3 100644
--- a/libc/src/math/gpu/vendor/erf.cpp
+++ b/libc/src/math/nvptx/erf.cpp
@@ -9,10 +9,10 @@
#include "src/math/erf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, erf, (double x)) { return internal::erf(x); }
+LLVM_LIBC_FUNCTION(double, erf, (double x)) { return __nv_erf(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/erff.cpp b/libc/src/math/nvptx/erff.cpp
index a5a08be54b07..03fdceace8e9 100644
--- a/libc/src/math/gpu/vendor/erff.cpp
+++ b/libc/src/math/nvptx/erff.cpp
@@ -9,10 +9,10 @@
#include "src/math/erff.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, erff, (float x)) { return internal::erff(x); }
+LLVM_LIBC_FUNCTION(float, erff, (float x)) { return __nv_erff(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/exp.cpp b/libc/src/math/nvptx/exp.cpp
new file mode 100644
index 000000000000..6bbe87ba2e78
--- /dev/null
+++ b/libc/src/math/nvptx/exp.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU exp function ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/exp.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, exp, (double x)) { return __nv_exp(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/exp10.cpp b/libc/src/math/nvptx/exp10.cpp
index 8557a33f0188..11bb734fd113 100644
--- a/libc/src/math/gpu/vendor/exp10.cpp
+++ b/libc/src/math/nvptx/exp10.cpp
@@ -9,10 +9,10 @@
#include "src/math/exp10.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, exp10, (double x)) { return internal::exp10(x); }
+LLVM_LIBC_FUNCTION(double, exp10, (double x)) { return __nv_exp10(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/exp10f.cpp b/libc/src/math/nvptx/exp10f.cpp
index 844809355087..4e3121a0b46e 100644
--- a/libc/src/math/gpu/vendor/exp10f.cpp
+++ b/libc/src/math/nvptx/exp10f.cpp
@@ -9,10 +9,10 @@
#include "src/math/exp10f.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, exp10f, (float x)) { return internal::exp10f(x); }
+LLVM_LIBC_FUNCTION(float, exp10f, (float x)) { return __nv_exp10f(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/exp2.cpp b/libc/src/math/nvptx/exp2.cpp
index ffa23d810a9b..35fc27b3a26a 100644
--- a/libc/src/math/gpu/vendor/exp2.cpp
+++ b/libc/src/math/nvptx/exp2.cpp
@@ -9,10 +9,10 @@
#include "src/math/exp2.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, exp2, (double x)) { return internal::exp2(x); }
+LLVM_LIBC_FUNCTION(double, exp2, (double x)) { return __nv_exp2(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/exp2f.cpp b/libc/src/math/nvptx/exp2f.cpp
index cb61557383df..8d137346fe00 100644
--- a/libc/src/math/gpu/vendor/exp2f.cpp
+++ b/libc/src/math/nvptx/exp2f.cpp
@@ -9,10 +9,10 @@
#include "src/math/exp2f.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, exp2f, (float x)) { return internal::exp2f(x); }
+LLVM_LIBC_FUNCTION(float, exp2f, (float x)) { return __nv_exp2f(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/expf.cpp b/libc/src/math/nvptx/expf.cpp
new file mode 100644
index 000000000000..a6362bd73461
--- /dev/null
+++ b/libc/src/math/nvptx/expf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the expf function for GPU -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/expf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, expf, (float x)) { return __nv_expf(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/expm1.cpp b/libc/src/math/nvptx/expm1.cpp
index 6ac5f753b9e4..0331903b8fd8 100644
--- a/libc/src/math/gpu/vendor/expm1.cpp
+++ b/libc/src/math/nvptx/expm1.cpp
@@ -9,10 +9,10 @@
#include "src/math/expm1.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, expm1, (double x)) { return internal::expm1(x); }
+LLVM_LIBC_FUNCTION(double, expm1, (double x)) { return __nv_expm1(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/expm1f.cpp b/libc/src/math/nvptx/expm1f.cpp
index c5497797dbe1..7b74c548f3df 100644
--- a/libc/src/math/gpu/vendor/expm1f.cpp
+++ b/libc/src/math/nvptx/expm1f.cpp
@@ -9,10 +9,10 @@
#include "src/math/expm1f.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, expm1f, (float x)) { return internal::expm1f(x); }
+LLVM_LIBC_FUNCTION(float, expm1f, (float x)) { return __nv_expm1f(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/fabs.cpp b/libc/src/math/nvptx/fabs.cpp
new file mode 100644
index 000000000000..c0d063d50ae5
--- /dev/null
+++ b/libc/src/math/nvptx/fabs.cpp
@@ -0,0 +1,16 @@
+//===-- Implementation of the fabs function for GPU -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fabs.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, fabs, (double x)) { return __builtin_fabs(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/fabsf.cpp b/libc/src/math/nvptx/fabsf.cpp
new file mode 100644
index 000000000000..398ffd0c74c0
--- /dev/null
+++ b/libc/src/math/nvptx/fabsf.cpp
@@ -0,0 +1,16 @@
+//===-- Implementation of the fabsf function for GPU ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fabsf.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, fabsf, (float x)) { return __builtin_fabsf(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/fdim.cpp b/libc/src/math/nvptx/fdim.cpp
index f30dafb46e54..2f1ff5180026 100644
--- a/libc/src/math/gpu/vendor/fdim.cpp
+++ b/libc/src/math/nvptx/fdim.cpp
@@ -9,12 +9,12 @@
#include "src/math/fdim.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
LLVM_LIBC_FUNCTION(double, fdim, (double x, double y)) {
- return internal::fdim(x, y);
+ return __nv_fdim(x, y);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/fdimf.cpp b/libc/src/math/nvptx/fdimf.cpp
index e30736206a9a..c24e6be72482 100644
--- a/libc/src/math/gpu/vendor/fdimf.cpp
+++ b/libc/src/math/nvptx/fdimf.cpp
@@ -9,12 +9,12 @@
#include "src/math/fdimf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
LLVM_LIBC_FUNCTION(float, fdimf, (float x, float y)) {
- return internal::fdimf(x, y);
+ return __nv_fdimf(x, y);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/floor.cpp b/libc/src/math/nvptx/floor.cpp
new file mode 100644
index 000000000000..eada89c178d7
--- /dev/null
+++ b/libc/src/math/nvptx/floor.cpp
@@ -0,0 +1,16 @@
+//===-- Implementation of the floor function for GPU ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/floor.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, floor, (double x)) { return __builtin_floor(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/floorf.cpp b/libc/src/math/nvptx/floorf.cpp
new file mode 100644
index 000000000000..a5611c515a88
--- /dev/null
+++ b/libc/src/math/nvptx/floorf.cpp
@@ -0,0 +1,16 @@
+//===-- Implementation of the floorf function for GPU ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/floorf.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, floorf, (float x)) { return __builtin_floorf(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/fma.cpp b/libc/src/math/nvptx/fma.cpp
new file mode 100644
index 000000000000..41a6ddf60dbc
--- /dev/null
+++ b/libc/src/math/nvptx/fma.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the fma function for GPU ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fma.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, fma, (double x, double y, double z)) {
+ return __builtin_fma(x, y, z);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/fmaf.cpp b/libc/src/math/nvptx/fmaf.cpp
new file mode 100644
index 000000000000..c948e32f77eb
--- /dev/null
+++ b/libc/src/math/nvptx/fmaf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the fmaf function for GPU -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fmaf.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, fmaf, (float x, float y, float z)) {
+ return __builtin_fmaf(x, y, z);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/fmax.cpp b/libc/src/math/nvptx/fmax.cpp
new file mode 100644
index 000000000000..09624cc6f092
--- /dev/null
+++ b/libc/src/math/nvptx/fmax.cpp
@@ -0,0 +1,25 @@
+//===-- Implementation of the fmax function for GPU -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fmax.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, fmax, (double x, double y)) {
+ // FIXME: The builtin function does not correctly handle the +/-0.0 case.
+ if (LIBC_UNLIKELY(x == y))
+ return cpp::bit_cast<double>(cpp::bit_cast<uint64_t>(x) &
+ cpp::bit_cast<uint64_t>(y));
+ return __builtin_fmax(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/fmaxf.cpp b/libc/src/math/nvptx/fmaxf.cpp
new file mode 100644
index 000000000000..f6ed46699a04
--- /dev/null
+++ b/libc/src/math/nvptx/fmaxf.cpp
@@ -0,0 +1,25 @@
+//===-- Implementation of the fmaxf function for GPU ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fmaxf.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, fmaxf, (float x, float y)) {
+ // FIXME: The builtin function does not correctly handle the +/-0.0 case.
+ if (LIBC_UNLIKELY(x == y))
+ return cpp::bit_cast<float>(cpp::bit_cast<uint32_t>(x) &
+ cpp::bit_cast<uint32_t>(y));
+ return __builtin_fmaxf(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/fmin.cpp b/libc/src/math/nvptx/fmin.cpp
new file mode 100644
index 000000000000..8977ff7a066c
--- /dev/null
+++ b/libc/src/math/nvptx/fmin.cpp
@@ -0,0 +1,25 @@
+//===-- Implementation of the fmin function for GPU -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fmin.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, fmin, (double x, double y)) {
+ // FIXME: The builtin function does not correctly handle the +/-0.0 case.
+ if (LIBC_UNLIKELY(x == y))
+ return cpp::bit_cast<double>(cpp::bit_cast<uint64_t>(x) |
+ cpp::bit_cast<uint64_t>(y));
+ return __builtin_fmin(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/fminf.cpp b/libc/src/math/nvptx/fminf.cpp
new file mode 100644
index 000000000000..3be55257f616
--- /dev/null
+++ b/libc/src/math/nvptx/fminf.cpp
@@ -0,0 +1,25 @@
+//===-- Implementation of the fminf function for GPU ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fminf.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, fminf, (float x, float y)) {
+ // FIXME: The builtin function does not correctly handle the +/-0.0 case.
+ if (LIBC_UNLIKELY(x == y))
+ return cpp::bit_cast<float>(cpp::bit_cast<uint32_t>(x) |
+ cpp::bit_cast<uint32_t>(y));
+ return __builtin_fminf(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/fmaxf.cpp b/libc/src/math/nvptx/fmod.cpp
index 67178b3e2735..0654cdd2abe0 100644
--- a/libc/src/math/gpu/fmaxf.cpp
+++ b/libc/src/math/nvptx/fmod.cpp
@@ -1,4 +1,4 @@
-//===-- Implementation of the fmaxf function for GPU ----------------------===//
+//===-- Implementation of the fmod function for GPU -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,13 +6,13 @@
//
//===----------------------------------------------------------------------===//
-#include "src/math/fmaxf.h"
+#include "src/math/fmod.h"
#include "src/__support/common.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, fmaxf, (float x, float y)) {
- return __builtin_fmaxf(x, y);
+LLVM_LIBC_FUNCTION(double, fmod, (double x, double y)) {
+ return __builtin_fmod(x, y);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/fmin.cpp b/libc/src/math/nvptx/fmodf.cpp
index 7303adcd347e..b689046468fb 100644
--- a/libc/src/math/gpu/fmin.cpp
+++ b/libc/src/math/nvptx/fmodf.cpp
@@ -1,4 +1,4 @@
-//===-- Implementation of the fmin function for GPU -----------------------===//
+//===-- Implementation of the fmodf function for GPU ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,13 +6,13 @@
//
//===----------------------------------------------------------------------===//
-#include "src/math/fmin.h"
+#include "src/math/fmodf.h"
#include "src/__support/common.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, fmin, (double x, double y)) {
- return __builtin_fmin(x, y);
+LLVM_LIBC_FUNCTION(float, fmodf, (float x, float y)) {
+ return __builtin_fmodf(x, y);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/frexp.cpp b/libc/src/math/nvptx/frexp.cpp
index 5fc2c1409c6e..2423961f7c61 100644
--- a/libc/src/math/gpu/vendor/frexp.cpp
+++ b/libc/src/math/nvptx/frexp.cpp
@@ -9,12 +9,12 @@
#include "src/math/frexp.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
LLVM_LIBC_FUNCTION(double, frexp, (double x, int *p)) {
- return internal::frexp(x, p);
+ return __nv_frexp(x, p);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/frexpf.cpp b/libc/src/math/nvptx/frexpf.cpp
index e928d375e03d..f1ea29068777 100644
--- a/libc/src/math/gpu/vendor/frexpf.cpp
+++ b/libc/src/math/nvptx/frexpf.cpp
@@ -9,12 +9,12 @@
#include "src/math/frexpf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
LLVM_LIBC_FUNCTION(float, frexpf, (float x, int *p)) {
- return internal::frexpf(x, p);
+ return __nv_frexpf(x, p);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/hypot.cpp b/libc/src/math/nvptx/hypot.cpp
index 45b629e3e285..28bf04aa4a2b 100644
--- a/libc/src/math/gpu/vendor/hypot.cpp
+++ b/libc/src/math/nvptx/hypot.cpp
@@ -9,12 +9,12 @@
#include "src/math/hypot.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
LLVM_LIBC_FUNCTION(double, hypot, (double x, double y)) {
- return internal::hypot(x, y);
+ return __nv_hypot(x, y);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/hypotf.cpp b/libc/src/math/nvptx/hypotf.cpp
index 533e9dcb8dbf..c506aab1acc0 100644
--- a/libc/src/math/gpu/vendor/hypotf.cpp
+++ b/libc/src/math/nvptx/hypotf.cpp
@@ -9,12 +9,12 @@
#include "src/math/hypotf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
LLVM_LIBC_FUNCTION(float, hypotf, (float x, float y)) {
- return internal::hypotf(x, y);
+ return __nv_hypotf(x, y);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/ilogb.cpp b/libc/src/math/nvptx/ilogb.cpp
index 1d075027b41c..fc75e2fd847a 100644
--- a/libc/src/math/gpu/vendor/ilogb.cpp
+++ b/libc/src/math/nvptx/ilogb.cpp
@@ -9,10 +9,10 @@
#include "src/math/ilogb.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(int, ilogb, (double x)) { return internal::ilogb(x); }
+LLVM_LIBC_FUNCTION(int, ilogb, (double x)) { return __nv_ilogb(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/ilogbf.cpp b/libc/src/math/nvptx/ilogbf.cpp
index 8dc2ff0a374a..3d14fcfa878f 100644
--- a/libc/src/math/gpu/vendor/ilogbf.cpp
+++ b/libc/src/math/nvptx/ilogbf.cpp
@@ -9,10 +9,10 @@
#include "src/math/ilogbf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(int, ilogbf, (float x)) { return internal::ilogbf(x); }
+LLVM_LIBC_FUNCTION(int, ilogbf, (float x)) { return __nv_ilogbf(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/ldexp.cpp b/libc/src/math/nvptx/ldexp.cpp
index f760a4256330..761dc4816b7a 100644
--- a/libc/src/math/gpu/vendor/ldexp.cpp
+++ b/libc/src/math/nvptx/ldexp.cpp
@@ -9,12 +9,12 @@
#include "src/math/ldexp.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
LLVM_LIBC_FUNCTION(double, ldexp, (double x, int y)) {
- return internal::ldexp(x, y);
+ return __nv_ldexp(x, y);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/ldexpf.cpp b/libc/src/math/nvptx/ldexpf.cpp
index d00d39115ffc..2d4c556a2714 100644
--- a/libc/src/math/gpu/vendor/ldexpf.cpp
+++ b/libc/src/math/nvptx/ldexpf.cpp
@@ -9,12 +9,12 @@
#include "src/math/ldexpf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
LLVM_LIBC_FUNCTION(float, ldexpf, (float x, int y)) {
- return internal::ldexpf(x, y);
+ return __nv_ldexpf(x, y);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/llrint.cpp b/libc/src/math/nvptx/llrint.cpp
new file mode 100644
index 000000000000..8f95e75e779b
--- /dev/null
+++ b/libc/src/math/nvptx/llrint.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the llrint function for GPU ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/llrint.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(long long, llrint, (double x)) { return __nv_llrint(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/llrintf.cpp b/libc/src/math/nvptx/llrintf.cpp
new file mode 100644
index 000000000000..1432ffbd1bda
--- /dev/null
+++ b/libc/src/math/nvptx/llrintf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the llrintf function for GPU --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/llrintf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(long long, llrintf, (float x)) { return __nv_llrintf(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/fminf.cpp b/libc/src/math/nvptx/llround.cpp
index bbf0c677b5e3..afd98308730a 100644
--- a/libc/src/math/gpu/fminf.cpp
+++ b/libc/src/math/nvptx/llround.cpp
@@ -1,4 +1,4 @@
-//===-- Implementation of the fminf function for GPU ----------------------===//
+//===-- Implementation of the GPU llround function ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,13 +6,13 @@
//
//===----------------------------------------------------------------------===//
-#include "src/math/fminf.h"
+#include "src/math/llround.h"
#include "src/__support/common.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, fminf, (float x, float y)) {
- return __builtin_fminf(x, y);
+LLVM_LIBC_FUNCTION(long long, llround, (double x)) {
+ return __builtin_llround(x);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/llroundf.cpp b/libc/src/math/nvptx/llroundf.cpp
new file mode 100644
index 000000000000..897ed15b6928
--- /dev/null
+++ b/libc/src/math/nvptx/llroundf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU llroundf function -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/llroundf.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(long long, llroundf, (float x)) {
+  return __builtin_llroundf(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/log.cpp b/libc/src/math/nvptx/log.cpp
index a97689abcc21..26b6dfa607b9 100644
--- a/libc/src/math/gpu/vendor/log.cpp
+++ b/libc/src/math/nvptx/log.cpp
@@ -9,10 +9,10 @@
#include "src/math/log.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, log, (double x)) { return internal::log(x); }
+LLVM_LIBC_FUNCTION(double, log, (double x)) { return __nv_log(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/log10.cpp b/libc/src/math/nvptx/log10.cpp
index c7a917a75cc9..ff2702539567 100644
--- a/libc/src/math/gpu/vendor/log10.cpp
+++ b/libc/src/math/nvptx/log10.cpp
@@ -9,10 +9,10 @@
#include "src/math/log10.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, log10, (double x)) { return internal::log10(x); }
+LLVM_LIBC_FUNCTION(double, log10, (double x)) { return __nv_log10(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/log10f.cpp b/libc/src/math/nvptx/log10f.cpp
index 489f5f558be1..af903b60a7ce 100644
--- a/libc/src/math/gpu/vendor/log10f.cpp
+++ b/libc/src/math/nvptx/log10f.cpp
@@ -9,10 +9,10 @@
#include "src/math/log10f.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, log10f, (float x)) { return internal::log10f(x); }
+LLVM_LIBC_FUNCTION(float, log10f, (float x)) { return __nv_log10f(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/log1p.cpp b/libc/src/math/nvptx/log1p.cpp
index 720d23e2f952..47bc96b0d881 100644
--- a/libc/src/math/gpu/vendor/log1p.cpp
+++ b/libc/src/math/nvptx/log1p.cpp
@@ -9,10 +9,10 @@
#include "src/math/log1p.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, log1p, (double x)) { return internal::log1p(x); }
+LLVM_LIBC_FUNCTION(double, log1p, (double x)) { return __nv_log1p(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/log1pf.cpp b/libc/src/math/nvptx/log1pf.cpp
index 96ad48b529cf..bfa4f7f22d54 100644
--- a/libc/src/math/gpu/vendor/log1pf.cpp
+++ b/libc/src/math/nvptx/log1pf.cpp
@@ -9,10 +9,10 @@
#include "src/math/log1pf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, log1pf, (float x)) { return internal::log1pf(x); }
+LLVM_LIBC_FUNCTION(float, log1pf, (float x)) { return __nv_log1pf(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/log2.cpp b/libc/src/math/nvptx/log2.cpp
index 9fc8a81e7e75..86a980de65d4 100644
--- a/libc/src/math/gpu/vendor/log2.cpp
+++ b/libc/src/math/nvptx/log2.cpp
@@ -9,10 +9,10 @@
#include "src/math/log2.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, log2, (double x)) { return internal::log2(x); }
+LLVM_LIBC_FUNCTION(double, log2, (double x)) { return __nv_log2(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/log2f.cpp b/libc/src/math/nvptx/log2f.cpp
index 62df41b69b0b..5ce46291610d 100644
--- a/libc/src/math/gpu/vendor/log2f.cpp
+++ b/libc/src/math/nvptx/log2f.cpp
@@ -9,10 +9,10 @@
#include "src/math/log2f.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, log2f, (float x)) { return internal::log2f(x); }
+LLVM_LIBC_FUNCTION(float, log2f, (float x)) { return __nv_log2f(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/logb.cpp b/libc/src/math/nvptx/logb.cpp
index 5dea57d41b08..b620b16184fc 100644
--- a/libc/src/math/gpu/vendor/logb.cpp
+++ b/libc/src/math/nvptx/logb.cpp
@@ -9,10 +9,10 @@
#include "src/math/logb.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, logb, (double x)) { return internal::logb(x); }
+LLVM_LIBC_FUNCTION(double, logb, (double x)) { return __nv_logb(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/logbf.cpp b/libc/src/math/nvptx/logbf.cpp
index 1a59df3e09a8..f19f0320db9d 100644
--- a/libc/src/math/gpu/vendor/logbf.cpp
+++ b/libc/src/math/nvptx/logbf.cpp
@@ -9,10 +9,10 @@
#include "src/math/logbf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, logbf, (float x)) { return internal::logbf(x); }
+LLVM_LIBC_FUNCTION(float, logbf, (float x)) { return __nv_logbf(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/logf.cpp b/libc/src/math/nvptx/logf.cpp
index 527b028e100d..6deb482c0ace 100644
--- a/libc/src/math/gpu/vendor/logf.cpp
+++ b/libc/src/math/nvptx/logf.cpp
@@ -9,10 +9,10 @@
#include "src/math/logf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, logf, (float x)) { return internal::logf(x); }
+LLVM_LIBC_FUNCTION(float, logf, (float x)) { return __nv_logf(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/lrint.cpp b/libc/src/math/nvptx/lrint.cpp
index a08996b755b5..8585f4ce53a4 100644
--- a/libc/src/math/gpu/vendor/lrint.cpp
+++ b/libc/src/math/nvptx/lrint.cpp
@@ -9,10 +9,10 @@
#include "src/math/lrint.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(long, lrint, (double x)) { return internal::lrint(x); }
+LLVM_LIBC_FUNCTION(long, lrint, (double x)) { return __nv_lrint(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/lrintf.cpp b/libc/src/math/nvptx/lrintf.cpp
index 695a9b8202cf..312a9469fade 100644
--- a/libc/src/math/gpu/vendor/lrintf.cpp
+++ b/libc/src/math/nvptx/lrintf.cpp
@@ -9,10 +9,10 @@
#include "src/math/lrintf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(long, lrintf, (float x)) { return internal::lrintf(x); }
+LLVM_LIBC_FUNCTION(long, lrintf, (float x)) { return __nv_lrintf(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/lround.cpp b/libc/src/math/nvptx/lround.cpp
new file mode 100644
index 000000000000..51e8f2245af8
--- /dev/null
+++ b/libc/src/math/nvptx/lround.cpp
@@ -0,0 +1,16 @@
+//===-- Implementation of the GPU lround function -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/lround.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(long, lround, (double x)) { return __builtin_lround(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/lroundf.cpp b/libc/src/math/nvptx/lroundf.cpp
new file mode 100644
index 000000000000..2a6fe7200d8c
--- /dev/null
+++ b/libc/src/math/nvptx/lroundf.cpp
@@ -0,0 +1,16 @@
+//===-- Implementation of the GPU lroundf function ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/lroundf.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(long, lroundf, (float x)) { return __builtin_lroundf(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/fmax.cpp b/libc/src/math/nvptx/modf.cpp
index a2c35371d12b..07dbbd6059c3 100644
--- a/libc/src/math/gpu/fmax.cpp
+++ b/libc/src/math/nvptx/modf.cpp
@@ -1,4 +1,4 @@
-//===-- Implementation of the fmax function for GPU -----------------------===//
+//===-- Implementation of the GPU modf function ---------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,13 +6,13 @@
//
//===----------------------------------------------------------------------===//
-#include "src/math/fmax.h"
+#include "src/math/modf.h"
#include "src/__support/common.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, fmax, (double x, double y)) {
- return __builtin_fmax(x, y);
+LLVM_LIBC_FUNCTION(double, modf, (double x, double *iptr)) {
+ return __builtin_modf(x, iptr);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/modff.cpp b/libc/src/math/nvptx/modff.cpp
new file mode 100644
index 000000000000..ad35f9006b51
--- /dev/null
+++ b/libc/src/math/nvptx/modff.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU modff function --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/modff.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, modff, (float x, float *iptr)) {
+ return __builtin_modff(x, iptr);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/nearbyint.cpp b/libc/src/math/nvptx/nearbyint.cpp
new file mode 100644
index 000000000000..9c7b600df708
--- /dev/null
+++ b/libc/src/math/nvptx/nearbyint.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU nearbyint function ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nearbyint.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, nearbyint, (double x)) {
+ return __builtin_nearbyint(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/nearbyintf.cpp b/libc/src/math/nvptx/nearbyintf.cpp
new file mode 100644
index 000000000000..7fbe9f4f0e0b
--- /dev/null
+++ b/libc/src/math/nvptx/nearbyintf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU nearbyintf function ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nearbyintf.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, nearbyintf, (float x)) {
+ return __builtin_nearbyintf(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/nextafter.cpp b/libc/src/math/nvptx/nextafter.cpp
index f88e17f10908..171aaad6f7cc 100644
--- a/libc/src/math/gpu/vendor/nextafter.cpp
+++ b/libc/src/math/nvptx/nextafter.cpp
@@ -9,12 +9,12 @@
#include "src/math/nextafter.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
LLVM_LIBC_FUNCTION(double, nextafter, (double x, double y)) {
- return internal::nextafter(x, y);
+ return __nv_nextafter(x, y);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/nextafterf.cpp b/libc/src/math/nvptx/nextafterf.cpp
index 7a39dc8fc814..a45937c0dc8b 100644
--- a/libc/src/math/gpu/vendor/nextafterf.cpp
+++ b/libc/src/math/nvptx/nextafterf.cpp
@@ -9,12 +9,12 @@
#include "src/math/nextafterf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
LLVM_LIBC_FUNCTION(float, nextafterf, (float x, float y)) {
- return internal::nextafterf(x, y);
+ return __nv_nextafterf(x, y);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/nvptx/nvptx.h b/libc/src/math/nvptx/nvptx.h
index 110d570a84a3..110d570a84a3 100644
--- a/libc/src/math/gpu/vendor/nvptx/nvptx.h
+++ b/libc/src/math/nvptx/nvptx.h
diff --git a/libc/src/math/nvptx/pow.cpp b/libc/src/math/nvptx/pow.cpp
new file mode 100644
index 000000000000..7de3c9e7e544
--- /dev/null
+++ b/libc/src/math/nvptx/pow.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the pow function for GPU ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/pow.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, pow, (double x, double y)) { return __nv_pow(x, y); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/powf.cpp b/libc/src/math/nvptx/powf.cpp
new file mode 100644
index 000000000000..f9f7dbae63ac
--- /dev/null
+++ b/libc/src/math/nvptx/powf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the powf function for GPU -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/powf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) { return __nv_powf(x, y); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/remainder.cpp b/libc/src/math/nvptx/remainder.cpp
new file mode 100644
index 000000000000..89b235f9c22a
--- /dev/null
+++ b/libc/src/math/nvptx/remainder.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU remainder function ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/remainder.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, remainder, (double x, double y)) {
+ return __builtin_remainder(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/remainderf.cpp b/libc/src/math/nvptx/remainderf.cpp
new file mode 100644
index 000000000000..9fee6f856dc8
--- /dev/null
+++ b/libc/src/math/nvptx/remainderf.cpp
@@ -0,0 +1,18 @@
+//===-- Implementation of the GPU remainderf function ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/remainderf.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, remainderf, (float x, float y)) {
+ return __builtin_remainderf(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/remquo.cpp b/libc/src/math/nvptx/remquo.cpp
index e92c9b3c2a6e..da69a20f8f4f 100644
--- a/libc/src/math/gpu/vendor/remquo.cpp
+++ b/libc/src/math/nvptx/remquo.cpp
@@ -9,12 +9,12 @@
#include "src/math/remquo.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
LLVM_LIBC_FUNCTION(double, remquo, (double x, double y, int *quo)) {
- return internal::remquo(x, y, quo);
+ return __nv_remquo(x, y, quo);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/remquof.cpp b/libc/src/math/nvptx/remquof.cpp
index b234885aa88c..dcfba5d7b5fa 100644
--- a/libc/src/math/gpu/vendor/remquof.cpp
+++ b/libc/src/math/nvptx/remquof.cpp
@@ -9,12 +9,12 @@
#include "src/math/remquof.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
LLVM_LIBC_FUNCTION(float, remquof, (float x, float y, int *quo)) {
- return internal::remquof(x, y, quo);
+ return __nv_remquof(x, y, quo);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/rint.cpp b/libc/src/math/nvptx/rint.cpp
new file mode 100644
index 000000000000..44d494a8ed57
--- /dev/null
+++ b/libc/src/math/nvptx/rint.cpp
@@ -0,0 +1,16 @@
+//===-- Implementation of the GPU rint function ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/rint.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, rint, (double x)) { return __builtin_rint(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/rintf.cpp b/libc/src/math/nvptx/rintf.cpp
new file mode 100644
index 000000000000..daf98d943605
--- /dev/null
+++ b/libc/src/math/nvptx/rintf.cpp
@@ -0,0 +1,16 @@
+//===-- Implementation of the GPU rintf function --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/rintf.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, rintf, (float x)) { return __builtin_rintf(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/round.cpp b/libc/src/math/nvptx/round.cpp
new file mode 100644
index 000000000000..9d8b5582f040
--- /dev/null
+++ b/libc/src/math/nvptx/round.cpp
@@ -0,0 +1,16 @@
+//===-- Implementation of the GPU round function --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/round.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, round, (double x)) { return __builtin_round(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/roundf.cpp b/libc/src/math/nvptx/roundf.cpp
new file mode 100644
index 000000000000..8743e4eb7fb8
--- /dev/null
+++ b/libc/src/math/nvptx/roundf.cpp
@@ -0,0 +1,16 @@
+//===-- Implementation of the GPU roundf function -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/roundf.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, roundf, (float x)) { return __builtin_roundf(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/scalbn.cpp b/libc/src/math/nvptx/scalbn.cpp
index 435533a90501..80374db4c1c9 100644
--- a/libc/src/math/gpu/vendor/scalbn.cpp
+++ b/libc/src/math/nvptx/scalbn.cpp
@@ -9,12 +9,12 @@
#include "src/math/scalbn.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
LLVM_LIBC_FUNCTION(double, scalbn, (double x, int y)) {
- return internal::scalbn(x, y);
+ return __nv_scalbn(x, y);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/scalbnf.cpp b/libc/src/math/nvptx/scalbnf.cpp
index 0a4844c8a433..24fa3a5ed698 100644
--- a/libc/src/math/gpu/vendor/scalbnf.cpp
+++ b/libc/src/math/nvptx/scalbnf.cpp
@@ -9,12 +9,12 @@
#include "src/math/scalbnf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
LLVM_LIBC_FUNCTION(float, scalbnf, (float x, int y)) {
- return internal::scalbnf(x, y);
+ return __nv_scalbnf(x, y);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/sin.cpp b/libc/src/math/nvptx/sin.cpp
index 96e07c99b71a..1bff129a0151 100644
--- a/libc/src/math/gpu/vendor/sin.cpp
+++ b/libc/src/math/nvptx/sin.cpp
@@ -9,10 +9,10 @@
#include "src/math/sin.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, sin, (double x)) { return internal::sin(x); }
+LLVM_LIBC_FUNCTION(double, sin, (double x)) { return __nv_sin(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/sincos.cpp b/libc/src/math/nvptx/sincos.cpp
index d882157b0bc8..73f92cfb7c34 100644
--- a/libc/src/math/gpu/vendor/sincos.cpp
+++ b/libc/src/math/nvptx/sincos.cpp
@@ -9,12 +9,12 @@
#include "src/math/sincos.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
LLVM_LIBC_FUNCTION(void, sincos, (double x, double *sinptr, double *cosptr)) {
- return internal::sincos(x, sinptr, cosptr);
+ return __nv_sincos(x, sinptr, cosptr);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/sincosf.cpp b/libc/src/math/nvptx/sincosf.cpp
new file mode 100644
index 000000000000..d053aa38151b
--- /dev/null
+++ b/libc/src/math/nvptx/sincosf.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of the sincosf function for GPU --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/sincosf.h"
+#include "src/__support/common.h"
+
+#include "declarations.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(void, sincosf, (float x, float *sinptr, float *cosptr)) {
+ return __nv_sincosf(x, sinptr, cosptr);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/sinf.cpp b/libc/src/math/nvptx/sinf.cpp
index af93227ab63b..9abd5cb4d5c6 100644
--- a/libc/src/math/gpu/vendor/sinf.cpp
+++ b/libc/src/math/nvptx/sinf.cpp
@@ -9,10 +9,10 @@
#include "src/math/sinf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, sinf, (float x)) { return internal::sinf(x); }
+LLVM_LIBC_FUNCTION(float, sinf, (float x)) { return __nv_sinf(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/sinh.cpp b/libc/src/math/nvptx/sinh.cpp
index be6b3ae2e2fa..dc6a1e16c634 100644
--- a/libc/src/math/gpu/vendor/sinh.cpp
+++ b/libc/src/math/nvptx/sinh.cpp
@@ -9,10 +9,10 @@
#include "src/math/sinh.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, sinh, (double x)) { return internal::sinh(x); }
+LLVM_LIBC_FUNCTION(double, sinh, (double x)) { return __nv_sinh(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/sinhf.cpp b/libc/src/math/nvptx/sinhf.cpp
index 99c399b62b7f..c9ab470ed823 100644
--- a/libc/src/math/gpu/vendor/sinhf.cpp
+++ b/libc/src/math/nvptx/sinhf.cpp
@@ -9,10 +9,10 @@
#include "src/math/sinhf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, sinhf, (float x)) { return internal::sinhf(x); }
+LLVM_LIBC_FUNCTION(float, sinhf, (float x)) { return __nv_sinhf(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/sqrt.cpp b/libc/src/math/nvptx/sqrt.cpp
new file mode 100644
index 000000000000..60ca5af4987b
--- /dev/null
+++ b/libc/src/math/nvptx/sqrt.cpp
@@ -0,0 +1,16 @@
+//===-- Implementation of the GPU sqrt function ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/sqrt.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, sqrt, (double x)) { return __builtin_sqrt(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/sqrtf.cpp b/libc/src/math/nvptx/sqrtf.cpp
new file mode 100644
index 000000000000..e17f942a4d5f
--- /dev/null
+++ b/libc/src/math/nvptx/sqrtf.cpp
@@ -0,0 +1,16 @@
+//===-- Implementation of the GPU sqrtf function --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/sqrtf.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, sqrtf, (float x)) { return __builtin_sqrtf(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/tan.cpp b/libc/src/math/nvptx/tan.cpp
index 9a1bd9c89fcf..deb03dca250a 100644
--- a/libc/src/math/gpu/vendor/tan.cpp
+++ b/libc/src/math/nvptx/tan.cpp
@@ -9,10 +9,10 @@
#include "src/math/tan.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, tan, (double x)) { return internal::tan(x); }
+LLVM_LIBC_FUNCTION(double, tan, (double x)) { return __nv_tan(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/tanf.cpp b/libc/src/math/nvptx/tanf.cpp
index a5266a8c154c..5739e4a1624d 100644
--- a/libc/src/math/gpu/vendor/tanf.cpp
+++ b/libc/src/math/nvptx/tanf.cpp
@@ -9,10 +9,10 @@
#include "src/math/tanf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, tanf, (float x)) { return internal::tanf(x); }
+LLVM_LIBC_FUNCTION(float, tanf, (float x)) { return __nv_tanf(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/tanh.cpp b/libc/src/math/nvptx/tanh.cpp
index 57d764f1f2a0..eabee2cbaf06 100644
--- a/libc/src/math/gpu/vendor/tanh.cpp
+++ b/libc/src/math/nvptx/tanh.cpp
@@ -9,10 +9,10 @@
#include "src/math/tanh.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, tanh, (double x)) { return internal::tanh(x); }
+LLVM_LIBC_FUNCTION(double, tanh, (double x)) { return __nv_tanh(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/tanhf.cpp b/libc/src/math/nvptx/tanhf.cpp
index 1c9c2f3843a9..582424cb9490 100644
--- a/libc/src/math/gpu/vendor/tanhf.cpp
+++ b/libc/src/math/nvptx/tanhf.cpp
@@ -9,10 +9,10 @@
#include "src/math/tanhf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, tanhf, (float x)) { return internal::tanhf(x); }
+LLVM_LIBC_FUNCTION(float, tanhf, (float x)) { return __nv_tanhf(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/tgamma.cpp b/libc/src/math/nvptx/tgamma.cpp
index e86116a2b0ab..f92193831f9b 100644
--- a/libc/src/math/gpu/vendor/tgamma.cpp
+++ b/libc/src/math/nvptx/tgamma.cpp
@@ -9,10 +9,10 @@
#include "src/math/tgamma.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(double, tgamma, (double x)) { return internal::tgamma(x); }
+LLVM_LIBC_FUNCTION(double, tgamma, (double x)) { return __nv_tgamma(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/gpu/vendor/tgammaf.cpp b/libc/src/math/nvptx/tgammaf.cpp
index 552919bae446..833994455d57 100644
--- a/libc/src/math/gpu/vendor/tgammaf.cpp
+++ b/libc/src/math/nvptx/tgammaf.cpp
@@ -9,10 +9,10 @@
#include "src/math/tgammaf.h"
#include "src/__support/common.h"
-#include "common.h"
+#include "declarations.h"
namespace LIBC_NAMESPACE {
-LLVM_LIBC_FUNCTION(float, tgammaf, (float x)) { return internal::tgammaf(x); }
+LLVM_LIBC_FUNCTION(float, tgammaf, (float x)) { return __nv_tgammaf(x); }
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/trunc.cpp b/libc/src/math/nvptx/trunc.cpp
new file mode 100644
index 000000000000..773600f0f250
--- /dev/null
+++ b/libc/src/math/nvptx/trunc.cpp
@@ -0,0 +1,16 @@
+//===-- Implementation of the GPU trunc function --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/trunc.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, trunc, (double x)) { return __builtin_trunc(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nvptx/truncf.cpp b/libc/src/math/nvptx/truncf.cpp
new file mode 100644
index 000000000000..534797a3e586
--- /dev/null
+++ b/libc/src/math/nvptx/truncf.cpp
@@ -0,0 +1,16 @@
+//===-- Implementation of the GPU truncf function -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/truncf.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, truncf, (float x)) { return __builtin_truncf(x); }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/CMakeLists.txt b/libc/src/stdbit/CMakeLists.txt
index 14cc26e206e0..8bc7dd7852bb 100644
--- a/libc/src/stdbit/CMakeLists.txt
+++ b/libc/src/stdbit/CMakeLists.txt
@@ -7,6 +7,9 @@ set(prefixes
first_leading_one
first_trailing_zero
first_trailing_one
+ count_zeros
+ count_ones
+ has_single_bit
)
set(suffixes c s i l ll)
foreach(prefix IN LISTS prefixes)
diff --git a/libc/src/stdbit/stdc_count_ones_uc.cpp b/libc/src/stdbit/stdc_count_ones_uc.cpp
new file mode 100644
index 000000000000..5a7314caa3ba
--- /dev/null
+++ b/libc/src/stdbit/stdc_count_ones_uc.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of stdc_count_ones_uc ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_count_ones_uc.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(unsigned, stdc_count_ones_uc, (unsigned char value)) {
+ return static_cast<unsigned>(cpp::count_ones(value));
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_count_ones_uc.h b/libc/src/stdbit/stdc_count_ones_uc.h
new file mode 100644
index 000000000000..eed3ee5f181b
--- /dev/null
+++ b/libc/src/stdbit/stdc_count_ones_uc.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_count_ones_uc -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_UC_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_UC_H
+
+namespace LIBC_NAMESPACE {
+
+unsigned stdc_count_ones_uc(unsigned char value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_UC_H
diff --git a/libc/src/stdbit/stdc_count_ones_ui.cpp b/libc/src/stdbit/stdc_count_ones_ui.cpp
new file mode 100644
index 000000000000..289f4bac31f7
--- /dev/null
+++ b/libc/src/stdbit/stdc_count_ones_ui.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of stdc_count_ones_ui ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_count_ones_ui.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(unsigned, stdc_count_ones_ui, (unsigned value)) {
+ return static_cast<unsigned>(cpp::count_ones(value));
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_count_ones_ui.h b/libc/src/stdbit/stdc_count_ones_ui.h
new file mode 100644
index 000000000000..1f7ccb9c502f
--- /dev/null
+++ b/libc/src/stdbit/stdc_count_ones_ui.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_count_ones_ui -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_UI_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_UI_H
+
+namespace LIBC_NAMESPACE {
+
+unsigned stdc_count_ones_ui(unsigned value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_UI_H
diff --git a/libc/src/stdbit/stdc_count_ones_ul.cpp b/libc/src/stdbit/stdc_count_ones_ul.cpp
new file mode 100644
index 000000000000..83f3279d7919
--- /dev/null
+++ b/libc/src/stdbit/stdc_count_ones_ul.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of stdc_count_ones_ul ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_count_ones_ul.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(unsigned, stdc_count_ones_ul, (unsigned long value)) {
+ return static_cast<unsigned>(cpp::count_ones(value));
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_count_ones_ul.h b/libc/src/stdbit/stdc_count_ones_ul.h
new file mode 100644
index 000000000000..bde349a2fb94
--- /dev/null
+++ b/libc/src/stdbit/stdc_count_ones_ul.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_count_ones_ul -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_UL_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_UL_H
+
+namespace LIBC_NAMESPACE {
+
+unsigned stdc_count_ones_ul(unsigned long value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_UL_H
diff --git a/libc/src/stdbit/stdc_count_ones_ull.cpp b/libc/src/stdbit/stdc_count_ones_ull.cpp
new file mode 100644
index 000000000000..104788aaf212
--- /dev/null
+++ b/libc/src/stdbit/stdc_count_ones_ull.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of stdc_count_ones_ull -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_count_ones_ull.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(unsigned, stdc_count_ones_ull, (unsigned long long value)) {
+ return static_cast<unsigned>(cpp::count_ones(value));
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_count_ones_ull.h b/libc/src/stdbit/stdc_count_ones_ull.h
new file mode 100644
index 000000000000..830239f8874b
--- /dev/null
+++ b/libc/src/stdbit/stdc_count_ones_ull.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_count_ones_ull -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_ULL_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_ULL_H
+
+namespace LIBC_NAMESPACE {
+
+unsigned stdc_count_ones_ull(unsigned long long value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_ULL_H
diff --git a/libc/src/stdbit/stdc_count_ones_us.cpp b/libc/src/stdbit/stdc_count_ones_us.cpp
new file mode 100644
index 000000000000..4b6ff0b94b62
--- /dev/null
+++ b/libc/src/stdbit/stdc_count_ones_us.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of stdc_count_ones_us ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_count_ones_us.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(unsigned, stdc_count_ones_us, (unsigned short value)) {
+ return static_cast<unsigned>(cpp::count_ones(value));
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_count_ones_us.h b/libc/src/stdbit/stdc_count_ones_us.h
new file mode 100644
index 000000000000..08fd4e76eaae
--- /dev/null
+++ b/libc/src/stdbit/stdc_count_ones_us.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_count_ones_us -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_US_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_US_H
+
+namespace LIBC_NAMESPACE {
+
+unsigned stdc_count_ones_us(unsigned short value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_US_H
diff --git a/libc/src/stdbit/stdc_count_zeros_uc.cpp b/libc/src/stdbit/stdc_count_zeros_uc.cpp
new file mode 100644
index 000000000000..22c57bd60c38
--- /dev/null
+++ b/libc/src/stdbit/stdc_count_zeros_uc.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of stdc_count_zeros_uc -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_count_zeros_uc.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(unsigned, stdc_count_zeros_uc, (unsigned char value)) {
+ return static_cast<unsigned>(cpp::count_zeros(value));
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_count_zeros_uc.h b/libc/src/stdbit/stdc_count_zeros_uc.h
new file mode 100644
index 000000000000..34b4636ee3f9
--- /dev/null
+++ b/libc/src/stdbit/stdc_count_zeros_uc.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_count_zeros_uc -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_UC_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_UC_H
+
+namespace LIBC_NAMESPACE {
+
+unsigned stdc_count_zeros_uc(unsigned char value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_UC_H
diff --git a/libc/src/stdbit/stdc_count_zeros_ui.cpp b/libc/src/stdbit/stdc_count_zeros_ui.cpp
new file mode 100644
index 000000000000..6a1defd9d555
--- /dev/null
+++ b/libc/src/stdbit/stdc_count_zeros_ui.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of stdc_count_zeros_ui -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_count_zeros_ui.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(unsigned, stdc_count_zeros_ui, (unsigned value)) {
+ return static_cast<unsigned>(cpp::count_zeros(value));
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_count_zeros_ui.h b/libc/src/stdbit/stdc_count_zeros_ui.h
new file mode 100644
index 000000000000..48e8630f6f09
--- /dev/null
+++ b/libc/src/stdbit/stdc_count_zeros_ui.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_count_zeros_ui -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_UI_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_UI_H
+
+namespace LIBC_NAMESPACE {
+
+unsigned stdc_count_zeros_ui(unsigned value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_UI_H
diff --git a/libc/src/stdbit/stdc_count_zeros_ul.cpp b/libc/src/stdbit/stdc_count_zeros_ul.cpp
new file mode 100644
index 000000000000..ceab32ef9ac3
--- /dev/null
+++ b/libc/src/stdbit/stdc_count_zeros_ul.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of stdc_count_zeros_ul -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_count_zeros_ul.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(unsigned, stdc_count_zeros_ul, (unsigned long value)) {
+ return static_cast<unsigned>(cpp::count_zeros(value));
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_count_zeros_ul.h b/libc/src/stdbit/stdc_count_zeros_ul.h
new file mode 100644
index 000000000000..b88387741ade
--- /dev/null
+++ b/libc/src/stdbit/stdc_count_zeros_ul.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_count_zeros_ul -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_UL_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_UL_H
+
+namespace LIBC_NAMESPACE {
+
+unsigned stdc_count_zeros_ul(unsigned long value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_UL_H
diff --git a/libc/src/stdbit/stdc_count_zeros_ull.cpp b/libc/src/stdbit/stdc_count_zeros_ull.cpp
new file mode 100644
index 000000000000..2f57f727a691
--- /dev/null
+++ b/libc/src/stdbit/stdc_count_zeros_ull.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of stdc_count_zeros_ull ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_count_zeros_ull.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(unsigned, stdc_count_zeros_ull, (unsigned long long value)) {
+ return static_cast<unsigned>(cpp::count_zeros(value));
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_count_zeros_ull.h b/libc/src/stdbit/stdc_count_zeros_ull.h
new file mode 100644
index 000000000000..e15b33011ab7
--- /dev/null
+++ b/libc/src/stdbit/stdc_count_zeros_ull.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_count_zeros_ull ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_ULL_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_ULL_H
+
+namespace LIBC_NAMESPACE {
+
+unsigned stdc_count_zeros_ull(unsigned long long value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_ULL_H
diff --git a/libc/src/stdbit/stdc_count_zeros_us.cpp b/libc/src/stdbit/stdc_count_zeros_us.cpp
new file mode 100644
index 000000000000..fc06836ee292
--- /dev/null
+++ b/libc/src/stdbit/stdc_count_zeros_us.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of stdc_count_zeros_us -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_count_zeros_us.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(unsigned, stdc_count_zeros_us, (unsigned short value)) {
+ return static_cast<unsigned>(cpp::count_zeros(value));
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_count_zeros_us.h b/libc/src/stdbit/stdc_count_zeros_us.h
new file mode 100644
index 000000000000..d422377f076b
--- /dev/null
+++ b/libc/src/stdbit/stdc_count_zeros_us.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_count_zeros_us -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_US_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_US_H
+
+namespace LIBC_NAMESPACE {
+
+unsigned stdc_count_zeros_us(unsigned short value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_US_H
diff --git a/libc/src/stdbit/stdc_has_single_bit_uc.cpp b/libc/src/stdbit/stdc_has_single_bit_uc.cpp
new file mode 100644
index 000000000000..e5acdc2a71b4
--- /dev/null
+++ b/libc/src/stdbit/stdc_has_single_bit_uc.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of stdc_has_single_bit_uc --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_has_single_bit_uc.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(bool, stdc_has_single_bit_uc, (unsigned char value)) {
+ return cpp::has_single_bit(value);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_has_single_bit_uc.h b/libc/src/stdbit/stdc_has_single_bit_uc.h
new file mode 100644
index 000000000000..028d4ee71050
--- /dev/null
+++ b/libc/src/stdbit/stdc_has_single_bit_uc.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_has_single_bit_uc --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UC_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UC_H
+
+namespace LIBC_NAMESPACE {
+
+bool stdc_has_single_bit_uc(unsigned char value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UC_H
diff --git a/libc/src/stdbit/stdc_has_single_bit_ui.cpp b/libc/src/stdbit/stdc_has_single_bit_ui.cpp
new file mode 100644
index 000000000000..37578882324a
--- /dev/null
+++ b/libc/src/stdbit/stdc_has_single_bit_ui.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of stdc_has_single_bit_ui --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_has_single_bit_ui.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(bool, stdc_has_single_bit_ui, (unsigned value)) {
+ return cpp::has_single_bit(value);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_has_single_bit_ui.h b/libc/src/stdbit/stdc_has_single_bit_ui.h
new file mode 100644
index 000000000000..1e8cd9afaee8
--- /dev/null
+++ b/libc/src/stdbit/stdc_has_single_bit_ui.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_has_single_bit_ui --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UI_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UI_H
+
+namespace LIBC_NAMESPACE {
+
+bool stdc_has_single_bit_ui(unsigned value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UI_H
diff --git a/libc/src/stdbit/stdc_has_single_bit_ul.cpp b/libc/src/stdbit/stdc_has_single_bit_ul.cpp
new file mode 100644
index 000000000000..85133ab81cc6
--- /dev/null
+++ b/libc/src/stdbit/stdc_has_single_bit_ul.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of stdc_has_single_bit_ul --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_has_single_bit_ul.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(bool, stdc_has_single_bit_ul, (unsigned long value)) {
+ return cpp::has_single_bit(value);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_has_single_bit_ul.h b/libc/src/stdbit/stdc_has_single_bit_ul.h
new file mode 100644
index 000000000000..9b924fca9f06
--- /dev/null
+++ b/libc/src/stdbit/stdc_has_single_bit_ul.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_has_single_bit_ul --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UL_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UL_H
+
+namespace LIBC_NAMESPACE {
+
+bool stdc_has_single_bit_ul(unsigned long value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UL_H
diff --git a/libc/src/stdbit/stdc_has_single_bit_ull.cpp b/libc/src/stdbit/stdc_has_single_bit_ull.cpp
new file mode 100644
index 000000000000..4491cf2b98b6
--- /dev/null
+++ b/libc/src/stdbit/stdc_has_single_bit_ull.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of stdc_has_single_bit_ull -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_has_single_bit_ull.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(bool, stdc_has_single_bit_ull, (unsigned long long value)) {
+ return cpp::has_single_bit(value);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_has_single_bit_ull.h b/libc/src/stdbit/stdc_has_single_bit_ull.h
new file mode 100644
index 000000000000..d4802bc28727
--- /dev/null
+++ b/libc/src/stdbit/stdc_has_single_bit_ull.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_has_single_bit_ull -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_ULL_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_ULL_H
+
+namespace LIBC_NAMESPACE {
+
+bool stdc_has_single_bit_ull(unsigned long long value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_ULL_H
diff --git a/libc/src/stdbit/stdc_has_single_bit_us.cpp b/libc/src/stdbit/stdc_has_single_bit_us.cpp
new file mode 100644
index 000000000000..7a42ae553aa2
--- /dev/null
+++ b/libc/src/stdbit/stdc_has_single_bit_us.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of stdc_has_single_bit_us --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_has_single_bit_us.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(bool, stdc_has_single_bit_us, (unsigned short value)) {
+ return cpp::has_single_bit(value);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_has_single_bit_us.h b/libc/src/stdbit/stdc_has_single_bit_us.h
new file mode 100644
index 000000000000..201ff4954c3b
--- /dev/null
+++ b/libc/src/stdbit/stdc_has_single_bit_us.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_has_single_bit_us --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_US_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_US_H
+
+namespace LIBC_NAMESPACE {
+
+bool stdc_has_single_bit_us(unsigned short value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_US_H
diff --git a/libc/src/stdfix/CMakeLists.txt b/libc/src/stdfix/CMakeLists.txt
index 6e2ed1bdfeaf..cb2134fe33cf 100644
--- a/libc/src/stdfix/CMakeLists.txt
+++ b/libc/src/stdfix/CMakeLists.txt
@@ -17,6 +17,21 @@ foreach(suffix IN ITEMS hr r lr hk k lk)
)
endforeach()
+foreach(suffix IN ITEMS uhr ur ulr uhk uk)
+ add_entrypoint_object(
+ sqrt${suffix}
+ HDRS
+ sqrt${suffix}.h
+ SRCS
+ sqrt${suffix}.cpp
+ COMPILE_OPTIONS
+ -O3
+ -ffixed-point
+ DEPENDS
+ libc.src.__support.fixed_point.sqrt
+ )
+endforeach()
+
foreach(suffix IN ITEMS hr r lr hk k lk uhr ur ulr uhk uk ulk)
add_entrypoint_object(
round${suffix}
diff --git a/libc/src/stdfix/abshk.h b/libc/src/stdfix/abshk.h
index 13c9300caab8..80dc73053dfb 100644
--- a/libc/src/stdfix/abshk.h
+++ b/libc/src/stdfix/abshk.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC_STDFIX_ABSHK_H
#define LLVM_LIBC_SRC_STDFIX_ABSHK_H
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
namespace LIBC_NAMESPACE {
diff --git a/libc/src/stdfix/abshr.h b/libc/src/stdfix/abshr.h
index 5acd0cfc4a60..035f9a6de222 100644
--- a/libc/src/stdfix/abshr.h
+++ b/libc/src/stdfix/abshr.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC_STDFIX_ABSHR_H
#define LLVM_LIBC_SRC_STDFIX_ABSHR_H
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
namespace LIBC_NAMESPACE {
diff --git a/libc/src/stdfix/absk.h b/libc/src/stdfix/absk.h
index 73dfcac0ac8e..426415de28e6 100644
--- a/libc/src/stdfix/absk.h
+++ b/libc/src/stdfix/absk.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC_STDFIX_ABSK_H
#define LLVM_LIBC_SRC_STDFIX_ABSK_H
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
namespace LIBC_NAMESPACE {
diff --git a/libc/src/stdfix/abslk.h b/libc/src/stdfix/abslk.h
index 7de116fa2279..21e33f856bfc 100644
--- a/libc/src/stdfix/abslk.h
+++ b/libc/src/stdfix/abslk.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC_STDFIX_ABSLK_H
#define LLVM_LIBC_SRC_STDFIX_ABSLK_H
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
namespace LIBC_NAMESPACE {
diff --git a/libc/src/stdfix/abslr.h b/libc/src/stdfix/abslr.h
index bf5b585bbbb6..ebca35e58aa5 100644
--- a/libc/src/stdfix/abslr.h
+++ b/libc/src/stdfix/abslr.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC_STDFIX_ABSLR_H
#define LLVM_LIBC_SRC_STDFIX_ABSLR_H
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
namespace LIBC_NAMESPACE {
diff --git a/libc/src/stdfix/absr.h b/libc/src/stdfix/absr.h
index b5ead7ce14e2..2744fcb5a7ec 100644
--- a/libc/src/stdfix/absr.h
+++ b/libc/src/stdfix/absr.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC_STDFIX_ABSR_H
#define LLVM_LIBC_SRC_STDFIX_ABSR_H
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
namespace LIBC_NAMESPACE {
diff --git a/libc/src/stdfix/roundhk.h b/libc/src/stdfix/roundhk.h
index 9a5c874cc030..06de5cc05cdb 100644
--- a/libc/src/stdfix/roundhk.h
+++ b/libc/src/stdfix/roundhk.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC_STDFIX_ROUNDHK_H
#define LLVM_LIBC_SRC_STDFIX_ROUNDHK_H
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
namespace LIBC_NAMESPACE {
diff --git a/libc/src/stdfix/roundhr.h b/libc/src/stdfix/roundhr.h
index ba5a67945d6c..6729bf5b1399 100644
--- a/libc/src/stdfix/roundhr.h
+++ b/libc/src/stdfix/roundhr.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC_STDFIX_ROUNDHR_H
#define LLVM_LIBC_SRC_STDFIX_ROUNDHR_H
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
namespace LIBC_NAMESPACE {
diff --git a/libc/src/stdfix/roundk.h b/libc/src/stdfix/roundk.h
index e9fa6d8f9c3b..02fb9a8c9b1a 100644
--- a/libc/src/stdfix/roundk.h
+++ b/libc/src/stdfix/roundk.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC_STDFIX_ROUNDK_H
#define LLVM_LIBC_SRC_STDFIX_ROUNDK_H
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
namespace LIBC_NAMESPACE {
diff --git a/libc/src/stdfix/roundlk.h b/libc/src/stdfix/roundlk.h
index 5fa0e90e855a..28be9c005494 100644
--- a/libc/src/stdfix/roundlk.h
+++ b/libc/src/stdfix/roundlk.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC_STDFIX_ROUNDLK_H
#define LLVM_LIBC_SRC_STDFIX_ROUNDLK_H
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
namespace LIBC_NAMESPACE {
diff --git a/libc/src/stdfix/roundlr.h b/libc/src/stdfix/roundlr.h
index c015292e8f3f..be97a35a6420 100644
--- a/libc/src/stdfix/roundlr.h
+++ b/libc/src/stdfix/roundlr.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC_STDFIX_ROUNDLR_H
#define LLVM_LIBC_SRC_STDFIX_ROUNDLR_H
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
namespace LIBC_NAMESPACE {
diff --git a/libc/src/stdfix/roundr.h b/libc/src/stdfix/roundr.h
index b5b1375c882e..15523f8b6c9a 100644
--- a/libc/src/stdfix/roundr.h
+++ b/libc/src/stdfix/roundr.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC_STDFIX_ROUNDR_H
#define LLVM_LIBC_SRC_STDFIX_ROUNDR_H
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
namespace LIBC_NAMESPACE {
diff --git a/libc/src/stdfix/rounduhk.h b/libc/src/stdfix/rounduhk.h
index 85ebf2903ec7..d1c4a4416d76 100644
--- a/libc/src/stdfix/rounduhk.h
+++ b/libc/src/stdfix/rounduhk.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC_STDFIX_ROUNDUHK_H
#define LLVM_LIBC_SRC_STDFIX_ROUNDUHK_H
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
namespace LIBC_NAMESPACE {
diff --git a/libc/src/stdfix/rounduhr.h b/libc/src/stdfix/rounduhr.h
index 1be0aab1f5a7..6cecb733dd3b 100644
--- a/libc/src/stdfix/rounduhr.h
+++ b/libc/src/stdfix/rounduhr.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC_STDFIX_ROUNDUHR_H
#define LLVM_LIBC_SRC_STDFIX_ROUNDUHR_H
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
namespace LIBC_NAMESPACE {
diff --git a/libc/src/stdfix/rounduk.h b/libc/src/stdfix/rounduk.h
index 8dae89586c49..4511d69525c5 100644
--- a/libc/src/stdfix/rounduk.h
+++ b/libc/src/stdfix/rounduk.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC_STDFIX_ROUNDUK_H
#define LLVM_LIBC_SRC_STDFIX_ROUNDUK_H
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
namespace LIBC_NAMESPACE {
diff --git a/libc/src/stdfix/roundulk.h b/libc/src/stdfix/roundulk.h
index 81dfd1dceb60..8bd90beeb830 100644
--- a/libc/src/stdfix/roundulk.h
+++ b/libc/src/stdfix/roundulk.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC_STDFIX_ROUNDULK_H
#define LLVM_LIBC_SRC_STDFIX_ROUNDULK_H
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
namespace LIBC_NAMESPACE {
diff --git a/libc/src/stdfix/roundulr.h b/libc/src/stdfix/roundulr.h
index 002fc94907c6..65e5c27b1c85 100644
--- a/libc/src/stdfix/roundulr.h
+++ b/libc/src/stdfix/roundulr.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC_STDFIX_ROUNDULR_H
#define LLVM_LIBC_SRC_STDFIX_ROUNDULR_H
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
namespace LIBC_NAMESPACE {
diff --git a/libc/src/stdfix/roundur.h b/libc/src/stdfix/roundur.h
index 72de44b1e0c4..110e578da793 100644
--- a/libc/src/stdfix/roundur.h
+++ b/libc/src/stdfix/roundur.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC_STDFIX_ROUNDUR_H
#define LLVM_LIBC_SRC_STDFIX_ROUNDUR_H
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
namespace LIBC_NAMESPACE {
diff --git a/libc/src/stdfix/sqrtuhk.cpp b/libc/src/stdfix/sqrtuhk.cpp
new file mode 100644
index 000000000000..e8dc842c8a99
--- /dev/null
+++ b/libc/src/stdfix/sqrtuhk.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of sqrtuhk function --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sqrtuhk.h"
+#include "src/__support/common.h"
+#include "src/__support/fixed_point/sqrt.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(unsigned short accum, sqrtuhk, (unsigned short accum x)) {
+ return fixed_point::sqrt(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdfix/sqrtuhk.h b/libc/src/stdfix/sqrtuhk.h
new file mode 100644
index 000000000000..b57340003fa0
--- /dev/null
+++ b/libc/src/stdfix/sqrtuhk.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for sqrtuhk -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDFIX_SQRTUHK_H
+#define LLVM_LIBC_SRC_STDFIX_SQRTUHK_H
+
+#include "llvm-libc-macros/stdfix-macros.h"
+
+namespace LIBC_NAMESPACE {
+
+unsigned short accum sqrtuhk(unsigned short accum x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDFIX_SQRTUHK_H
diff --git a/libc/src/stdfix/sqrtuhr.cpp b/libc/src/stdfix/sqrtuhr.cpp
new file mode 100644
index 000000000000..6bba07aa20d5
--- /dev/null
+++ b/libc/src/stdfix/sqrtuhr.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of sqrtuhr function --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sqrtuhr.h"
+#include "src/__support/common.h"
+#include "src/__support/fixed_point/sqrt.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(unsigned short fract, sqrtuhr, (unsigned short fract x)) {
+ return fixed_point::sqrt(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdfix/sqrtuhr.h b/libc/src/stdfix/sqrtuhr.h
new file mode 100644
index 000000000000..6b629a29de3c
--- /dev/null
+++ b/libc/src/stdfix/sqrtuhr.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for sqrtuhr -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDFIX_SQRTUHR_H
+#define LLVM_LIBC_SRC_STDFIX_SQRTUHR_H
+
+#include "llvm-libc-macros/stdfix-macros.h"
+
+namespace LIBC_NAMESPACE {
+
+unsigned short fract sqrtuhr(unsigned short fract x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDFIX_SQRTUHR_H
diff --git a/libc/src/stdfix/sqrtuk.cpp b/libc/src/stdfix/sqrtuk.cpp
new file mode 100644
index 000000000000..6e5d8118c83b
--- /dev/null
+++ b/libc/src/stdfix/sqrtuk.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of sqrtuk function ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sqrtuk.h"
+#include "src/__support/common.h"
+#include "src/__support/fixed_point/sqrt.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(unsigned accum, sqrtuk, (unsigned accum x)) {
+ return fixed_point::sqrt(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdfix/sqrtuk.h b/libc/src/stdfix/sqrtuk.h
new file mode 100644
index 000000000000..6bd7a2608716
--- /dev/null
+++ b/libc/src/stdfix/sqrtuk.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for sqrtuk ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDFIX_SQRTUK_H
+#define LLVM_LIBC_SRC_STDFIX_SQRTUK_H
+
+#include "llvm-libc-macros/stdfix-macros.h"
+
+namespace LIBC_NAMESPACE {
+
+unsigned accum sqrtuk(unsigned accum x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDFIX_SQRTUK_H
diff --git a/libc/src/stdfix/sqrtulr.cpp b/libc/src/stdfix/sqrtulr.cpp
new file mode 100644
index 000000000000..c9e5cd51f66b
--- /dev/null
+++ b/libc/src/stdfix/sqrtulr.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of sqrtulr function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sqrtulr.h"
+#include "src/__support/common.h"
+#include "src/__support/fixed_point/sqrt.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(unsigned long fract, sqrtulr, (unsigned long fract x)) {
+ return fixed_point::sqrt(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdfix/sqrtulr.h b/libc/src/stdfix/sqrtulr.h
new file mode 100644
index 000000000000..d1982a6b1c05
--- /dev/null
+++ b/libc/src/stdfix/sqrtulr.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for sqrtulr -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDFIX_SQRTULR_H
+#define LLVM_LIBC_SRC_STDFIX_SQRTULR_H
+
+#include "llvm-libc-macros/stdfix-macros.h"
+
+namespace LIBC_NAMESPACE {
+
+unsigned long fract sqrtulr(unsigned long fract x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDFIX_SQRTULR_H
diff --git a/libc/src/stdfix/sqrtur.cpp b/libc/src/stdfix/sqrtur.cpp
new file mode 100644
index 000000000000..ac5be8491084
--- /dev/null
+++ b/libc/src/stdfix/sqrtur.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of sqrtur function ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sqrtur.h"
+#include "src/__support/common.h"
+#include "src/__support/fixed_point/sqrt.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(unsigned fract, sqrtur, (unsigned fract x)) {
+ return fixed_point::sqrt(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdfix/sqrtur.h b/libc/src/stdfix/sqrtur.h
new file mode 100644
index 000000000000..13f7d1e5e466
--- /dev/null
+++ b/libc/src/stdfix/sqrtur.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for sqrtur ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDFIX_SQRTUR_H
+#define LLVM_LIBC_SRC_STDFIX_SQRTUR_H
+
+#include "llvm-libc-macros/stdfix-macros.h"
+
+namespace LIBC_NAMESPACE {
+
+unsigned fract sqrtur(unsigned fract x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDFIX_SQRTUR_H
diff --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt
index 9b81a7abb6cf..02819ea25ea0 100644
--- a/libc/src/stdio/printf_core/CMakeLists.txt
+++ b/libc/src/stdio/printf_core/CMakeLists.txt
@@ -10,6 +10,9 @@ endif()
if(LIBC_CONF_PRINTF_FLOAT_TO_STR_USE_MEGA_LONG_DOUBLE_TABLE)
list(APPEND printf_config_copts "-DLIBC_COPT_FLOAT_TO_STR_USE_MEGA_LONG_DOUBLE_TABLE")
endif()
+if(LIBC_CONF_PRINTF_DISABLE_FIXED_POINT)
+ list(APPEND printf_config_copts "-DLIBC_COPT_PRINTF_DISABLE_FIXED_POINT")
+endif()
if(printf_config_copts)
list(PREPEND printf_config_copts "COMPILE_OPTIONS")
endif()
@@ -76,6 +79,7 @@ add_object_library(
float_inf_nan_converter.h
float_hex_converter.h
float_dec_converter.h
+ fixed_converter.h #TODO: Check if this should be disabled when fixed unavail
DEPENDS
.writer
.core_structs
diff --git a/libc/src/stdio/printf_core/converter.cpp b/libc/src/stdio/printf_core/converter.cpp
index 52412aef3c5c..613d693c3cfc 100644
--- a/libc/src/stdio/printf_core/converter.cpp
+++ b/libc/src/stdio/printf_core/converter.cpp
@@ -9,6 +9,7 @@
#include "src/stdio/printf_core/converter.h"
#include "src/stdio/printf_core/core_structs.h"
+#include "src/stdio/printf_core/printf_config.h"
#include "src/stdio/printf_core/writer.h"
// This option allows for replacing all of the conversion functions with custom
@@ -75,6 +76,13 @@ int convert(Writer *writer, const FormatSection &to_conv) {
case 'G':
return convert_float_dec_auto(writer, to_conv);
#endif // LIBC_COPT_PRINTF_DISABLE_FLOAT
+#ifdef LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT
+ case 'r':
+ case 'R':
+ case 'k':
+ case 'K':
+ return convert_fixed(writer, to_conv);
+#endif // LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT
#ifndef LIBC_COPT_PRINTF_DISABLE_WRITE_INT
case 'n':
return convert_write_int(writer, to_conv);
diff --git a/libc/src/stdio/printf_core/converter_atlas.h b/libc/src/stdio/printf_core/converter_atlas.h
index 6471f3f2955b..2189ed11a551 100644
--- a/libc/src/stdio/printf_core/converter_atlas.h
+++ b/libc/src/stdio/printf_core/converter_atlas.h
@@ -31,6 +31,11 @@
#include "src/stdio/printf_core/float_hex_converter.h"
#endif // LIBC_COPT_PRINTF_DISABLE_FLOAT
+#ifdef LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT
+// defines convert_fixed
+#include "src/stdio/printf_core/fixed_converter.h"
+#endif // LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT
+
#ifndef LIBC_COPT_PRINTF_DISABLE_WRITE_INT
#include "src/stdio/printf_core/write_int_converter.h"
#endif // LIBC_COPT_PRINTF_DISABLE_WRITE_INT
diff --git a/libc/src/stdio/printf_core/converter_utils.h b/libc/src/stdio/printf_core/converter_utils.h
index 54f0a870d0ac..948fe816e9b7 100644
--- a/libc/src/stdio/printf_core/converter_utils.h
+++ b/libc/src/stdio/printf_core/converter_utils.h
@@ -51,6 +51,9 @@ LIBC_INLINE uintmax_t apply_length_modifier(uintmax_t num, LengthModifier lm) {
return result; \
}
+// This is used to represent which direction the number should be rounded.
+enum class RoundDirection { Up, Down, Even };
+
} // namespace printf_core
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdio/printf_core/core_structs.h b/libc/src/stdio/printf_core/core_structs.h
index 7634d45568ab..d3718b49d1b1 100644
--- a/libc/src/stdio/printf_core/core_structs.h
+++ b/libc/src/stdio/printf_core/core_structs.h
@@ -10,7 +10,9 @@
#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_CORE_STRUCTS_H
#include "src/__support/CPP/string_view.h"
+#include "src/__support/CPP/type_traits.h"
#include "src/__support/FPUtil/FPBits.h"
+#include "src/stdio/printf_core/printf_config.h"
#include <inttypes.h>
#include <stddef.h>
@@ -77,7 +79,13 @@ struct FormatSection {
}
};
-enum PrimaryType : uint8_t { Unknown = 0, Float = 1, Pointer = 2, Integer = 3 };
+enum PrimaryType : uint8_t {
+ Unknown = 0,
+ Float = 1,
+ Pointer = 2,
+ Integer = 3,
+ FixedPoint = 4,
+};
// TypeDesc stores the information about a type that is relevant to printf in
// a relatively compact manner.
@@ -95,9 +103,16 @@ template <typename T> LIBC_INLINE constexpr TypeDesc type_desc_from_type() {
} else {
constexpr bool IS_POINTER = cpp::is_pointer_v<T>;
constexpr bool IS_FLOAT = cpp::is_floating_point_v<T>;
- return TypeDesc{sizeof(T), IS_POINTER ? PrimaryType::Pointer
- : IS_FLOAT ? PrimaryType::Float
- : PrimaryType::Integer};
+#ifdef LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT
+ constexpr bool IS_FIXED_POINT = cpp::is_fixed_point_v<T>;
+#else
+ constexpr bool IS_FIXED_POINT = false;
+#endif // LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT
+
+ return TypeDesc{sizeof(T), IS_POINTER ? PrimaryType::Pointer
+ : IS_FLOAT ? PrimaryType::Float
+ : IS_FIXED_POINT ? PrimaryType::FixedPoint
+ : PrimaryType::Integer};
}
}
@@ -109,6 +124,7 @@ constexpr int FILE_WRITE_ERROR = -1;
constexpr int FILE_STATUS_ERROR = -2;
constexpr int NULLPTR_WRITE_ERROR = -3;
constexpr int INT_CONVERSION_ERROR = -4;
+constexpr int FIXED_POINT_CONVERSION_ERROR = -5;
} // namespace printf_core
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdio/printf_core/fixed_converter.h b/libc/src/stdio/printf_core/fixed_converter.h
new file mode 100644
index 000000000000..c89971e20686
--- /dev/null
+++ b/libc/src/stdio/printf_core/fixed_converter.h
@@ -0,0 +1,309 @@
+//===-- Fixed Point Converter for printf ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_FIXED_CONVERTER_H
+#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_FIXED_CONVERTER_H
+
+#include "llvm-libc-macros/stdfix-macros.h"
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/fixed_point/fx_bits.h"
+#include "src/__support/fixed_point/fx_rep.h"
+#include "src/__support/integer_to_string.h"
+#include "src/__support/libc_assert.h"
+#include "src/stdio/printf_core/converter_utils.h"
+#include "src/stdio/printf_core/core_structs.h"
+#include "src/stdio/printf_core/writer.h"
+
+#include <inttypes.h>
+#include <stddef.h>
+
+namespace LIBC_NAMESPACE {
+namespace printf_core {
+
+// This is just for assertions. It will be compiled out for release builds.
+LIBC_INLINE constexpr uint32_t const_ten_exp(uint32_t exponent) {
+ uint32_t result = 1;
+ LIBC_ASSERT(exponent < 11);
+ for (uint32_t i = 0; i < exponent; ++i)
+ result *= 10;
+
+ return result;
+}
+
+#define READ_FX_BITS(TYPE) \
+ do { \
+ auto fixed_bits = fixed_point::FXBits<TYPE>( \
+ fixed_point::FXRep<TYPE>::StorageType(to_conv.conv_val_raw)); \
+ integral = fixed_bits.get_integral(); \
+ fractional = fixed_bits.get_fraction(); \
+ exponent = fixed_bits.get_exponent(); \
+ is_negative = fixed_bits.get_sign(); \
+ } while (false)
+
+#define APPLY_FX_LENGTH_MODIFIER(LENGTH_MODIFIER) \
+ do { \
+ if (to_conv.conv_name == 'r') { \
+ READ_FX_BITS(LENGTH_MODIFIER fract); \
+ } else if (to_conv.conv_name == 'R') { \
+ READ_FX_BITS(unsigned LENGTH_MODIFIER fract); \
+ } else if (to_conv.conv_name == 'k') { \
+ READ_FX_BITS(LENGTH_MODIFIER accum); \
+ } else if (to_conv.conv_name == 'K') { \
+ READ_FX_BITS(unsigned LENGTH_MODIFIER accum); \
+ } else { \
+ LIBC_ASSERT(false && "Invalid conversion name passed to convert_fixed"); \
+ return FIXED_POINT_CONVERSION_ERROR; \
+ } \
+ } while (false)
+
+LIBC_INLINE int convert_fixed(Writer *writer, const FormatSection &to_conv) {
+ // Long accum should be the largest type, so we can store all the smaller
+ // numbers in things sized for it.
+ using LARep = fixed_point::FXRep<unsigned long accum>;
+ using StorageType = LARep::StorageType;
+
+ // All of the letters will be defined relative to variable a, which will be
+ // the appropriate case based on the name of the conversion. This converts any
+ // conversion name into the letter 'a' with the appropriate case.
+ const char a = (to_conv.conv_name & 32) | 'A';
+ FormatFlags flags = to_conv.flags;
+
+ bool is_negative;
+ int exponent;
+ StorageType integral;
+ StorageType fractional;
+
+ // r = fract
+ // k = accum
+ // lowercase = signed
+ // uppercase = unsigned
+ // h = short
+ // l = long
+ // any other length modifier has no effect
+
+ if (to_conv.length_modifier == LengthModifier::h) {
+ APPLY_FX_LENGTH_MODIFIER(short);
+ } else if (to_conv.length_modifier == LengthModifier::l) {
+ APPLY_FX_LENGTH_MODIFIER(long);
+ } else {
+ APPLY_FX_LENGTH_MODIFIER();
+ }
+
+ LIBC_ASSERT(static_cast<size_t>(exponent) <=
+ (sizeof(StorageType) - sizeof(uint32_t)) * CHAR_BIT &&
+ "StorageType must be large enough to hold the fractional "
+ "component multiplied by a 32 bit number.");
+
+ // If to_conv doesn't specify a precision, the precision defaults to 6.
+ const size_t precision = to_conv.precision < 0 ? 6 : to_conv.precision;
+ bool has_decimal_point =
+ (precision > 0) || ((flags & FormatFlags::ALTERNATE_FORM) != 0);
+
+ // The number of non-zero digits below the decimal point for a negative power
+ // of 2 in base 10 is equal to the magnitude of the power of 2.
+
+ // A quick proof:
+ // Let p be any positive integer.
+ // Let e = 2^(-p)
+ // Let t be a positive integer such that e * 10^t is an integer.
+ // By definition: The smallest allowed value of t must be equal to the number
+ // of non-zero digits below the decimal point in e.
+ // If we evaluate e * 10^t we get the following:
+ // e * 10^t = 2^(-p) * 10^t = 2^(-p) * 2^t * 5^t = 5^t * 2^(t-p)
+ // For 5^t * 2^(t-p) to be an integer, both exponents must be non-negative,
+ // since 5 and 2 are coprime.
+ // The smallest value of t such that t-p is non-negative is p.
+ // Therefore, the number of non-zero digits below the decimal point for a given
+ // negative power of 2 "p" is equal to the value of p.
+
+ constexpr size_t MAX_FRACTION_DIGITS = LARep::FRACTION_LEN;
+
+ char fraction_digits[MAX_FRACTION_DIGITS];
+
+ size_t valid_fraction_digits = 0;
+
+ // TODO: Factor this part out
+ while (fractional > 0) {
+ uint32_t cur_digits = 0;
+ // 10^9 is used since it's the largest power of 10 that fits in a uint32_t
+ constexpr uint32_t TEN_EXP_NINE = 1000000000;
+ constexpr size_t DIGITS_PER_BLOCK = 9;
+
+ // Multiply by 10^9, then grab the digits above the decimal point, then
+ // clear those digits in fractional.
+ fractional = fractional * TEN_EXP_NINE;
+ cur_digits = static_cast<uint32_t>(fractional >> exponent);
+ fractional = fractional % (StorageType(1) << exponent);
+
+ // we add TEN_EXP_NINE to force leading zeroes to show up, then we skip the
+ // first digit in the loop.
+ const IntegerToString<uint32_t> cur_fractional_digits(cur_digits +
+ TEN_EXP_NINE);
+ for (size_t i = 0;
+ i < DIGITS_PER_BLOCK && valid_fraction_digits < MAX_FRACTION_DIGITS;
+ ++i, ++valid_fraction_digits)
+ fraction_digits[valid_fraction_digits] =
+ cur_fractional_digits.view()[i + 1];
+
+ if (valid_fraction_digits >= MAX_FRACTION_DIGITS) {
+ LIBC_ASSERT(fractional == 0 && "If the fraction digit buffer is full, "
+ "there should be no remaining digits.");
+ /*
+ A visual explanation of what this assert is checking:
+
+ 32 digits (max for 32 bit fract)
+ +------------------------------++--+--- must be zero
+ | || |
+ 123456789012345678901234567890120000
+ | || || || |
+ +-------++-------++-------++-------+
+ 9 digit blocks
+ */
+ LIBC_ASSERT(cur_digits % const_ten_exp(
+ DIGITS_PER_BLOCK -
+ (MAX_FRACTION_DIGITS % DIGITS_PER_BLOCK)) ==
+ 0 &&
+ "Digits after the MAX_FRACTION_DIGITS should all be zero.");
+ valid_fraction_digits = MAX_FRACTION_DIGITS;
+ }
+ }
+
+ if (precision < valid_fraction_digits) {
+ // Handle rounding. Just do round to nearest, tie to even since it's
+ // unspecified.
+ RoundDirection round;
+ char first_digit_after = fraction_digits[precision];
+ if (first_digit_after > '5') {
+ round = RoundDirection::Up;
+ } else if (first_digit_after < '5') {
+ round = RoundDirection::Down;
+ } else {
+ // first_digit_after == '5'
+ // need to check the remaining digits, but default to even.
+ round = RoundDirection::Even;
+ for (size_t cur_digit_index = precision + 1;
+ cur_digit_index + 1 < valid_fraction_digits; ++cur_digit_index) {
+ if (fraction_digits[cur_digit_index] != '0') {
+ round = RoundDirection::Up;
+ break;
+ }
+ }
+ }
+
+ // If we need to actually perform rounding, do so.
+ if (round == RoundDirection::Up || round == RoundDirection::Even) {
+ bool keep_rounding = true;
+ int digit_to_round = static_cast<int>(precision) - 1;
+ for (; digit_to_round >= 0 && keep_rounding; --digit_to_round) {
+ keep_rounding = false;
+ char cur_digit = fraction_digits[digit_to_round];
+ // if the digit should not be rounded up
+ if (round == RoundDirection::Even && ((cur_digit - '0') % 2) == 0) {
+ // break out of the loop
+ break;
+ }
+ fraction_digits[digit_to_round] += 1;
+
+ // if the digit was a 9, instead replace with a 0.
+ if (cur_digit == '9') {
+ fraction_digits[digit_to_round] = '0';
+ keep_rounding = true;
+ }
+ }
+
+ // if every digit below the decimal point was rounded up but we need to
+ // keep rounding
+ if (keep_rounding &&
+ (round == RoundDirection::Up ||
+ (round == RoundDirection::Even && ((integral % 2) == 1)))) {
+ // add one to the integral portion to round it up.
+ ++integral;
+ }
+ }
+
+ valid_fraction_digits = precision;
+ }
+
+ const IntegerToString<StorageType> integral_str(integral);
+
+ // padding is signed to prevent underflow due to negative values. The
+ // eventual value will always be non-negative.
+ size_t trailing_zeroes = 0;
+ int padding;
+
+ // If the precision is greater than the actual result, pad with 0s
+ if (precision > valid_fraction_digits)
+ trailing_zeroes = precision - (valid_fraction_digits);
+
+ constexpr cpp::string_view DECIMAL_POINT(".");
+
+ char sign_char = 0;
+
+ // Check if the conv name is uppercase
+ if (a == 'A') {
+ // These flags are only for signed conversions, so this removes them if the
+ // conversion is unsigned.
+ flags = FormatFlags(flags &
+ ~(FormatFlags::FORCE_SIGN | FormatFlags::SPACE_PREFIX));
+ }
+
+ if (is_negative)
+ sign_char = '-';
+ else if ((flags & FormatFlags::FORCE_SIGN) == FormatFlags::FORCE_SIGN)
+ sign_char = '+'; // FORCE_SIGN has precedence over SPACE_PREFIX
+ else if ((flags & FormatFlags::SPACE_PREFIX) == FormatFlags::SPACE_PREFIX)
+ sign_char = ' ';
+
+ padding = static_cast<int>(to_conv.min_width - (sign_char > 0 ? 1 : 0) -
+ integral_str.size() -
+ static_cast<int>(has_decimal_point) -
+ valid_fraction_digits - trailing_zeroes);
+ if (padding < 0)
+ padding = 0;
+
+ if ((flags & FormatFlags::LEFT_JUSTIFIED) == FormatFlags::LEFT_JUSTIFIED) {
+ // The pattern is (sign), integral, (.), (fraction), (zeroes), (spaces)
+ if (sign_char > 0)
+ RET_IF_RESULT_NEGATIVE(writer->write(sign_char));
+ RET_IF_RESULT_NEGATIVE(writer->write(integral_str.view()));
+ if (has_decimal_point)
+ RET_IF_RESULT_NEGATIVE(writer->write(DECIMAL_POINT));
+ if (valid_fraction_digits > 0)
+ RET_IF_RESULT_NEGATIVE(
+ writer->write({fraction_digits, valid_fraction_digits}));
+ if (trailing_zeroes > 0)
+ RET_IF_RESULT_NEGATIVE(writer->write('0', trailing_zeroes));
+ if (padding > 0)
+ RET_IF_RESULT_NEGATIVE(writer->write(' ', padding));
+ } else {
+ // The pattern is (spaces), (sign), (zeroes), integral, (.), (fraction),
+ // (zeroes)
+ if ((padding > 0) &&
+ ((flags & FormatFlags::LEADING_ZEROES) != FormatFlags::LEADING_ZEROES))
+ RET_IF_RESULT_NEGATIVE(writer->write(' ', padding));
+ if (sign_char > 0)
+ RET_IF_RESULT_NEGATIVE(writer->write(sign_char));
+ if ((padding > 0) &&
+ ((flags & FormatFlags::LEADING_ZEROES) == FormatFlags::LEADING_ZEROES))
+ RET_IF_RESULT_NEGATIVE(writer->write('0', padding));
+ RET_IF_RESULT_NEGATIVE(writer->write(integral_str.view()));
+ if (has_decimal_point)
+ RET_IF_RESULT_NEGATIVE(writer->write(DECIMAL_POINT));
+ if (valid_fraction_digits > 0)
+ RET_IF_RESULT_NEGATIVE(
+ writer->write({fraction_digits, valid_fraction_digits}));
+ if (trailing_zeroes > 0)
+ RET_IF_RESULT_NEGATIVE(writer->write('0', trailing_zeroes));
+ }
+ return WRITE_OK;
+}
+
+} // namespace printf_core
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_FIXED_CONVERTER_H
diff --git a/libc/src/stdio/printf_core/float_dec_converter.h b/libc/src/stdio/printf_core/float_dec_converter.h
index b54526d37108..a6c68329e660 100644
--- a/libc/src/stdio/printf_core/float_dec_converter.h
+++ b/libc/src/stdio/printf_core/float_dec_converter.h
@@ -45,9 +45,6 @@ constexpr uint32_t MAX_BLOCK = 999999999;
// constexpr uint32_t MAX_BLOCK = 999999999999999999;
constexpr char DECIMAL_POINT = '.';
-// This is used to represent which direction the number should be rounded.
-enum class RoundDirection { Up, Down, Even };
-
LIBC_INLINE RoundDirection get_round_direction(int last_digit, bool truncated,
fputil::Sign sign) {
switch (fputil::quick_get_round()) {
diff --git a/libc/src/stdio/printf_core/parser.h b/libc/src/stdio/printf_core/parser.h
index 1e7d2e58c924..13fdbf243a22 100644
--- a/libc/src/stdio/printf_core/parser.h
+++ b/libc/src/stdio/printf_core/parser.h
@@ -9,13 +9,19 @@
#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PARSER_H
#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PARSER_H
+#include "llvm-libc-macros/stdfix-macros.h"
#include "src/__support/CPP/optional.h"
+#include "src/__support/CPP/type_traits.h"
#include "src/__support/str_to_integer.h"
#include "src/stdio/printf_core/core_structs.h"
#include "src/stdio/printf_core/printf_config.h"
#include <stddef.h>
+#ifdef LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT
+#include "src/__support/fixed_point/fx_rep.h"
+#endif // LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT
+
namespace LIBC_NAMESPACE {
namespace printf_core {
@@ -28,6 +34,14 @@ template <> struct int_type_of<double> {
template <> struct int_type_of<long double> {
using type = fputil::FPBits<long double>::StorageType;
};
+
+#ifdef LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT
+template <typename T>
+struct int_type_of<cpp::enable_if<cpp::is_fixed_point_v<T>, T>> {
+ using type = typename fixed_point::FXRep<T>::StorageType;
+};
+#endif // LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT
+
template <typename T> using int_type_of_v = typename int_type_of<T>::type;
#ifndef LIBC_COPT_PRINTF_DISABLE_INDEX_MODE
@@ -206,6 +220,25 @@ public:
}
break;
#endif // LIBC_COPT_PRINTF_DISABLE_FLOAT
+#ifdef LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT
+ // Capitalization represents sign, but we only need to get the right
+ // bitwidth here so we ignore that.
+ case ('r'):
+ case ('R'):
+ // all fract sizes we support are less than 32 bits, and currently doing
+ // va_args with fixed point types just doesn't work.
+ // TODO: Move to fixed point types once va_args supports it.
+ WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, uint32_t, conv_index);
+ break;
+ case ('k'):
+ case ('K'):
+ if (lm == LengthModifier::l) {
+ WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, uint64_t, conv_index);
+ } else {
+ WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, uint32_t, conv_index);
+ }
+ break;
+#endif // LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT
#ifndef LIBC_COPT_PRINTF_DISABLE_WRITE_INT
case ('n'):
#endif // LIBC_COPT_PRINTF_DISABLE_WRITE_INT
@@ -399,6 +432,22 @@ private:
else if (cur_type_desc == type_desc_from_type<long double>())
args_cur.template next_var<long double>();
#endif // LIBC_COPT_PRINTF_DISABLE_FLOAT
+#ifdef LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT
+ // Fixed point numbers may be stored separately from the other
+ // arguments.
+ else if (cur_type_desc == type_desc_from_type<short fract>())
+ args_cur.template next_var<short fract>();
+ else if (cur_type_desc == type_desc_from_type<fract>())
+ args_cur.template next_var<fract>();
+ else if (cur_type_desc == type_desc_from_type<long fract>())
+ args_cur.template next_var<long fract>();
+ else if (cur_type_desc == type_desc_from_type<short accum>())
+ args_cur.template next_var<short accum>();
+ else if (cur_type_desc == type_desc_from_type<accum>())
+ args_cur.template next_var<accum>();
+ else if (cur_type_desc == type_desc_from_type<long accum>())
+ args_cur.template next_var<long accum>();
+#endif // LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT
// pointers may be stored separately from normal values.
else if (cur_type_desc == type_desc_from_type<void *>())
args_cur.template next_var<void *>();
@@ -528,6 +577,22 @@ private:
conv_size = type_desc_from_type<long double>();
break;
#endif // LIBC_COPT_PRINTF_DISABLE_FLOAT
+#ifdef LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT
+ // Capitalization represents sign, but we only need to get the right
+ // bitwidth here so we ignore that.
+ case ('r'):
+ case ('R'):
+ conv_size = type_desc_from_type<uint32_t>();
+ break;
+ case ('k'):
+ case ('K'):
+ if (lm == LengthModifier::l) {
+ conv_size = type_desc_from_type<uint64_t>();
+ } else {
+ conv_size = type_desc_from_type<uint32_t>();
+ }
+ break;
+#endif // LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT
#ifndef LIBC_COPT_PRINTF_DISABLE_WRITE_INT
case ('n'):
#endif // LIBC_COPT_PRINTF_DISABLE_WRITE_INT
diff --git a/libc/src/stdio/printf_core/printf_config.h b/libc/src/stdio/printf_core/printf_config.h
index e1d9654f3aff..8a48abdd170e 100644
--- a/libc/src/stdio/printf_core/printf_config.h
+++ b/libc/src/stdio/printf_core/printf_config.h
@@ -29,6 +29,13 @@
#define LIBC_COPT_PRINTF_INDEX_ARR_LEN 128
#endif
+// If fixed point is available and the user hasn't explicitly opted out, then
+// enable fixed point.
+#if defined(LIBC_COMPILER_HAS_FIXED_POINT) && \
+ !defined(LIBC_COPT_PRINTF_DISABLE_FIXED_POINT)
+#define LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT
+#endif
+
// TODO(michaelrj): Provide a proper interface for these options.
// LIBC_COPT_FLOAT_TO_STR_USE_MEGA_LONG_DOUBLE_TABLE
// LIBC_COPT_FLOAT_TO_STR_USE_DYADIC_FLOAT
diff --git a/libc/src/sys/epoll/epoll_pwait.h b/libc/src/sys/epoll/epoll_pwait.h
index 9dcb55533009..16105850d694 100644
--- a/libc/src/sys/epoll/epoll_pwait.h
+++ b/libc/src/sys/epoll/epoll_pwait.h
@@ -10,8 +10,8 @@
#define LLVM_LIBC_SRC_SYS_EPOLL_EPOLL_PWAIT_H
// TODO: Use this include once the include headers are also using quotes.
-// #include "include/llvm-libc-types/sigset_t.h"
-// #include "include/llvm-libc-types/struct_epoll_event.h"
+// #include "llvm-libc-types/sigset_t.h"
+// #include "llvm-libc-types/struct_epoll_event.h"
#include <sys/epoll.h>
diff --git a/libc/src/sys/epoll/epoll_pwait2.h b/libc/src/sys/epoll/epoll_pwait2.h
index 622ede6a0f9f..f7b28d4fbc51 100644
--- a/libc/src/sys/epoll/epoll_pwait2.h
+++ b/libc/src/sys/epoll/epoll_pwait2.h
@@ -10,9 +10,9 @@
#define LLVM_LIBC_SRC_SYS_EPOLL_EPOLL_PWAIT2_H
// TODO: Use this include once the include headers are also using quotes.
-// #include "include/llvm-libc-types/sigset_t.h"
-// #include "include/llvm-libc-types/struct_epoll_event.h"
-// #include "include/llvm-libc-types/struct_timespec.h"
+// #include "llvm-libc-types/sigset_t.h"
+// #include "llvm-libc-types/struct_epoll_event.h"
+// #include "llvm-libc-types/struct_timespec.h"
#include <sys/epoll.h>
diff --git a/libc/src/sys/epoll/epoll_wait.h b/libc/src/sys/epoll/epoll_wait.h
index d51c9100846c..0dc487bba5bd 100644
--- a/libc/src/sys/epoll/epoll_wait.h
+++ b/libc/src/sys/epoll/epoll_wait.h
@@ -10,7 +10,7 @@
#define LLVM_LIBC_SRC_SYS_EPOLL_EPOLL_WAIT_H
// TODO: Use this include once the include headers are also using quotes.
-// #include "include/llvm-libc-types/struct_epoll_event.h"
+// #include "llvm-libc-types/struct_epoll_event.h"
#include <sys/epoll.h>
diff --git a/libc/src/sys/epoll/linux/epoll_pwait.cpp b/libc/src/sys/epoll/linux/epoll_pwait.cpp
index ee1b4e66e984..e0c13a7a7960 100644
--- a/libc/src/sys/epoll/linux/epoll_pwait.cpp
+++ b/libc/src/sys/epoll/linux/epoll_pwait.cpp
@@ -15,8 +15,8 @@
#include <sys/syscall.h> // For syscall numbers.
// TODO: Use this include once the include headers are also using quotes.
-// #include "include/llvm-libc-types/sigset_t.h"
-// #include "include/llvm-libc-types/struct_epoll_event.h"
+// #include "llvm-libc-types/sigset_t.h"
+// #include "llvm-libc-types/struct_epoll_event.h"
#include <sys/epoll.h>
diff --git a/libc/src/sys/epoll/linux/epoll_pwait2.cpp b/libc/src/sys/epoll/linux/epoll_pwait2.cpp
index 671dede2a105..a44b0c2a9f70 100644
--- a/libc/src/sys/epoll/linux/epoll_pwait2.cpp
+++ b/libc/src/sys/epoll/linux/epoll_pwait2.cpp
@@ -15,9 +15,9 @@
#include <sys/syscall.h> // For syscall numbers.
// TODO: Use this include once the include headers are also using quotes.
-// #include "include/llvm-libc-types/sigset_t.h"
-// #include "include/llvm-libc-types/struct_epoll_event.h"
-// #include "include/llvm-libc-types/struct_timespec.h"
+// #include "llvm-libc-types/sigset_t.h"
+// #include "llvm-libc-types/struct_epoll_event.h"
+// #include "llvm-libc-types/struct_timespec.h"
#include <sys/epoll.h>
diff --git a/libc/src/sys/epoll/linux/epoll_wait.cpp b/libc/src/sys/epoll/linux/epoll_wait.cpp
index 0c43edf76454..b643e2dd720c 100644
--- a/libc/src/sys/epoll/linux/epoll_wait.cpp
+++ b/libc/src/sys/epoll/linux/epoll_wait.cpp
@@ -14,8 +14,8 @@
#include <sys/syscall.h> // For syscall numbers.
// TODO: Use this include once the include headers are also using quotes.
-// #include "include/llvm-libc-types/sigset_t.h"
-// #include "include/llvm-libc-types/struct_epoll_event.h"
+// #include "llvm-libc-types/sigset_t.h"
+// #include "llvm-libc-types/struct_epoll_event.h"
#include <sys/epoll.h>
diff --git a/libc/test/UnitTest/CMakeLists.txt b/libc/test/UnitTest/CMakeLists.txt
index 4668f0061975..466494f038f4 100644
--- a/libc/test/UnitTest/CMakeLists.txt
+++ b/libc/test/UnitTest/CMakeLists.txt
@@ -26,7 +26,8 @@ function(add_unittest_framework_library name)
${TEST_LIB_SRCS}
${TEST_LIB_HDRS}
)
- target_include_directories(${lib} PUBLIC ${LIBC_SOURCE_DIR})
+ target_include_directories(${lib} PUBLIC
+ ${LIBC_SOURCE_DIR} ${LIBC_SOURCE_DIR}/include)
list(APPEND compile_options -fno-exceptions -fno-rtti)
if(TARGET libc.src.time.clock)
target_compile_definitions(${lib} PRIVATE TARGET_SUPPORTS_CLOCK)
diff --git a/libc/test/UnitTest/LibcTest.cpp b/libc/test/UnitTest/LibcTest.cpp
index 7b0e4fca8368..babd44f9b206 100644
--- a/libc/test/UnitTest/LibcTest.cpp
+++ b/libc/test/UnitTest/LibcTest.cpp
@@ -8,7 +8,7 @@
#include "LibcTest.h"
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
#include "src/__support/CPP/string.h"
#include "src/__support/CPP/string_view.h"
#include "src/__support/UInt128.h"
diff --git a/libc/test/include/stdbit_test.cpp b/libc/test/include/stdbit_test.cpp
index 22d5533df1e8..84a4cde18b9f 100644
--- a/libc/test/include/stdbit_test.cpp
+++ b/libc/test/include/stdbit_test.cpp
@@ -71,9 +71,24 @@ unsigned stdc_first_trailing_one_ul(unsigned long) noexcept { return 0x1DU; }
unsigned stdc_first_trailing_one_ull(unsigned long long) noexcept {
return 0x1FU;
}
+unsigned stdc_count_zeros_uc(unsigned char) noexcept { return 0x2AU; }
+unsigned stdc_count_zeros_us(unsigned short) noexcept { return 0x2BU; }
+unsigned stdc_count_zeros_ui(unsigned) noexcept { return 0x2CU; }
+unsigned stdc_count_zeros_ul(unsigned long) noexcept { return 0x2DU; }
+unsigned stdc_count_zeros_ull(unsigned long long) noexcept { return 0x2FU; }
+unsigned stdc_count_ones_uc(unsigned char) noexcept { return 0x3AU; }
+unsigned stdc_count_ones_us(unsigned short) noexcept { return 0x3BU; }
+unsigned stdc_count_ones_ui(unsigned) noexcept { return 0x3CU; }
+unsigned stdc_count_ones_ul(unsigned long) noexcept { return 0x3DU; }
+unsigned stdc_count_ones_ull(unsigned long long) noexcept { return 0x3FU; }
+bool stdc_has_single_bit_uc(unsigned char) noexcept { return false; }
+bool stdc_has_single_bit_us(unsigned short) noexcept { return false; }
+bool stdc_has_single_bit_ui(unsigned) noexcept { return false; }
+bool stdc_has_single_bit_ul(unsigned long) noexcept { return false; }
+bool stdc_has_single_bit_ull(unsigned long long) noexcept { return false; }
}
-#include "include/llvm-libc-macros/stdbit-macros.h"
+#include "llvm-libc-macros/stdbit-macros.h"
TEST(LlvmLibcStdbitTest, TypeGenericMacroLeadingZeros) {
EXPECT_EQ(stdc_leading_zeros(static_cast<unsigned char>(0U)), 0xAAU);
@@ -138,3 +153,27 @@ TEST(LlvmLibcStdbitTest, TypeGenericMacroFirstTrailingOne) {
EXPECT_EQ(stdc_first_trailing_one(0UL), 0x1DU);
EXPECT_EQ(stdc_first_trailing_one(0ULL), 0x1FU);
}
+
+TEST(LlvmLibcStdbitTest, TypeGenericMacroCountZeros) {
+ EXPECT_EQ(stdc_count_zeros(static_cast<unsigned char>(0U)), 0x2AU);
+ EXPECT_EQ(stdc_count_zeros(static_cast<unsigned short>(0U)), 0x2BU);
+ EXPECT_EQ(stdc_count_zeros(0U), 0x2CU);
+ EXPECT_EQ(stdc_count_zeros(0UL), 0x2DU);
+ EXPECT_EQ(stdc_count_zeros(0ULL), 0x2FU);
+}
+
+TEST(LlvmLibcStdbitTest, TypeGenericMacroCountOnes) {
+ EXPECT_EQ(stdc_count_ones(static_cast<unsigned char>(0U)), 0x3AU);
+ EXPECT_EQ(stdc_count_ones(static_cast<unsigned short>(0U)), 0x3BU);
+ EXPECT_EQ(stdc_count_ones(0U), 0x3CU);
+ EXPECT_EQ(stdc_count_ones(0UL), 0x3DU);
+ EXPECT_EQ(stdc_count_ones(0ULL), 0x3FU);
+}
+
+TEST(LlvmLibcStdbitTest, TypeGenericMacroHasSingleBit) {
+ EXPECT_EQ(stdc_has_single_bit(static_cast<unsigned char>(1U)), false);
+ EXPECT_EQ(stdc_has_single_bit(static_cast<unsigned short>(1U)), false);
+ EXPECT_EQ(stdc_has_single_bit(1U), false);
+ EXPECT_EQ(stdc_has_single_bit(1UL), false);
+ EXPECT_EQ(stdc_has_single_bit(1ULL), false);
+}
diff --git a/libc/test/include/stdckdint_test.cpp b/libc/test/include/stdckdint_test.cpp
index 1180a6de9efe..5ac8c95f4ef2 100644
--- a/libc/test/include/stdckdint_test.cpp
+++ b/libc/test/include/stdckdint_test.cpp
@@ -8,7 +8,7 @@
#include "test/UnitTest/Test.h"
-#include "include/llvm-libc-macros/stdckdint-macros.h"
+#include "llvm-libc-macros/stdckdint-macros.h"
TEST(LlvmLibcStdCkdIntTest, Add) {
int result;
diff --git a/libc/test/integration/startup/CMakeLists.txt b/libc/test/integration/startup/CMakeLists.txt
index fb5d6bc787cc..08c0d978602b 100644
--- a/libc/test/integration/startup/CMakeLists.txt
+++ b/libc/test/integration/startup/CMakeLists.txt
@@ -31,6 +31,7 @@ function(add_startup_test target_name)
${fq_target_name}
PRIVATE
${LIBC_SOURCE_DIR}
+ ${LIBC_SOURCE_DIR}/include
${LIBC_BUILD_DIR}
${LIBC_BUILD_DIR}/include
)
diff --git a/libc/test/integration/startup/gpu/rpc_interface_test.cpp b/libc/test/integration/startup/gpu/rpc_interface_test.cpp
index 674e2cc1ed74..7bbd7085fc2f 100644
--- a/libc/test/integration/startup/gpu/rpc_interface_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_interface_test.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
+#include "llvm-libc-types/test_rpc_opcodes_t.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "test/IntegrationTest/test.h"
diff --git a/libc/test/integration/startup/gpu/rpc_stream_test.cpp b/libc/test/integration/startup/gpu/rpc_stream_test.cpp
index 09a4ae67256e..9401f822904d 100644
--- a/libc/test/integration/startup/gpu/rpc_stream_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_stream_test.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
+#include "llvm-libc-types/test_rpc_opcodes_t.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "src/__support/integer_to_string.h"
diff --git a/libc/test/integration/startup/gpu/rpc_test.cpp b/libc/test/integration/startup/gpu/rpc_test.cpp
index 4032d890c53e..bb36b6cedb63 100644
--- a/libc/test/integration/startup/gpu/rpc_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_test.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
+#include "llvm-libc-types/test_rpc_opcodes_t.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "test/IntegrationTest/test.h"
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index c5634866f839..7200ac276fe5 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -1,17 +1,14 @@
add_custom_target(libc-support-tests)
-# FIXME: These tests are currently broken on the GPU.
-if(NOT LIBC_TARGET_OS_IS_GPU)
- add_libc_test(
- blockstore_test
- SUITE
- libc-support-tests
- SRCS
- blockstore_test.cpp
- DEPENDS
- libc.src.__support.blockstore
- )
-endif()
+add_libc_test(
+ blockstore_test
+ SUITE
+ libc-support-tests
+ SRCS
+ blockstore_test.cpp
+ DEPENDS
+ libc.src.__support.blockstore
+)
add_libc_test(
endian_test
@@ -42,8 +39,6 @@ add_libc_test(
DEPENDS
libc.src.__support.high_precision_decimal
libc.src.__support.uint128
- # FIXME Test segfaults on gfx90a GPU
- UNIT_TEST_ONLY
)
add_libc_test(
diff --git a/libc/test/src/__support/fixed_point/fx_bits_test.cpp b/libc/test/src/__support/fixed_point/fx_bits_test.cpp
index 58627816eb8d..5670687273d5 100644
--- a/libc/test/src/__support/fixed_point/fx_bits_test.cpp
+++ b/libc/test/src/__support/fixed_point/fx_bits_test.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
#include "src/__support/fixed_point/fx_bits.h"
#include "src/__support/integer_literals.h"
diff --git a/libc/test/src/__support/memory_size_test.cpp b/libc/test/src/__support/memory_size_test.cpp
index 93ef3711d40e..1c8f1ce87415 100644
--- a/libc/test/src/__support/memory_size_test.cpp
+++ b/libc/test/src/__support/memory_size_test.cpp
@@ -49,6 +49,13 @@ TEST(LlvmLibcMemSizeTest, Addition) {
ASSERT_FALSE((max + SafeMemSize{static_cast<size_t>(1)}).valid());
ASSERT_FALSE((third + third + third + third).valid());
ASSERT_FALSE((half + half + half).valid());
+
+ ASSERT_FALSE((SafeMemSize{static_cast<size_t>(-1)} +
+ SafeMemSize{static_cast<size_t>(2)})
+ .valid());
+ ASSERT_FALSE((SafeMemSize{static_cast<size_t>(2)} +
+ SafeMemSize{static_cast<size_t>(-1)})
+ .valid());
}
TEST(LlvmLibcMemSizeTest, Multiplication) {
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index 81d2e1e55b55..ad7dfdb3dfd9 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -758,40 +758,37 @@ add_fp_unittest(
libc.src.__support.FPUtil.basic_operations
)
-# FIXME: These tests are currently broken for NVPTX.
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
- add_fp_unittest(
- ilogb_test
- SUITE
- libc-math-unittests
- SRCS
- ilogb_test.cpp
- HDRS
- ILogbTest.h
- DEPENDS
- libc.include.math
- libc.src.math.ilogb
- libc.src.__support.CPP.limits
- libc.src.__support.FPUtil.fp_bits
- libc.src.__support.FPUtil.manipulation_functions
- )
+add_fp_unittest(
+ ilogb_test
+ SUITE
+ libc-math-unittests
+ SRCS
+ ilogb_test.cpp
+ HDRS
+ ILogbTest.h
+ DEPENDS
+ libc.include.math
+ libc.src.math.ilogb
+ libc.src.__support.CPP.limits
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.manipulation_functions
+)
- add_fp_unittest(
- ilogbf_test
- SUITE
- libc-math-unittests
- SRCS
- ilogbf_test.cpp
- HDRS
- ILogbTest.h
- DEPENDS
- libc.include.math
- libc.src.math.ilogbf
- libc.src.__support.CPP.limits
- libc.src.__support.FPUtil.fp_bits
- libc.src.__support.FPUtil.manipulation_functions
- )
-endif()
+add_fp_unittest(
+ ilogbf_test
+ SUITE
+ libc-math-unittests
+ SRCS
+ ilogbf_test.cpp
+ HDRS
+ ILogbTest.h
+ DEPENDS
+ libc.include.math
+ libc.src.math.ilogbf
+ libc.src.__support.CPP.limits
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.manipulation_functions
+)
add_fp_unittest(
ilogbl_test
@@ -989,92 +986,89 @@ add_fp_unittest(
libc.src.__support.FPUtil.fp_bits
)
-# FIXME: These tests are currently broken on the GPU.
-if(NOT LIBC_TARGET_OS_IS_GPU)
- add_fp_unittest(
- fminf_test
- SUITE
- libc-math-unittests
- SRCS
- fminf_test.cpp
- HDRS
- FMinTest.h
- DEPENDS
- libc.include.math
- libc.src.math.fminf
- libc.src.__support.FPUtil.fp_bits
- )
+add_fp_unittest(
+ fminf_test
+ SUITE
+ libc-math-unittests
+ SRCS
+ fminf_test.cpp
+ HDRS
+ FMinTest.h
+ DEPENDS
+ libc.include.math
+ libc.src.math.fminf
+ libc.src.__support.FPUtil.fp_bits
+)
- add_fp_unittest(
- fmin_test
- SUITE
- libc-math-unittests
- SRCS
- fmin_test.cpp
- HDRS
- FMinTest.h
- DEPENDS
- libc.include.math
- libc.src.math.fmin
- libc.src.__support.FPUtil.fp_bits
- )
+add_fp_unittest(
+ fmin_test
+ SUITE
+ libc-math-unittests
+ SRCS
+ fmin_test.cpp
+ HDRS
+ FMinTest.h
+ DEPENDS
+ libc.include.math
+ libc.src.math.fmin
+ libc.src.__support.FPUtil.fp_bits
+)
- add_fp_unittest(
- fminl_test
- SUITE
- libc-math-unittests
- SRCS
- fminl_test.cpp
- HDRS
- FMinTest.h
- DEPENDS
- libc.include.math
- libc.src.math.fminl
- libc.src.__support.FPUtil.fp_bits
- )
+add_fp_unittest(
+ fminl_test
+ SUITE
+ libc-math-unittests
+ SRCS
+ fminl_test.cpp
+ HDRS
+ FMinTest.h
+ DEPENDS
+ libc.include.math
+ libc.src.math.fminl
+ libc.src.__support.FPUtil.fp_bits
+)
- add_fp_unittest(
- fmaxf_test
- SUITE
- libc-math-unittests
- SRCS
- fmaxf_test.cpp
- HDRS
- FMaxTest.h
- DEPENDS
- libc.include.math
- libc.src.math.fmaxf
- libc.src.__support.FPUtil.fp_bits
- )
+add_fp_unittest(
+ fmaxf_test
+ SUITE
+ libc-math-unittests
+ SRCS
+ fmaxf_test.cpp
+ HDRS
+ FMaxTest.h
+ DEPENDS
+ libc.include.math
+ libc.src.math.fmaxf
+ libc.src.__support.FPUtil.fp_bits
+)
- add_fp_unittest(
- fmax_test
- SUITE
- libc-math-unittests
- SRCS
- fmax_test.cpp
- HDRS
- FMaxTest.h
- DEPENDS
- libc.include.math
- libc.src.math.fmax
- libc.src.__support.FPUtil.fp_bits
- )
+add_fp_unittest(
+ fmax_test
+ SUITE
+ libc-math-unittests
+ SRCS
+ fmax_test.cpp
+ HDRS
+ FMaxTest.h
+ DEPENDS
+ libc.include.math
+ libc.src.math.fmax
+ libc.src.__support.FPUtil.fp_bits
+)
- add_fp_unittest(
- fmaxl_test
- SUITE
- libc-math-unittests
- SRCS
- fmaxl_test.cpp
- HDRS
- FMaxTest.h
- DEPENDS
- libc.include.math
- libc.src.math.fmaxl
- libc.src.__support.FPUtil.fp_bits
- )
-endif()
+add_fp_unittest(
+ fmaxl_test
+ SUITE
+ libc-math-unittests
+ SRCS
+ fmaxl_test.cpp
+ HDRS
+ FMaxTest.h
+ DEPENDS
+ libc.include.math
+ libc.src.math.fmaxl
+ libc.src.__support.FPUtil.fp_bits
+)
add_fp_unittest(
sqrtf_test
@@ -1234,38 +1228,35 @@ add_fp_unittest(
libc.src.__support.FPUtil.fp_bits
)
-# FIXME: These tests are currently spurious for NVPTX.
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
- add_fp_unittest(
- nextafter_test
- SUITE
- libc-math-unittests
- SRCS
- nextafter_test.cpp
- HDRS
- NextAfterTest.h
- DEPENDS
- libc.include.math
- libc.src.math.nextafter
- libc.src.__support.FPUtil.basic_operations
- libc.src.__support.FPUtil.fp_bits
- )
+add_fp_unittest(
+ nextafter_test
+ SUITE
+ libc-math-unittests
+ SRCS
+ nextafter_test.cpp
+ HDRS
+ NextAfterTest.h
+ DEPENDS
+ libc.include.math
+ libc.src.math.nextafter
+ libc.src.__support.FPUtil.basic_operations
+ libc.src.__support.FPUtil.fp_bits
+)
- add_fp_unittest(
- nextafterf_test
- SUITE
- libc-math-unittests
- SRCS
- nextafterf_test.cpp
- HDRS
- NextAfterTest.h
- DEPENDS
- libc.include.math
- libc.src.math.nextafterf
- libc.src.__support.FPUtil.basic_operations
- libc.src.__support.FPUtil.fp_bits
- )
-endif()
+add_fp_unittest(
+ nextafterf_test
+ SUITE
+ libc-math-unittests
+ SRCS
+ nextafterf_test.cpp
+ HDRS
+ NextAfterTest.h
+ DEPENDS
+ libc.include.math
+ libc.src.math.nextafterf
+ libc.src.__support.FPUtil.basic_operations
+ libc.src.__support.FPUtil.fp_bits
+)
add_fp_unittest(
nextafterl_test
diff --git a/libc/test/src/math/differential_testing/CMakeLists.txt b/libc/test/src/math/differential_testing/CMakeLists.txt
index 878f81f1d573..36bfdca1a442 100644
--- a/libc/test/src/math/differential_testing/CMakeLists.txt
+++ b/libc/test/src/math/differential_testing/CMakeLists.txt
@@ -47,6 +47,7 @@ function(add_diff_binary target_name)
${fq_target_name}
PRIVATE
${LIBC_SOURCE_DIR}
+ ${LIBC_SOURCE_DIR}/include
)
if(DIFF_COMPILE_OPTIONS)
target_compile_options(
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 0d6b33bd7d66..be1810944495 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -65,8 +65,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.fabs
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -81,14 +79,10 @@ add_fp_unittest(
libc.include.math
libc.src.math.fabsf
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
fabsl_test
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
SUITE
libc-math-smoke-tests
SRCS
@@ -103,8 +97,6 @@ add_fp_unittest(
add_fp_unittest(
fabsf128_test
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
SUITE
libc-math-smoke-tests
SRCS
@@ -129,8 +121,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.trunc
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -145,8 +135,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.truncf
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -161,8 +149,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.truncl
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -177,8 +163,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.truncf128
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -193,8 +177,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.ceil
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -209,8 +191,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.ceilf
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -225,8 +205,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.ceill
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -241,8 +219,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.ceilf128
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -257,8 +233,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.floor
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -273,8 +247,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.floorf
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -289,8 +261,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.floorl
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -305,8 +275,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.floorf128
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -321,8 +289,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.round
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -337,8 +303,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.roundf
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -353,8 +317,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.roundl
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -369,8 +331,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.roundf128
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -389,8 +349,6 @@ add_fp_unittest(
libc.src.fenv.fetestexcept
libc.src.math.lround
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -409,8 +367,6 @@ add_fp_unittest(
libc.src.fenv.fetestexcept
libc.src.math.lroundf
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -429,8 +385,6 @@ add_fp_unittest(
libc.src.fenv.fetestexcept
libc.src.math.lroundl
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -449,8 +403,6 @@ add_fp_unittest(
libc.src.fenv.fetestexcept
libc.src.math.llround
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -469,8 +421,6 @@ add_fp_unittest(
libc.src.fenv.fetestexcept
libc.src.math.llroundf
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -489,8 +439,6 @@ add_fp_unittest(
libc.src.fenv.fetestexcept
libc.src.math.llroundl
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -718,8 +666,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.copysign
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -734,8 +680,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.copysignf
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -750,8 +694,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.copysignl
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -766,8 +708,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.copysignf128
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -818,40 +758,35 @@ add_fp_unittest(
libc.src.math.frexpf128
)
-# FIXME: These tests are currently broken for NVPTX.
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
- add_fp_unittest(
- ilogb_test
- SUITE
- libc-math-smoke-tests
- SRCS
- ilogb_test.cpp
- HDRS
- ILogbTest.h
- DEPENDS
- libc.include.math
- libc.src.math.ilogb
- libc.src.__support.CPP.limits
- libc.src.__support.FPUtil.fp_bits
- libc.src.__support.FPUtil.manipulation_functions
- )
+add_fp_unittest(
+ ilogb_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ ilogb_test.cpp
+ HDRS
+ ILogbTest.h
+ DEPENDS
+ libc.src.math.ilogb
+ libc.src.__support.CPP.limits
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.manipulation_functions
+)
- add_fp_unittest(
- ilogbf_test
- SUITE
- libc-math-smoke-tests
- SRCS
- ilogbf_test.cpp
- HDRS
- ILogbTest.h
- DEPENDS
- libc.include.math
- libc.src.math.ilogbf
- libc.src.__support.CPP.limits
- libc.src.__support.FPUtil.fp_bits
- libc.src.__support.FPUtil.manipulation_functions
- )
-endif()
+add_fp_unittest(
+ ilogbf_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ ilogbf_test.cpp
+ HDRS
+ ILogbTest.h
+ DEPENDS
+ libc.src.math.ilogbf
+ libc.src.__support.CPP.limits
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.manipulation_functions
+)
add_fp_unittest(
ilogbl_test
@@ -862,7 +797,6 @@ add_fp_unittest(
HDRS
ILogbTest.h
DEPENDS
- libc.include.math
libc.src.math.ilogbl
libc.src.__support.CPP.limits
libc.src.__support.FPUtil.fp_bits
@@ -870,6 +804,81 @@ add_fp_unittest(
)
add_fp_unittest(
+ ilogbf128_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ ilogbf128_test.cpp
+ HDRS
+ ILogbTest.h
+ DEPENDS
+ libc.src.math.ilogbf128
+ libc.src.__support.CPP.limits
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_fp_unittest(
+ llogb_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ llogb_test.cpp
+ HDRS
+ ILogbTest.h
+ DEPENDS
+ libc.src.math.llogb
+ libc.src.__support.CPP.limits
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_fp_unittest(
+ llogbf_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ llogbf_test.cpp
+ HDRS
+ ILogbTest.h
+ DEPENDS
+ libc.src.math.llogbf
+ libc.src.__support.CPP.limits
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_fp_unittest(
+ llogbl_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ llogbl_test.cpp
+ HDRS
+ ILogbTest.h
+ DEPENDS
+ libc.src.math.llogbl
+ libc.src.__support.CPP.limits
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_fp_unittest(
+ llogbf128_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ llogbf128_test.cpp
+ HDRS
+ ILogbTest.h
+ DEPENDS
+ libc.src.math.llogbf128
+ libc.src.__support.CPP.limits
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_fp_unittest(
ldexp_test
SUITE
libc-math-smoke-tests
@@ -936,7 +945,6 @@ add_fp_unittest(
SRCS
logb_test.cpp
DEPENDS
- libc.include.math
libc.src.math.logb
libc.src.__support.FPUtil.manipulation_functions
)
@@ -948,7 +956,6 @@ add_fp_unittest(
SRCS
logbf_test.cpp
DEPENDS
- libc.include.math
libc.src.math.logbf
libc.src.__support.FPUtil.manipulation_functions
)
@@ -962,12 +969,22 @@ add_fp_unittest(
HDRS
LogbTest.h
DEPENDS
- libc.include.math
libc.src.math.logbl
libc.src.__support.FPUtil.manipulation_functions
)
add_fp_unittest(
+ logbf128_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ logbf128_test.cpp
+ DEPENDS
+ libc.src.math.logbf128
+ libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_fp_unittest(
modf_test
SUITE
libc-math-smoke-tests
@@ -1072,112 +1089,109 @@ add_fp_unittest(
libc.src.__support.FPUtil.fp_bits
)
-# FIXME: These tests are currently broken on the GPU.
-if(NOT LIBC_TARGET_OS_IS_GPU)
- add_fp_unittest(
- fminf_test
- SUITE
- libc-math-smoke-tests
- SRCS
- fminf_test.cpp
- HDRS
- FMinTest.h
- DEPENDS
- libc.src.math.fminf
- libc.src.__support.FPUtil.fp_bits
- )
+add_fp_unittest(
+ fminf_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ fminf_test.cpp
+ HDRS
+ FMinTest.h
+ DEPENDS
+ libc.src.math.fminf
+ libc.src.__support.FPUtil.fp_bits
+)
- add_fp_unittest(
- fmin_test
- SUITE
- libc-math-smoke-tests
- SRCS
- fmin_test.cpp
- HDRS
- FMinTest.h
- DEPENDS
- libc.src.math.fmin
- libc.src.__support.FPUtil.fp_bits
- )
+add_fp_unittest(
+ fmin_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ fmin_test.cpp
+ HDRS
+ FMinTest.h
+ DEPENDS
+ libc.src.math.fmin
+ libc.src.__support.FPUtil.fp_bits
+)
- add_fp_unittest(
- fminl_test
- SUITE
- libc-math-smoke-tests
- SRCS
- fminl_test.cpp
- HDRS
- FMinTest.h
- DEPENDS
- libc.src.math.fminl
- libc.src.__support.FPUtil.fp_bits
- )
+add_fp_unittest(
+ fminl_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ fminl_test.cpp
+ HDRS
+ FMinTest.h
+ DEPENDS
+ libc.src.math.fminl
+ libc.src.__support.FPUtil.fp_bits
+)
- add_fp_unittest(
- fminf128_test
- SUITE
- libc-math-smoke-tests
- SRCS
- fminf128_test.cpp
- HDRS
- FMinTest.h
- DEPENDS
- libc.src.math.fminf128
- libc.src.__support.FPUtil.fp_bits
- )
+add_fp_unittest(
+ fminf128_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ fminf128_test.cpp
+ HDRS
+ FMinTest.h
+ DEPENDS
+ libc.src.math.fminf128
+ libc.src.__support.FPUtil.fp_bits
+)
- add_fp_unittest(
- fmaxf_test
- SUITE
- libc-math-smoke-tests
- SRCS
- fmaxf_test.cpp
- HDRS
- FMaxTest.h
- DEPENDS
- libc.src.math.fmaxf
- libc.src.__support.FPUtil.fp_bits
- )
+add_fp_unittest(
+ fmaxf_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ fmaxf_test.cpp
+ HDRS
+ FMaxTest.h
+ DEPENDS
+ libc.src.math.fmaxf
+ libc.src.__support.FPUtil.fp_bits
+)
- add_fp_unittest(
- fmax_test
- SUITE
- libc-math-smoke-tests
- SRCS
- fmax_test.cpp
- HDRS
- FMaxTest.h
- DEPENDS
- libc.src.math.fmax
- libc.src.__support.FPUtil.fp_bits
- )
+add_fp_unittest(
+ fmax_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ fmax_test.cpp
+ HDRS
+ FMaxTest.h
+ DEPENDS
+ libc.src.math.fmax
+ libc.src.__support.FPUtil.fp_bits
+)
- add_fp_unittest(
- fmaxl_test
- SUITE
- libc-math-smoke-tests
- SRCS
- fmaxl_test.cpp
- HDRS
- FMaxTest.h
- DEPENDS
- libc.src.math.fmaxl
- libc.src.__support.FPUtil.fp_bits
- )
+add_fp_unittest(
+ fmaxl_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ fmaxl_test.cpp
+ HDRS
+ FMaxTest.h
+ DEPENDS
+ libc.src.math.fmaxl
+ libc.src.__support.FPUtil.fp_bits
+)
- add_fp_unittest(
- fmaxf128_test
- SUITE
- libc-math-smoke-tests
- SRCS
- fmaxf128_test.cpp
- HDRS
- FMaxTest.h
- DEPENDS
- libc.src.math.fmaxf128
- libc.src.__support.FPUtil.fp_bits
- )
-endif()
+add_fp_unittest(
+ fmaxf128_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ fmaxf128_test.cpp
+ HDRS
+ FMaxTest.h
+ DEPENDS
+ libc.src.math.fmaxf128
+ libc.src.__support.FPUtil.fp_bits
+)
add_fp_unittest(
sqrtf_test
@@ -1189,8 +1203,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.sqrtf
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -1203,8 +1215,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.sqrt
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -1217,8 +1227,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.sqrtl
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -1231,8 +1239,6 @@ add_fp_unittest(
libc.include.math
libc.src.math.sqrtf128
libc.src.__support.FPUtil.fp_bits
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -1247,8 +1253,6 @@ add_fp_unittest(
libc.src.__support.FPUtil.generic.sqrt
COMPILE_OPTIONS
-O3
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -1263,8 +1267,6 @@ add_fp_unittest(
libc.src.__support.FPUtil.generic.sqrt
COMPILE_OPTIONS
-O3
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -1279,8 +1281,6 @@ add_fp_unittest(
libc.src.__support.FPUtil.generic.sqrt
COMPILE_OPTIONS
-O3
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -1295,8 +1295,6 @@ add_fp_unittest(
libc.src.__support.FPUtil.generic.sqrt
COMPILE_OPTIONS
-O3
- # FIXME: Currently fails on the GPU build.
- UNIT_TEST_ONLY
)
add_fp_unittest(
@@ -1416,38 +1414,35 @@ add_fp_unittest(
UNIT_TEST_ONLY
)
-# FIXME: These tests are currently spurious for NVPTX.
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
- add_fp_unittest(
- nextafter_test
- SUITE
- libc-math-smoke-tests
- SRCS
- nextafter_test.cpp
- HDRS
- NextAfterTest.h
- DEPENDS
- libc.include.math
- libc.src.math.nextafter
- libc.src.__support.FPUtil.basic_operations
- libc.src.__support.FPUtil.fp_bits
- )
+add_fp_unittest(
+ nextafter_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ nextafter_test.cpp
+ HDRS
+ NextAfterTest.h
+ DEPENDS
+ libc.include.math
+ libc.src.math.nextafter
+ libc.src.__support.FPUtil.basic_operations
+ libc.src.__support.FPUtil.fp_bits
+)
- add_fp_unittest(
- nextafterf_test
- SUITE
- libc-math-smoke-tests
- SRCS
- nextafterf_test.cpp
- HDRS
- NextAfterTest.h
- DEPENDS
- libc.include.math
- libc.src.math.nextafterf
- libc.src.__support.FPUtil.basic_operations
- libc.src.__support.FPUtil.fp_bits
- )
-endif()
+add_fp_unittest(
+ nextafterf_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ nextafterf_test.cpp
+ HDRS
+ NextAfterTest.h
+ DEPENDS
+ libc.include.math
+ libc.src.math.nextafterf
+ libc.src.__support.FPUtil.basic_operations
+ libc.src.__support.FPUtil.fp_bits
+)
add_fp_unittest(
nextafterl_test
diff --git a/libc/test/src/math/smoke/ILogbTest.h b/libc/test/src/math/smoke/ILogbTest.h
index 3e2db33e2c05..cbee25b139d4 100644
--- a/libc/test/src/math/smoke/ILogbTest.h
+++ b/libc/test/src/math/smoke/ILogbTest.h
@@ -13,101 +13,110 @@
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/FPUtil/ManipulationFunctions.h"
#include "test/UnitTest/Test.h"
-#include <math.h>
+template <typename OutType, typename InType>
class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test {
+ using FPBits = LIBC_NAMESPACE::fputil::FPBits<InType>;
+ using StorageType = typename FPBits::StorageType;
+ using Sign = LIBC_NAMESPACE::fputil::Sign;
+
public:
- template <typename T> struct ILogbFunc {
- typedef int (*Func)(T);
- };
-
- template <typename T>
- void test_special_numbers(typename ILogbFunc<T>::Func func) {
- using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
- using Sign = LIBC_NAMESPACE::fputil::Sign;
- EXPECT_EQ(FP_ILOGB0, func(FPBits::zero(Sign::POS).get_val()));
- EXPECT_EQ(FP_ILOGB0, func(FPBits::zero(Sign::NEG).get_val()));
- EXPECT_EQ(FP_ILOGBNAN, func(FPBits::quiet_nan().get_val()));
- EXPECT_EQ(INT_MAX, func(FPBits::inf(Sign::POS).get_val()));
- EXPECT_EQ(INT_MAX, func(FPBits::inf(Sign::NEG).get_val()));
+ typedef OutType (*Func)(InType);
+
+ void test_special_numbers(Func func) {
+ EXPECT_EQ(LIBC_NAMESPACE::fputil::IntLogbConstants<OutType>::FP_LOGB0,
+ func(FPBits::zero(Sign::POS).get_val()));
+ EXPECT_EQ(LIBC_NAMESPACE::fputil::IntLogbConstants<OutType>::FP_LOGB0,
+ func(FPBits::zero(Sign::NEG).get_val()));
+ EXPECT_EQ(LIBC_NAMESPACE::fputil::IntLogbConstants<OutType>::FP_LOGBNAN,
+ func(FPBits::quiet_nan().get_val()));
+ EXPECT_EQ(LIBC_NAMESPACE::fputil::IntLogbConstants<OutType>::T_MAX,
+ func(FPBits::inf(Sign::POS).get_val()));
+ EXPECT_EQ(LIBC_NAMESPACE::fputil::IntLogbConstants<OutType>::T_MAX,
+ func(FPBits::inf(Sign::NEG).get_val()));
}
- template <typename T>
- void test_powers_of_two(typename ILogbFunc<T>::Func func) {
- EXPECT_EQ(0, func(T(1.0)));
- EXPECT_EQ(0, func(T(-1.0)));
+ void test_powers_of_two(Func func) {
+ EXPECT_EQ(OutType(0), func(InType(1.0)));
+ EXPECT_EQ(OutType(0), func(InType(-1.0)));
- EXPECT_EQ(1, func(T(2.0)));
- EXPECT_EQ(1, func(T(-2.0)));
+ EXPECT_EQ(OutType(1), func(InType(2.0)));
+ EXPECT_EQ(OutType(1), func(InType(-2.0)));
- EXPECT_EQ(2, func(T(4.0)));
- EXPECT_EQ(2, func(T(-4.0)));
+ EXPECT_EQ(OutType(2), func(InType(4.0)));
+ EXPECT_EQ(OutType(2), func(InType(-4.0)));
- EXPECT_EQ(3, func(T(8.0)));
- EXPECT_EQ(3, func(-8.0));
+ EXPECT_EQ(OutType(3), func(InType(8.0)));
+ EXPECT_EQ(OutType(3), func(-8.0));
- EXPECT_EQ(4, func(16.0));
- EXPECT_EQ(4, func(-16.0));
+ EXPECT_EQ(OutType(4), func(16.0));
+ EXPECT_EQ(OutType(4), func(-16.0));
- EXPECT_EQ(5, func(32.0));
- EXPECT_EQ(5, func(-32.0));
+ EXPECT_EQ(OutType(5), func(32.0));
+ EXPECT_EQ(OutType(5), func(-32.0));
}
- template <typename T>
- void test_some_integers(typename ILogbFunc<T>::Func func) {
- EXPECT_EQ(1, func(T(3.0)));
- EXPECT_EQ(1, func(T(-3.0)));
+ void test_some_integers(Func func) {
+ EXPECT_EQ(OutType(1), func(InType(3.0)));
+ EXPECT_EQ(OutType(1), func(InType(-3.0)));
- EXPECT_EQ(2, func(T(7.0)));
- EXPECT_EQ(2, func(T(-7.0)));
+ EXPECT_EQ(OutType(2), func(InType(7.0)));
+ EXPECT_EQ(OutType(2), func(InType(-7.0)));
- EXPECT_EQ(3, func(T(10.0)));
- EXPECT_EQ(3, func(T(-10.0)));
+ EXPECT_EQ(OutType(3), func(InType(10.0)));
+ EXPECT_EQ(OutType(3), func(InType(-10.0)));
- EXPECT_EQ(4, func(T(31.0)));
- EXPECT_EQ(4, func(-31.0));
+ EXPECT_EQ(OutType(4), func(InType(31.0)));
+ EXPECT_EQ(OutType(4), func(-31.0));
- EXPECT_EQ(5, func(55.0));
- EXPECT_EQ(5, func(-55.0));
+ EXPECT_EQ(OutType(5), func(55.0));
+ EXPECT_EQ(OutType(5), func(-55.0));
}
- template <typename T>
- void test_subnormal_range(typename ILogbFunc<T>::Func func) {
- using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
- using StorageType = typename FPBits::StorageType;
+ void test_subnormal_range(Func func) {
constexpr StorageType MIN_SUBNORMAL = FPBits::min_subnormal().uintval();
constexpr StorageType MAX_SUBNORMAL = FPBits::max_subnormal().uintval();
constexpr StorageType COUNT = 10'001;
constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT;
for (StorageType v = MIN_SUBNORMAL; v <= MAX_SUBNORMAL; v += STEP) {
- T x = FPBits(v).get_val();
- if (isnan(x) || isinf(x) || x == 0.0)
+ FPBits x_bits = FPBits(v);
+ if (x_bits.is_zero() || x_bits.is_inf_or_nan())
continue;
+ InType x = x_bits.get_val();
+
int exponent;
LIBC_NAMESPACE::fputil::frexp(x, exponent);
- ASSERT_EQ(exponent, func(x) + 1);
+ ASSERT_EQ(static_cast<OutType>(exponent), func(x) + OutType(1));
}
}
- template <typename T>
- void test_normal_range(typename ILogbFunc<T>::Func func) {
- using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
- using StorageType = typename FPBits::StorageType;
+ void test_normal_range(Func func) {
constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval();
constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval();
constexpr StorageType COUNT = 10'001;
constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT;
for (StorageType v = MIN_NORMAL; v <= MAX_NORMAL; v += STEP) {
- T x = FPBits(v).get_val();
- if (isnan(x) || isinf(x) || x == 0.0)
+ FPBits x_bits = FPBits(v);
+ if (x_bits.is_zero() || x_bits.is_inf_or_nan())
continue;
+ InType x = x_bits.get_val();
+
int exponent;
LIBC_NAMESPACE::fputil::frexp(x, exponent);
- ASSERT_EQ(exponent, func(x) + 1);
+ ASSERT_EQ(static_cast<OutType>(exponent), func(x) + OutType(1));
}
}
};
+#define LIST_INTLOGB_TESTS(OutType, InType, Func) \
+ using LlvmLibcIntLogbTest = LlvmLibcILogbTest<OutType, InType>; \
+ TEST_F(LlvmLibcIntLogbTest, SpecialNumbers) { test_special_numbers(&Func); } \
+ TEST_F(LlvmLibcIntLogbTest, PowersOfTwo) { test_powers_of_two(&Func); } \
+ TEST_F(LlvmLibcIntLogbTest, SomeIntegers) { test_some_integers(&Func); } \
+ TEST_F(LlvmLibcIntLogbTest, SubnormalRange) { test_subnormal_range(&Func); } \
+ TEST_F(LlvmLibcIntLogbTest, NormalRange) { test_normal_range(&Func); } \
+ static_assert(true)
+
#endif // LLVM_LIBC_TEST_SRC_MATH_ILOGBTEST_H
diff --git a/libc/test/src/math/smoke/LogbTest.h b/libc/test/src/math/smoke/LogbTest.h
index e2698e2b7b81..01e1050b4c4f 100644
--- a/libc/test/src/math/smoke/LogbTest.h
+++ b/libc/test/src/math/smoke/LogbTest.h
@@ -10,8 +10,6 @@
#include "test/UnitTest/FPMatcher.h"
#include "test/UnitTest/Test.h"
-#include <math.h>
-
template <typename T> class LogbTest : public LIBC_NAMESPACE::testing::Test {
DECLARE_SPECIAL_CONSTANTS(T)
@@ -72,10 +70,12 @@ public:
constexpr StorageType COUNT = 100'000;
constexpr StorageType STEP = STORAGE_MAX / COUNT;
for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
- T x = FPBits(v).get_val();
- if (isnan(x) || isinf(x) || x == 0.0l)
+ FPBits x_bits = FPBits(v);
+ if (x_bits.is_zero() || x_bits.is_inf_or_nan())
continue;
+ T x = x_bits.get_val();
+
int exponent;
LIBC_NAMESPACE::fputil::frexp(x, exponent);
ASSERT_FP_EQ(T(exponent), func(x) + T(1.0));
diff --git a/libc/test/src/math/smoke/ilogb_test.cpp b/libc/test/src/math/smoke/ilogb_test.cpp
index 7011c43386e6..67c7939608e8 100644
--- a/libc/test/src/math/smoke/ilogb_test.cpp
+++ b/libc/test/src/math/smoke/ilogb_test.cpp
@@ -8,29 +8,6 @@
#include "ILogbTest.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/ManipulationFunctions.h"
#include "src/math/ilogb.h"
-#include "test/UnitTest/FPMatcher.h"
-#include "test/UnitTest/Test.h"
-#include <math.h>
-TEST_F(LlvmLibcILogbTest, SpecialNumbers_ilogb) {
- test_special_numbers<double>(&LIBC_NAMESPACE::ilogb);
-}
-
-TEST_F(LlvmLibcILogbTest, PowersOfTwo_ilogb) {
- test_powers_of_two<double>(&LIBC_NAMESPACE::ilogb);
-}
-
-TEST_F(LlvmLibcILogbTest, SomeIntegers_ilogb) {
- test_some_integers<double>(&LIBC_NAMESPACE::ilogb);
-}
-
-TEST_F(LlvmLibcILogbTest, SubnormalRange_ilogb) {
- test_subnormal_range<double>(&LIBC_NAMESPACE::ilogb);
-}
-
-TEST_F(LlvmLibcILogbTest, NormalRange_ilogb) {
- test_normal_range<double>(&LIBC_NAMESPACE::ilogb);
-}
+LIST_INTLOGB_TESTS(int, double, LIBC_NAMESPACE::ilogb);
diff --git a/libc/test/src/math/smoke/ilogbf128_test.cpp b/libc/test/src/math/smoke/ilogbf128_test.cpp
new file mode 100644
index 000000000000..21ed0dd112b9
--- /dev/null
+++ b/libc/test/src/math/smoke/ilogbf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for ilogbf128 -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ILogbTest.h"
+
+#include "src/math/ilogbf128.h"
+
+LIST_INTLOGB_TESTS(int, float128, LIBC_NAMESPACE::ilogbf128);
diff --git a/libc/test/src/math/smoke/ilogbf_test.cpp b/libc/test/src/math/smoke/ilogbf_test.cpp
index dcff8eeb1518..68e6950cdf72 100644
--- a/libc/test/src/math/smoke/ilogbf_test.cpp
+++ b/libc/test/src/math/smoke/ilogbf_test.cpp
@@ -8,29 +8,6 @@
#include "ILogbTest.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/ManipulationFunctions.h"
#include "src/math/ilogbf.h"
-#include "test/UnitTest/FPMatcher.h"
-#include "test/UnitTest/Test.h"
-#include <math.h>
-TEST_F(LlvmLibcILogbTest, SpecialNumbers_ilogbf) {
- test_special_numbers<float>(&LIBC_NAMESPACE::ilogbf);
-}
-
-TEST_F(LlvmLibcILogbTest, PowersOfTwo_ilogbf) {
- test_powers_of_two<float>(&LIBC_NAMESPACE::ilogbf);
-}
-
-TEST_F(LlvmLibcILogbTest, SomeIntegers_ilogbf) {
- test_some_integers<float>(&LIBC_NAMESPACE::ilogbf);
-}
-
-TEST_F(LlvmLibcILogbTest, SubnormalRange_ilogbf) {
- test_subnormal_range<float>(&LIBC_NAMESPACE::ilogbf);
-}
-
-TEST_F(LlvmLibcILogbTest, NormalRange_ilogbf) {
- test_normal_range<float>(&LIBC_NAMESPACE::ilogbf);
-}
+LIST_INTLOGB_TESTS(int, float, LIBC_NAMESPACE::ilogbf);
diff --git a/libc/test/src/math/smoke/ilogbl_test.cpp b/libc/test/src/math/smoke/ilogbl_test.cpp
index 29a221ad7f08..afc961f16863 100644
--- a/libc/test/src/math/smoke/ilogbl_test.cpp
+++ b/libc/test/src/math/smoke/ilogbl_test.cpp
@@ -8,29 +8,6 @@
#include "ILogbTest.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/ManipulationFunctions.h"
#include "src/math/ilogbl.h"
-#include "test/UnitTest/FPMatcher.h"
-#include "test/UnitTest/Test.h"
-#include <math.h>
-TEST_F(LlvmLibcILogbTest, SpecialNumbers_ilogbl) {
- test_special_numbers<long double>(&LIBC_NAMESPACE::ilogbl);
-}
-
-TEST_F(LlvmLibcILogbTest, PowersOfTwo_ilogbl) {
- test_powers_of_two<long double>(&LIBC_NAMESPACE::ilogbl);
-}
-
-TEST_F(LlvmLibcILogbTest, SomeIntegers_ilogbl) {
- test_some_integers<long double>(&LIBC_NAMESPACE::ilogbl);
-}
-
-TEST_F(LlvmLibcILogbTest, SubnormalRange_ilogbl) {
- test_subnormal_range<long double>(&LIBC_NAMESPACE::ilogbl);
-}
-
-TEST_F(LlvmLibcILogbTest, NormalRange_ilogbl) {
- test_normal_range<long double>(&LIBC_NAMESPACE::ilogbl);
-}
+LIST_INTLOGB_TESTS(int, long double, LIBC_NAMESPACE::ilogbl);
diff --git a/libc/test/src/math/smoke/llogb_test.cpp b/libc/test/src/math/smoke/llogb_test.cpp
new file mode 100644
index 000000000000..3bccded6c3e3
--- /dev/null
+++ b/libc/test/src/math/smoke/llogb_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for llogb -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ILogbTest.h"
+
+#include "src/math/llogb.h"
+
+LIST_INTLOGB_TESTS(long, double, LIBC_NAMESPACE::llogb);
diff --git a/libc/test/src/math/smoke/llogbf128_test.cpp b/libc/test/src/math/smoke/llogbf128_test.cpp
new file mode 100644
index 000000000000..a1d2021d2a37
--- /dev/null
+++ b/libc/test/src/math/smoke/llogbf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for llogbf128 -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ILogbTest.h"
+
+#include "src/math/llogbf128.h"
+
+LIST_INTLOGB_TESTS(long, float128, LIBC_NAMESPACE::llogbf128);
diff --git a/libc/test/src/math/smoke/llogbf_test.cpp b/libc/test/src/math/smoke/llogbf_test.cpp
new file mode 100644
index 000000000000..60c92fbc2c46
--- /dev/null
+++ b/libc/test/src/math/smoke/llogbf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for llogbf ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ILogbTest.h"
+
+#include "src/math/llogbf.h"
+
+LIST_INTLOGB_TESTS(long, float, LIBC_NAMESPACE::llogbf);
diff --git a/libc/test/src/math/smoke/llogbl_test.cpp b/libc/test/src/math/smoke/llogbl_test.cpp
new file mode 100644
index 000000000000..c698210fc3de
--- /dev/null
+++ b/libc/test/src/math/smoke/llogbl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for llogbl ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ILogbTest.h"
+
+#include "src/math/llogbl.h"
+
+LIST_INTLOGB_TESTS(long, long double, LIBC_NAMESPACE::llogbl);
diff --git a/libc/test/src/math/smoke/logbf128_test.cpp b/libc/test/src/math/smoke/logbf128_test.cpp
new file mode 100644
index 000000000000..49485f8ee714
--- /dev/null
+++ b/libc/test/src/math/smoke/logbf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for logbf128 --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "LogbTest.h"
+
+#include "src/math/logbf128.h"
+
+LIST_LOGB_TESTS(float128, LIBC_NAMESPACE::logbf128)
diff --git a/libc/test/src/stdbit/CMakeLists.txt b/libc/test/src/stdbit/CMakeLists.txt
index 203f48bda99a..a886ee4a3532 100644
--- a/libc/test/src/stdbit/CMakeLists.txt
+++ b/libc/test/src/stdbit/CMakeLists.txt
@@ -9,6 +9,9 @@ set(prefixes
first_leading_one
first_trailing_zero
first_trailing_one
+ count_zeros
+ count_ones
+ has_single_bit
)
set(suffixes c s i l ll)
foreach(prefix IN LISTS prefixes)
diff --git a/libc/test/src/stdbit/stdc_count_ones_uc_test.cpp b/libc/test/src/stdbit/stdc_count_ones_uc_test.cpp
new file mode 100644
index 000000000000..791288154bae
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_count_ones_uc_test.cpp
@@ -0,0 +1,21 @@
+//===-- Unittests for stdc_count_ones_uc ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_count_ones_uc.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcCountOnesUcTest, Zero) {
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_count_ones_uc(0U), 0U);
+}
+
+TEST(LlvmLibcStdcCountOnesUcTest, Ones) {
+ for (unsigned i = 0U; i != UCHAR_WIDTH; ++i)
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_count_ones_uc(UCHAR_MAX >> i),
+ UCHAR_WIDTH - i);
+}
diff --git a/libc/test/src/stdbit/stdc_count_ones_ui_test.cpp b/libc/test/src/stdbit/stdc_count_ones_ui_test.cpp
new file mode 100644
index 000000000000..198e36658421
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_count_ones_ui_test.cpp
@@ -0,0 +1,21 @@
+//===-- Unittests for stdc_count_ones_ui ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_count_ones_ui.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcCountOnesUiTest, Zero) {
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_count_ones_ui(0), 0U);
+}
+
+TEST(LlvmLibcStdcCountOnesUiTest, Ones) {
+ for (unsigned i = 0U; i != UINT_WIDTH; ++i)
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_count_ones_ui(UINT_MAX >> i),
+ UINT_WIDTH - i);
+}
diff --git a/libc/test/src/stdbit/stdc_count_ones_ul_test.cpp b/libc/test/src/stdbit/stdc_count_ones_ul_test.cpp
new file mode 100644
index 000000000000..ce9d6eb081d4
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_count_ones_ul_test.cpp
@@ -0,0 +1,21 @@
+//===-- Unittests for stdc_count_ones_ul ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_count_ones_ul.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcCountOnesUlTest, Zero) {
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_count_ones_ul(0UL), 0U);
+}
+
+TEST(LlvmLibcStdcCountOnesUlTest, Ones) {
+ for (unsigned i = 0U; i != ULONG_WIDTH; ++i)
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_count_ones_ul(ULONG_MAX >> i),
+ ULONG_WIDTH - i);
+}
diff --git a/libc/test/src/stdbit/stdc_count_ones_ull_test.cpp b/libc/test/src/stdbit/stdc_count_ones_ull_test.cpp
new file mode 100644
index 000000000000..a0e69459c5a8
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_count_ones_ull_test.cpp
@@ -0,0 +1,21 @@
+//===-- Unittests for stdc_count_ones_ull ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_count_ones_ull.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcCountOnesUllTest, Zero) {
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_count_ones_ull(0ULL), 0U);
+}
+
+TEST(LlvmLibcStdcCountOnesUllTest, Ones) {
+ for (unsigned i = 0U; i != ULLONG_WIDTH; ++i)
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_count_ones_ull(ULLONG_MAX >> i),
+ ULLONG_WIDTH - i);
+}
diff --git a/libc/test/src/stdbit/stdc_count_ones_us_test.cpp b/libc/test/src/stdbit/stdc_count_ones_us_test.cpp
new file mode 100644
index 000000000000..19d342606285
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_count_ones_us_test.cpp
@@ -0,0 +1,21 @@
+//===-- Unittests for stdc_count_ones_us ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_count_ones_us.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcCountOnesUsTest, Zero) {
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_count_ones_us(0), 0U);
+}
+
+TEST(LlvmLibcStdcCountOnesUsTest, Ones) {
+ for (unsigned i = 0U; i != USHRT_WIDTH; ++i)
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_count_ones_us(USHRT_MAX >> i),
+ USHRT_WIDTH - i);
+}
diff --git a/libc/test/src/stdbit/stdc_count_zeros_uc_test.cpp b/libc/test/src/stdbit/stdc_count_zeros_uc_test.cpp
new file mode 100644
index 000000000000..3acf61aed637
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_count_zeros_uc_test.cpp
@@ -0,0 +1,21 @@
+//===-- Unittests for stdc_count_zeros_uc ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_count_zeros_uc.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcCountZerosUcTest, Zero) {
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_count_zeros_uc(0U),
+ static_cast<unsigned>(UCHAR_WIDTH));
+}
+
+TEST(LlvmLibcStdcCountZerosUcTest, Ones) {
+ for (unsigned i = 0U; i != UCHAR_WIDTH; ++i)
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_count_zeros_uc(UCHAR_MAX >> i), i);
+}
diff --git a/libc/test/src/stdbit/stdc_count_zeros_ui_test.cpp b/libc/test/src/stdbit/stdc_count_zeros_ui_test.cpp
new file mode 100644
index 000000000000..53ce1c86b40f
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_count_zeros_ui_test.cpp
@@ -0,0 +1,21 @@
+//===-- Unittests for stdc_count_zeros_ui ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_count_zeros_ui.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcCountZerosUiTest, Zero) {
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_count_zeros_ui(0U),
+ static_cast<unsigned>(UINT_WIDTH));
+}
+
+TEST(LlvmLibcStdcCountZerosUiTest, Ones) {
+ for (unsigned i = 0U; i != UINT_WIDTH; ++i)
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_count_zeros_ui(UINT_MAX >> i), i);
+}
diff --git a/libc/test/src/stdbit/stdc_count_zeros_ul_test.cpp b/libc/test/src/stdbit/stdc_count_zeros_ul_test.cpp
new file mode 100644
index 000000000000..60f01f316657
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_count_zeros_ul_test.cpp
@@ -0,0 +1,21 @@
+//===-- Unittests for stdc_count_zeros_ul ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_count_zeros_ul.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcCountZerosUlTest, Zero) {
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_count_zeros_ul(0U),
+ static_cast<unsigned>(ULONG_WIDTH));
+}
+
+TEST(LlvmLibcStdcCountZerosUlTest, Ones) {
+ for (unsigned i = 0U; i != ULONG_WIDTH; ++i)
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_count_zeros_ul(ULONG_MAX >> i), i);
+}
diff --git a/libc/test/src/stdbit/stdc_count_zeros_ull_test.cpp b/libc/test/src/stdbit/stdc_count_zeros_ull_test.cpp
new file mode 100644
index 000000000000..7da2493c0e8f
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_count_zeros_ull_test.cpp
@@ -0,0 +1,21 @@
+//===-- Unittests for stdc_count_zeros_ull --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_count_zeros_ull.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcCountZerosUllTest, Zero) {
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_count_zeros_ull(0U),
+ static_cast<unsigned>(ULLONG_WIDTH));
+}
+
+TEST(LlvmLibcStdcCountZerosUllTest, Ones) {
+ for (unsigned i = 0U; i != ULLONG_WIDTH; ++i)
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_count_zeros_ull(ULLONG_MAX >> i), i);
+}
diff --git a/libc/test/src/stdbit/stdc_count_zeros_us_test.cpp b/libc/test/src/stdbit/stdc_count_zeros_us_test.cpp
new file mode 100644
index 000000000000..e8690cb989e3
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_count_zeros_us_test.cpp
@@ -0,0 +1,21 @@
+//===-- Unittests for stdc_count_zeros_us ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_count_zeros_us.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcCountZerosUsTest, Zero) {
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_count_zeros_us(0U),
+ static_cast<unsigned>(USHRT_WIDTH));
+}
+
+TEST(LlvmLibcStdcCountZerosUsTest, Ones) {
+ for (unsigned i = 0U; i != USHRT_WIDTH; ++i)
+ EXPECT_EQ(LIBC_NAMESPACE::stdc_count_zeros_us(USHRT_MAX >> i), i);
+}
diff --git a/libc/test/src/stdbit/stdc_has_single_bit_uc_test.cpp b/libc/test/src/stdbit/stdc_has_single_bit_uc_test.cpp
new file mode 100644
index 000000000000..6212b1ec765a
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_has_single_bit_uc_test.cpp
@@ -0,0 +1,20 @@
+//===-- Unittests for stdc_has_single_bit_uc ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_has_single_bit_uc.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcHasSingleBitUcTest, Zero) {
+  EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_uc(0U), false);
+}
+
+TEST(LlvmLibcStdcHasSingleBitUcTest, OneHot) {
+  for (unsigned i = 0U; i != UCHAR_WIDTH; ++i)
+    EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_uc(1U << i), true);
+}
diff --git a/libc/test/src/stdbit/stdc_has_single_bit_ui_test.cpp b/libc/test/src/stdbit/stdc_has_single_bit_ui_test.cpp
new file mode 100644
index 000000000000..2e00507aa025
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_has_single_bit_ui_test.cpp
@@ -0,0 +1,20 @@
+//===-- Unittests for stdc_has_single_bit_ui ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_has_single_bit_ui.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcHasSingleBitUiTest, Zero) {
+  EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ui(0U), false);
+}
+
+TEST(LlvmLibcStdcHasSingleBitUiTest, OneHot) {
+  for (unsigned i = 0U; i != UINT_WIDTH; ++i)
+    EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ui(1U << i), true);
+}
diff --git a/libc/test/src/stdbit/stdc_has_single_bit_ul_test.cpp b/libc/test/src/stdbit/stdc_has_single_bit_ul_test.cpp
new file mode 100644
index 000000000000..8c0178998bbe
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_has_single_bit_ul_test.cpp
@@ -0,0 +1,20 @@
+//===-- Unittests for stdc_has_single_bit_ul ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_has_single_bit_ul.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcHasSingleBitUlTest, Zero) {
+  EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ul(0U), false);
+}
+
+TEST(LlvmLibcStdcHasSingleBitUlTest, OneHot) {
+  for (unsigned i = 0U; i != ULONG_WIDTH; ++i)
+    EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ul(1UL << i), true);
+}
diff --git a/libc/test/src/stdbit/stdc_has_single_bit_ull_test.cpp b/libc/test/src/stdbit/stdc_has_single_bit_ull_test.cpp
new file mode 100644
index 000000000000..1d9f976b6d63
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_has_single_bit_ull_test.cpp
@@ -0,0 +1,20 @@
+//===-- Unittests for stdc_has_single_bit_ull -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_has_single_bit_ull.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcHasSingleBitUllTest, Zero) {
+  EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ull(0U), false);
+}
+
+TEST(LlvmLibcStdcHasSingleBitUllTest, OneHot) {
+  for (unsigned i = 0U; i != ULLONG_WIDTH; ++i)
+    EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ull(1ULL << i), true);
+}
diff --git a/libc/test/src/stdbit/stdc_has_single_bit_us_test.cpp b/libc/test/src/stdbit/stdc_has_single_bit_us_test.cpp
new file mode 100644
index 000000000000..52c4de881044
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_has_single_bit_us_test.cpp
@@ -0,0 +1,20 @@
+//===-- Unittests for stdc_has_single_bit_us ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_has_single_bit_us.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcHasSingleBitUsTest, Zero) {
+  EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_us(0U), false);
+}
+
+TEST(LlvmLibcStdcHasSingleBitUsTest, OneHot) {
+  for (unsigned i = 0U; i != USHRT_WIDTH; ++i)
+    EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_us(1U << i), true);
+}
diff --git a/libc/test/src/stdfix/CMakeLists.txt b/libc/test/src/stdfix/CMakeLists.txt
index b6e0256bb688..4140b5b29f3b 100644
--- a/libc/test/src/stdfix/CMakeLists.txt
+++ b/libc/test/src/stdfix/CMakeLists.txt
@@ -22,6 +22,28 @@ foreach(suffix IN ITEMS hr r lr hk k lk)
)
endforeach()
+foreach(suffix IN ITEMS uhr ur ulr uhk uk)
+ add_libc_test(
+ sqrt${suffix}_test
+ SUITE
+ libc-stdfix-tests
+ HDRS
+ SqrtTest.h
+ SRCS
+ sqrt${suffix}_test.cpp
+ COMPILE_OPTIONS
+ -O3
+ -ffixed-point
+ DEPENDS
+ libc.src.stdfix.sqrt${suffix}
+ libc.src.__support.CPP.bit
+ libc.src.__support.fixed_point.fx_rep
+ libc.src.__support.fixed_point.sqrt
+ libc.src.__support.FPUtil.basic_operations
+ libc.src.__support.FPUtil.sqrt
+ )
+endforeach()
+
foreach(suffix IN ITEMS hr r lr hk k lk uhr ur ulr uhk uk ulk)
add_libc_test(
round${suffix}_test
diff --git a/libc/test/src/stdfix/SqrtTest.h b/libc/test/src/stdfix/SqrtTest.h
new file mode 100644
index 000000000000..628be0deb770
--- /dev/null
+++ b/libc/test/src/stdfix/SqrtTest.h
@@ -0,0 +1,66 @@
+//===-- Utility class to test fixed-point sqrt ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "test/UnitTest/Test.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/FPUtil/sqrt.h"
+#include "src/__support/fixed_point/fx_rep.h"
+#include "src/__support/fixed_point/sqrt.h"
+
+template <typename T> class SqrtTest : public LIBC_NAMESPACE::testing::Test {
+
+ using FXRep = LIBC_NAMESPACE::fixed_point::FXRep<T>;
+ static constexpr T zero = FXRep::ZERO();
+ static constexpr T min = FXRep::MIN();
+ static constexpr T max = FXRep::MAX();
+ static constexpr T half = static_cast<T>(0.5);
+ static constexpr T quarter = static_cast<T>(0.25);
+ static constexpr T one =
+ (FXRep::INTEGRAL_LEN > 0) ? static_cast<T>(1) : FXRep::MAX();
+ static constexpr T eps = FXRep::EPS();
+
+public:
+ typedef T (*SqrtFunc)(T);
+
+ void testSpecialNumbers(SqrtFunc func) {
+ EXPECT_EQ(zero, func(zero));
+ EXPECT_EQ(half, func(quarter));
+
+ if constexpr (FXRep::INTEGRAL_LEN) {
+ EXPECT_EQ(one, func(one));
+ EXPECT_EQ(static_cast<T>(2.0), func(static_cast<T>(4.0)));
+ }
+
+ using StorageType = typename FXRep::StorageType;
+
+ constexpr size_t COUNT = 255;
+ constexpr StorageType STEP =
+ ~StorageType(0) / static_cast<StorageType>(COUNT);
+ constexpr double ERR = 3.0 * static_cast<double>(eps);
+ StorageType x = 0;
+ for (size_t i = 0; i < COUNT; ++i, x += STEP) {
+ T v = LIBC_NAMESPACE::cpp::bit_cast<T>(x);
+ double v_d = static_cast<double>(v);
+ double errors = LIBC_NAMESPACE::fputil::abs(
+ static_cast<double>(func(v)) - LIBC_NAMESPACE::fputil::sqrt(v_d));
+ if (errors > ERR) {
+ // Print out the failure input and output.
+ EXPECT_EQ(v, zero);
+ EXPECT_EQ(func(v), zero);
+ }
+ ASSERT_TRUE(errors <= ERR);
+ }
+ }
+};
+
+#define LIST_SQRT_TESTS(T, func) \
+ using LlvmLibcSqrtTest = SqrtTest<T>; \
+ TEST_F(LlvmLibcSqrtTest, SpecialNumbers) { testSpecialNumbers(&func); } \
+ static_assert(true, "Require semicolon.")
diff --git a/libc/test/src/stdfix/sqrtuhk_test.cpp b/libc/test/src/stdfix/sqrtuhk_test.cpp
new file mode 100644
index 000000000000..d6ff5385cab8
--- /dev/null
+++ b/libc/test/src/stdfix/sqrtuhk_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for sqrtuhk ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SqrtTest.h"
+
+#include "src/stdfix/sqrtuhk.h"
+
+LIST_SQRT_TESTS(unsigned short accum, LIBC_NAMESPACE::sqrtuhk);
diff --git a/libc/test/src/stdfix/sqrtuhr_test.cpp b/libc/test/src/stdfix/sqrtuhr_test.cpp
new file mode 100644
index 000000000000..22f00a4231b3
--- /dev/null
+++ b/libc/test/src/stdfix/sqrtuhr_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for sqrtuhr ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SqrtTest.h"
+
+#include "src/stdfix/sqrtuhr.h"
+
+LIST_SQRT_TESTS(unsigned short fract, LIBC_NAMESPACE::sqrtuhr);
diff --git a/libc/test/src/stdfix/sqrtuk_test.cpp b/libc/test/src/stdfix/sqrtuk_test.cpp
new file mode 100644
index 000000000000..5a3105de1e0b
--- /dev/null
+++ b/libc/test/src/stdfix/sqrtuk_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for sqrtuk ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SqrtTest.h"
+
+#include "src/stdfix/sqrtuk.h"
+
+LIST_SQRT_TESTS(unsigned accum, LIBC_NAMESPACE::sqrtuk);
diff --git a/libc/test/src/stdfix/sqrtulr_test.cpp b/libc/test/src/stdfix/sqrtulr_test.cpp
new file mode 100644
index 000000000000..1be4e2b5e0a6
--- /dev/null
+++ b/libc/test/src/stdfix/sqrtulr_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for sqrtulr ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SqrtTest.h"
+
+#include "src/stdfix/sqrtulr.h"
+
+LIST_SQRT_TESTS(unsigned long fract, LIBC_NAMESPACE::sqrtulr);
diff --git a/libc/test/src/stdfix/sqrtur_test.cpp b/libc/test/src/stdfix/sqrtur_test.cpp
new file mode 100644
index 000000000000..12b1c2211db0
--- /dev/null
+++ b/libc/test/src/stdfix/sqrtur_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for sqrtur ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SqrtTest.h"
+
+#include "src/stdfix/sqrtur.h"
+
+LIST_SQRT_TESTS(unsigned fract, LIBC_NAMESPACE::sqrtur);
diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt
index 93c21aa994ef..6e1c86e070a8 100644
--- a/libc/test/src/stdio/CMakeLists.txt
+++ b/libc/test/src/stdio/CMakeLists.txt
@@ -121,6 +121,9 @@ endif()
if(LIBC_CONF_PRINTF_DISABLE_WRITE_INT)
list(APPEND sprintf_test_copts "-DLIBC_COPT_PRINTF_DISABLE_WRITE_INT")
endif()
+if(LIBC_CONF_PRINTF_DISABLE_FIXED_POINT)
+ list(APPEND sprintf_test_copts "-DLIBC_COPT_PRINTF_DISABLE_FIXED_POINT")
+endif()
add_fp_unittest(
sprintf_test
diff --git a/libc/test/src/stdio/sprintf_test.cpp b/libc/test/src/stdio/sprintf_test.cpp
index 186b37e2898a..b9f402027e7f 100644
--- a/libc/test/src/stdio/sprintf_test.cpp
+++ b/libc/test/src/stdio/sprintf_test.cpp
@@ -3201,6 +3201,217 @@ TEST_F(LlvmLibcSPrintfTest, FloatAutoLongDoubleConv) {
#endif // LIBC_COPT_PRINTF_DISABLE_FLOAT
+#if defined(LIBC_COMPILER_HAS_FIXED_POINT) && \
+ !defined(LIBC_COPT_PRINTF_DISABLE_FIXED_POINT)
+TEST_F(LlvmLibcSPrintfTest, FixedConv) {
+
+ // These numeric tests are potentially a little weak, but the fuzz test is
+ // more thorough than my handwritten tests tend to be.
+
+ // TODO: Replace hex literals with their appropriate fixed point literals.
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%k", 0x0); // 0.0
+ ASSERT_STREQ_LEN(written, buff, "0.000000");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%k", 0x80000000); // -0.0
+ ASSERT_STREQ_LEN(written, buff, "-0.000000");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%r", 0xffff); // -fract max
+ ASSERT_STREQ_LEN(written, buff, "-0.999969");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%R", 0xffff); // unsigned fract max
+ ASSERT_STREQ_LEN(written, buff, "0.999985");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%k", 0xffffffff); // -accum max
+ ASSERT_STREQ_LEN(written, buff, "-65535.999969");
+
+ written =
+ LIBC_NAMESPACE::sprintf(buff, "%K", 0xffffffff); // unsigned accum max
+ ASSERT_STREQ_LEN(written, buff, "65535.999985");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%r", 0x7fff); // fract max
+ ASSERT_STREQ_LEN(written, buff, "0.999969");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%k", 0x7fffffff); // accum max
+ ASSERT_STREQ_LEN(written, buff, "65535.999969");
+
+ // Length Modifier Tests.
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%hk", 0x0); // 0.0
+ ASSERT_STREQ_LEN(written, buff, "0.000000");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%hk", 0xffff); // -short accum max
+ ASSERT_STREQ_LEN(written, buff, "-255.992188");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%hr", 0x0); // 0.0
+ ASSERT_STREQ_LEN(written, buff, "0.000000");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%hr", 0xff); // -short fract max
+ ASSERT_STREQ_LEN(written, buff, "-0.992188");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%hK", 0x0); // 0.0
+ ASSERT_STREQ_LEN(written, buff, "0.000000");
+
+ written =
+ LIBC_NAMESPACE::sprintf(buff, "%hK", 0xffff); // unsigned short accum max
+ ASSERT_STREQ_LEN(written, buff, "255.996094");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%hR", 0x0); // 0.0
+ ASSERT_STREQ_LEN(written, buff, "0.000000");
+
+ written =
+ LIBC_NAMESPACE::sprintf(buff, "%hR", 0xff); // unsigned short fract max
+ ASSERT_STREQ_LEN(written, buff, "0.996094");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%lk", 0x0); // 0.0
+ ASSERT_STREQ_LEN(written, buff, "0.000000");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%lk",
+ 0xffffffffffffffff); //-long accum max
+ ASSERT_STREQ_LEN(written, buff, "-4294967296.000000");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%lr", 0x0); // 0.0
+ ASSERT_STREQ_LEN(written, buff, "0.000000");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%lr",
+ 0xffffffff); //-long fract max
+ ASSERT_STREQ_LEN(written, buff, "-1.000000");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%lK", 0x0); // 0.0
+ ASSERT_STREQ_LEN(written, buff, "0.000000");
+
+ written =
+ LIBC_NAMESPACE::sprintf(buff, "%lK",
+ 0xffffffffffffffff); // unsigned long accum max
+ ASSERT_STREQ_LEN(written, buff, "4294967296.000000");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%lR", 0x0); // 0.0
+ ASSERT_STREQ_LEN(written, buff, "0.000000");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%lR",
+ 0xffffffff); // unsigned long fract max
+ ASSERT_STREQ_LEN(written, buff, "1.000000");
+
+ // Min Width Tests.
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%10k", 0x0000a000); // 1.25
+ ASSERT_STREQ_LEN(written, buff, " 1.250000");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%10k", 0x8000a000); //-1.25
+ ASSERT_STREQ_LEN(written, buff, " -1.250000");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%8k", 0x0000a000); // 1.25
+ ASSERT_STREQ_LEN(written, buff, "1.250000");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%9k", 0x8000a000); //-1.25
+ ASSERT_STREQ_LEN(written, buff, "-1.250000");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%4k", 0x0000a000); // 1.25
+ ASSERT_STREQ_LEN(written, buff, "1.250000");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%4k", 0x8000a000); //-1.25
+ ASSERT_STREQ_LEN(written, buff, "-1.250000");
+
+ // Precision Tests.
+
+ written =
+ LIBC_NAMESPACE::sprintf(buff, "%.16K", 0xFFFFFFFF); // unsigned accum max
+ ASSERT_STREQ_LEN(written, buff, "65535.9999847412109375");
+
+ written = LIBC_NAMESPACE::sprintf(
+ buff, "%.32lK", 0xFFFFFFFFFFFFFFFF); // unsigned long accum max
+ ASSERT_STREQ_LEN(written, buff,
+ "4294967295.99999999976716935634613037109375");
+
+ written =
+ LIBC_NAMESPACE::sprintf(buff, "%.0K", 0xFFFFFFFF); // unsigned accum max
+ ASSERT_STREQ_LEN(written, buff, "65536");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%.0R", 0xFFFF); // unsigned fract max
+ ASSERT_STREQ_LEN(written, buff, "1");
+
+ // Flag Tests.
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%+k", 0x0000a000); // 1.25
+ ASSERT_STREQ_LEN(written, buff, "+1.250000");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%+k", 0x8000a000); //-1.25
+ ASSERT_STREQ_LEN(written, buff, "-1.250000");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "% k", 0x0000a000); // 1.25
+ ASSERT_STREQ_LEN(written, buff, " 1.250000");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "% k", 0x8000a000); //-1.25
+ ASSERT_STREQ_LEN(written, buff, "-1.250000");
+
+ // unsigned variants ignore sign flags.
+ written = LIBC_NAMESPACE::sprintf(buff, "%+K", 0x00014000); // 1.25
+ ASSERT_STREQ_LEN(written, buff, "1.250000");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "% K", 0x00014000); // 1.25
+ ASSERT_STREQ_LEN(written, buff, "1.250000");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%-10k", 0x0000c000); // 1.5
+ ASSERT_STREQ_LEN(written, buff, "1.500000 ");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%#.k", 0x00008000); // 1.0
+ ASSERT_STREQ_LEN(written, buff, "1.");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%#.0k", 0x0000c000); // 1.5
+ ASSERT_STREQ_LEN(written, buff, "2.");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%010k", 0x0000c000); // 1.5
+ ASSERT_STREQ_LEN(written, buff, "001.500000");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%010k", 0x8000c000); //-1.5
+ ASSERT_STREQ_LEN(written, buff, "-01.500000");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%+- #0k", 0); // 0.0
+ ASSERT_STREQ_LEN(written, buff, "+0.000000");
+
+ // Combined Tests.
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%10.2k", 0x0004feb8); // 9.99
+ ASSERT_STREQ_LEN(written, buff, " 9.99");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%5.1k", 0x0004feb8); // 9.99
+ ASSERT_STREQ_LEN(written, buff, " 10.0");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%-10.2k", 0x0004feb8); // 9.99
+ ASSERT_STREQ_LEN(written, buff, "9.99 ");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%-5.1k", 0x0004feb8); // 9.99
+ ASSERT_STREQ_LEN(written, buff, "10.0 ");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%-5.1k", 0x00000001); // accum min
+ ASSERT_STREQ_LEN(written, buff, "0.0 ");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%30k", 0x7fffffff); // accum max
+ ASSERT_STREQ_LEN(written, buff, " 65535.999969");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%-30k", 0x7fffffff); // accum max
+ ASSERT_STREQ_LEN(written, buff, "65535.999969 ");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%20.2lK",
+ 0x3b9ac9ffFD70A3D7); // 999999999.99
+ ASSERT_STREQ_LEN(written, buff, " 999999999.99");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%20.1lK",
+ 0x3b9ac9ffFD70A3D7); // 999999999.99
+ ASSERT_STREQ_LEN(written, buff, " 1000000000.0");
+
+ written = LIBC_NAMESPACE::sprintf(buff, "%12.3R %-12.3k", 0x1999,
+ 0x00800000); // 0.1, 256.0
+ ASSERT_STREQ_LEN(written, buff, " 0.100 256.000 ");
+
+ written =
+ LIBC_NAMESPACE::sprintf(buff, "%+-#12.3lk % 012.3k", 0x000000001013a92a,
+ 0x02740000); // 0.126, 1256.0
+ ASSERT_STREQ_LEN(written, buff, "+0.126 0001256.000");
+}
+#endif // defined(LIBC_COMPILER_HAS_FIXED_POINT) &&
+ // !defined(LIBC_COPT_PRINTF_DISABLE_FIXED_POINT)
+
#ifndef LIBC_COPT_PRINTF_DISABLE_WRITE_INT
TEST(LlvmLibcSPrintfTest, WriteIntConv) {
char buff[64];
diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt
index 5826cfe8d4ca..5488a61c4ef1 100644
--- a/libc/test/src/stdlib/CMakeLists.txt
+++ b/libc/test/src/stdlib/CMakeLists.txt
@@ -54,20 +54,17 @@ add_libc_test(
libc.src.stdlib.atoll
)
-# This fails on NVPTX where the output value is one-off of the expected value.
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
- add_fp_unittest(
- strtod_test
- SUITE
- libc-stdlib-tests
- SRCS
- strtod_test.cpp
- DEPENDS
- libc.src.errno.errno
- libc.src.stdlib.strtod
- libc.src.__support.FPUtil.fenv_impl
- )
-endif()
+add_fp_unittest(
+ strtod_test
+ SUITE
+ libc-stdlib-tests
+ SRCS
+ strtod_test.cpp
+ DEPENDS
+ libc.src.errno.errno
+ libc.src.stdlib.strtod
+ libc.src.__support.FPUtil.fenv_impl
+)
add_fp_unittest(
strtof_test
@@ -126,20 +123,17 @@ add_libc_test(
.strtol_test_support
)
-# This fails on NVPTX where the output value is one-off of the expected value.
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
- add_libc_test(
- strtold_test
- SUITE
- libc-stdlib-tests
- SRCS
- strtold_test.cpp
- DEPENDS
- libc.src.errno.errno
- libc.src.__support.uint128
- libc.src.stdlib.strtold
- )
-endif()
+add_libc_test(
+ strtold_test
+ SUITE
+ libc-stdlib-tests
+ SRCS
+ strtold_test.cpp
+ DEPENDS
+ libc.src.errno.errno
+ libc.src.__support.uint128
+ libc.src.stdlib.strtold
+)
add_libc_test(
strtoll_test
diff --git a/libc/test/src/time/CMakeLists.txt b/libc/test/src/time/CMakeLists.txt
index ebb0998feb23..51cacef0a62f 100644
--- a/libc/test/src/time/CMakeLists.txt
+++ b/libc/test/src/time/CMakeLists.txt
@@ -102,21 +102,17 @@ add_libc_unittest(
libc.src.__support.CPP.limits
)
-# Sleeping is not supported on older NVPTX architectures.
-set(unsupported_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62")
-if (NOT ("${LIBC_GPU_TARGET_ARCHITECTURE}" IN_LIST unsupported_architectures))
- add_libc_test(
- nanosleep_test
- SUITE
- libc_time_unittests
- SRCS
- nanosleep_test.cpp
- DEPENDS
- libc.include.time
- libc.src.time.nanosleep
- libc.src.errno.errno
- )
-endif()
+add_libc_test(
+ nanosleep_test
+ SUITE
+ libc_time_unittests
+ SRCS
+ nanosleep_test.cpp
+ DEPENDS
+ libc.include.time
+ libc.src.time.nanosleep
+ libc.src.errno.errno
+)
add_libc_unittest(
time_test
diff --git a/libc/utils/LibcTableGenUtil/CMakeLists.txt b/libc/utils/LibcTableGenUtil/CMakeLists.txt
index dca6a7bb8306..60208ed790d5 100644
--- a/libc/utils/LibcTableGenUtil/CMakeLists.txt
+++ b/libc/utils/LibcTableGenUtil/CMakeLists.txt
@@ -5,5 +5,5 @@ add_llvm_library(
DISABLE_LLVM_LINK_LLVM_DYLIB
LINK_COMPONENTS Support TableGen
)
-target_include_directories(LibcTableGenUtil PUBLIC ${LIBC_SOURCE_DIR})
+target_include_directories(LibcTableGenUtil PUBLIC ${LIBC_SOURCE_DIR} ${LIBC_SOURCE_DIR}/include)
target_include_directories(LibcTableGenUtil PRIVATE ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR})
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
index e2aabb08c11d..d74d65e89938 100644
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -11,7 +11,7 @@
#include "utils/gpu/server/llvmlibc_rpc_server.h"
-#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
+#include "llvm-libc-types/test_rpc_opcodes_t.h"
#include <cstddef>
#include <cstdint>
diff --git a/libcxx/cmake/caches/AndroidNDK.cmake b/libcxx/cmake/caches/AndroidNDK.cmake
index 23fbc3fa6bdd..298518781e9b 100644
--- a/libcxx/cmake/caches/AndroidNDK.cmake
+++ b/libcxx/cmake/caches/AndroidNDK.cmake
@@ -35,7 +35,3 @@ set(CMAKE_CXX_COMPILER_WORKS ON CACHE BOOL "")
# them.
set(LIBCXX_TEST_CONFIG "llvm-libc++-android-ndk.cfg.in" CACHE STRING "")
set(LIBCXXABI_TEST_CONFIG "llvm-libc++abi-android-ndk.cfg.in" CACHE STRING "")
-
-# CMAKE_SOURCE_DIR refers to the "<monorepo>/runtimes" directory.
-set(LIBCXX_EXECUTOR "${CMAKE_SOURCE_DIR}/../libcxx/utils/adb_run.py" CACHE STRING "")
-set(LIBCXXABI_EXECUTOR "${LIBCXX_EXECUTOR}" CACHE STRING "")
diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst
index a7f108c44e71..78c6bb87a5a4 100644
--- a/libcxx/docs/ReleaseNotes/19.rst
+++ b/libcxx/docs/ReleaseNotes/19.rst
@@ -51,8 +51,6 @@ Improvements and New Features
Deprecations and Removals
-------------------------
-- TODO: The ``LIBCXX_EXECUTOR`` CMake variables have been removed.
-
- TODO: The ``LIBCXX_ENABLE_ASSERTIONS`` CMake variable that was used to enable the safe mode has been deprecated and setting
it triggers an error; use the ``LIBCXX_HARDENING_MODE`` CMake variable with the value ``extensive`` instead. Similarly,
the ``_LIBCPP_ENABLE_ASSERTIONS`` macro has been deprecated (setting it to ``1`` still enables the extensive mode in
@@ -75,6 +73,11 @@ Deprecations and Removals
- The ``_LIBCPP_INLINE_VISIBILITY`` and ``_VSTD`` macros have been removed in LLVM 19.
+- The ``_LIBCPP_ATOMIC_ONLY_USE_BUILTINS`` configuration option has been removed in LLVM 19. This should not affect
+ many users, except perhaps users using the library with ``-ffreestanding`` with a toolchain where compiler-rt or
+ libatomic is not available. If you are one such user, please reach out to the libc++ developers so we can collaborate
+ on a path for supporting atomics properly on freestanding platforms.
+
Upcoming Deprecations and Removals
----------------------------------
@@ -98,4 +101,5 @@ TODO
Build System Changes
--------------------
-TODO
+- The ``LIBCXX_EXECUTOR`` and ``LIBCXXABI_EXECUTOR`` CMake variables have been removed. Please
+ set ``LIBCXX_TEST_PARAMS`` to ``executor=<...>`` instead.
diff --git a/libcxx/include/__atomic/aliases.h b/libcxx/include/__atomic/aliases.h
index 0fa289de54b0..db34f5ec02d7 100644
--- a/libcxx/include/__atomic/aliases.h
+++ b/libcxx/include/__atomic/aliases.h
@@ -92,7 +92,7 @@ using __largest_lock_free_type = short;
# elif ATOMIC_CHAR_LOCK_FREE == 2
using __largest_lock_free_type = char;
# else
-# define _LIBCPP_NO_LOCK_FREE_TYPES // There are no lockfree types (this can happen in freestanding)
+# define _LIBCPP_NO_LOCK_FREE_TYPES // There are no lockfree types (this can happen on unusual platforms)
# endif
# ifndef _LIBCPP_NO_LOCK_FREE_TYPES
diff --git a/libcxx/include/__atomic/cxx_atomic_impl.h b/libcxx/include/__atomic/cxx_atomic_impl.h
index 1a0b808a0cb1..b900cc135f78 100644
--- a/libcxx/include/__atomic/cxx_atomic_impl.h
+++ b/libcxx/include/__atomic/cxx_atomic_impl.h
@@ -9,16 +9,13 @@
#ifndef _LIBCPP___ATOMIC_CXX_ATOMIC_IMPL_H
#define _LIBCPP___ATOMIC_CXX_ATOMIC_IMPL_H
-#include <__atomic/is_always_lock_free.h>
#include <__atomic/memory_order.h>
#include <__config>
#include <__memory/addressof.h>
-#include <__type_traits/conditional.h>
#include <__type_traits/is_assignable.h>
#include <__type_traits/is_trivially_copyable.h>
#include <__type_traits/remove_const.h>
#include <cstddef>
-#include <cstring>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
@@ -26,7 +23,7 @@
_LIBCPP_BEGIN_NAMESPACE_STD
-#if defined(_LIBCPP_HAS_GCC_ATOMIC_IMP) || defined(_LIBCPP_ATOMIC_ONLY_USE_BUILTINS)
+#if defined(_LIBCPP_HAS_GCC_ATOMIC_IMP)
// [atomics.types.generic]p1 guarantees _Tp is trivially copyable. Because
// the default operator= in an object is not volatile, a byte-by-byte copy
@@ -44,10 +41,6 @@ _LIBCPP_HIDE_FROM_ABI void __cxx_atomic_assign_volatile(_Tp volatile& __a_value,
*__to++ = *__from++;
}
-#endif
-
-#if defined(_LIBCPP_HAS_GCC_ATOMIC_IMP)
-
template <typename _Tp>
struct __cxx_atomic_base_impl {
_LIBCPP_HIDE_FROM_ABI
@@ -529,289 +522,7 @@ __cxx_atomic_fetch_xor(__cxx_atomic_base_impl<_Tp>* __a, _Tp __pattern, memory_o
#endif // _LIBCPP_HAS_GCC_ATOMIC_IMP, _LIBCPP_HAS_C_ATOMIC_IMP
-#ifdef _LIBCPP_ATOMIC_ONLY_USE_BUILTINS
-
-template <typename _Tp>
-struct __cxx_atomic_lock_impl {
- _LIBCPP_HIDE_FROM_ABI __cxx_atomic_lock_impl() _NOEXCEPT : __a_value(), __a_lock(0) {}
- _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit __cxx_atomic_lock_impl(_Tp value) _NOEXCEPT
- : __a_value(value),
- __a_lock(0) {}
-
- _Tp __a_value;
- mutable __cxx_atomic_base_impl<_LIBCPP_ATOMIC_FLAG_TYPE> __a_lock;
-
- _LIBCPP_HIDE_FROM_ABI void __lock() const volatile {
- while (1 == __cxx_atomic_exchange(&__a_lock, _LIBCPP_ATOMIC_FLAG_TYPE(true), memory_order_acquire))
- /*spin*/;
- }
- _LIBCPP_HIDE_FROM_ABI void __lock() const {
- while (1 == __cxx_atomic_exchange(&__a_lock, _LIBCPP_ATOMIC_FLAG_TYPE(true), memory_order_acquire))
- /*spin*/;
- }
- _LIBCPP_HIDE_FROM_ABI void __unlock() const volatile {
- __cxx_atomic_store(&__a_lock, _LIBCPP_ATOMIC_FLAG_TYPE(false), memory_order_release);
- }
- _LIBCPP_HIDE_FROM_ABI void __unlock() const {
- __cxx_atomic_store(&__a_lock, _LIBCPP_ATOMIC_FLAG_TYPE(false), memory_order_release);
- }
- _LIBCPP_HIDE_FROM_ABI _Tp __read() const volatile {
- __lock();
- _Tp __old;
- __cxx_atomic_assign_volatile(__old, __a_value);
- __unlock();
- return __old;
- }
- _LIBCPP_HIDE_FROM_ABI _Tp __read() const {
- __lock();
- _Tp __old = __a_value;
- __unlock();
- return __old;
- }
- _LIBCPP_HIDE_FROM_ABI void __read_inplace(_Tp* __dst) const volatile {
- __lock();
- __cxx_atomic_assign_volatile(*__dst, __a_value);
- __unlock();
- }
- _LIBCPP_HIDE_FROM_ABI void __read_inplace(_Tp* __dst) const {
- __lock();
- *__dst = __a_value;
- __unlock();
- }
-};
-
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI void __cxx_atomic_init(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __val) {
- __cxx_atomic_assign_volatile(__a->__a_value, __val);
-}
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI void __cxx_atomic_init(__cxx_atomic_lock_impl<_Tp>* __a, _Tp __val) {
- __a->__a_value = __val;
-}
-
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI void __cxx_atomic_store(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __val, memory_order) {
- __a->__lock();
- __cxx_atomic_assign_volatile(__a->__a_value, __val);
- __a->__unlock();
-}
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI void __cxx_atomic_store(__cxx_atomic_lock_impl<_Tp>* __a, _Tp __val, memory_order) {
- __a->__lock();
- __a->__a_value = __val;
- __a->__unlock();
-}
-
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_load(const volatile __cxx_atomic_lock_impl<_Tp>* __a, memory_order) {
- return __a->__read();
-}
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_load(const __cxx_atomic_lock_impl<_Tp>* __a, memory_order) {
- return __a->__read();
-}
-
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI void
-__cxx_atomic_load(const volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __dst, memory_order) {
- __a->__read_inplace(__dst);
-}
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI void __cxx_atomic_load(const __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __dst, memory_order) {
- __a->__read_inplace(__dst);
-}
-
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_exchange(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __value, memory_order) {
- __a->__lock();
- _Tp __old;
- __cxx_atomic_assign_volatile(__old, __a->__a_value);
- __cxx_atomic_assign_volatile(__a->__a_value, __value);
- __a->__unlock();
- return __old;
-}
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_exchange(__cxx_atomic_lock_impl<_Tp>* __a, _Tp __value, memory_order) {
- __a->__lock();
- _Tp __old = __a->__a_value;
- __a->__a_value = __value;
- __a->__unlock();
- return __old;
-}
-
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI bool __cxx_atomic_compare_exchange_strong(
- volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) {
- _Tp __temp;
- __a->__lock();
- __cxx_atomic_assign_volatile(__temp, __a->__a_value);
- bool __ret = (std::memcmp(&__temp, __expected, sizeof(_Tp)) == 0);
- if (__ret)
- __cxx_atomic_assign_volatile(__a->__a_value, __value);
- else
- __cxx_atomic_assign_volatile(*__expected, __a->__a_value);
- __a->__unlock();
- return __ret;
-}
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI bool __cxx_atomic_compare_exchange_strong(
- __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) {
- __a->__lock();
- bool __ret = (std::memcmp(&__a->__a_value, __expected, sizeof(_Tp)) == 0);
- if (__ret)
- std::memcpy(&__a->__a_value, &__value, sizeof(_Tp));
- else
- std::memcpy(__expected, &__a->__a_value, sizeof(_Tp));
- __a->__unlock();
- return __ret;
-}
-
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI bool __cxx_atomic_compare_exchange_weak(
- volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) {
- _Tp __temp;
- __a->__lock();
- __cxx_atomic_assign_volatile(__temp, __a->__a_value);
- bool __ret = (std::memcmp(&__temp, __expected, sizeof(_Tp)) == 0);
- if (__ret)
- __cxx_atomic_assign_volatile(__a->__a_value, __value);
- else
- __cxx_atomic_assign_volatile(*__expected, __a->__a_value);
- __a->__unlock();
- return __ret;
-}
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI bool __cxx_atomic_compare_exchange_weak(
- __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) {
- __a->__lock();
- bool __ret = (std::memcmp(&__a->__a_value, __expected, sizeof(_Tp)) == 0);
- if (__ret)
- std::memcpy(&__a->__a_value, &__value, sizeof(_Tp));
- else
- std::memcpy(__expected, &__a->__a_value, sizeof(_Tp));
- __a->__unlock();
- return __ret;
-}
-
-template <typename _Tp, typename _Td>
-_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Td __delta, memory_order) {
- __a->__lock();
- _Tp __old;
- __cxx_atomic_assign_volatile(__old, __a->__a_value);
- __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old + __delta));
- __a->__unlock();
- return __old;
-}
-template <typename _Tp, typename _Td>
-_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp>* __a, _Td __delta, memory_order) {
- __a->__lock();
- _Tp __old = __a->__a_value;
- __a->__a_value += __delta;
- __a->__unlock();
- return __old;
-}
-
-template <typename _Tp, typename _Td>
-_LIBCPP_HIDE_FROM_ABI _Tp*
-__cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp*>* __a, ptrdiff_t __delta, memory_order) {
- __a->__lock();
- _Tp* __old;
- __cxx_atomic_assign_volatile(__old, __a->__a_value);
- __cxx_atomic_assign_volatile(__a->__a_value, __old + __delta);
- __a->__unlock();
- return __old;
-}
-template <typename _Tp, typename _Td>
-_LIBCPP_HIDE_FROM_ABI _Tp* __cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp*>* __a, ptrdiff_t __delta, memory_order) {
- __a->__lock();
- _Tp* __old = __a->__a_value;
- __a->__a_value += __delta;
- __a->__unlock();
- return __old;
-}
-
-template <typename _Tp, typename _Td>
-_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_sub(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Td __delta, memory_order) {
- __a->__lock();
- _Tp __old;
- __cxx_atomic_assign_volatile(__old, __a->__a_value);
- __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old - __delta));
- __a->__unlock();
- return __old;
-}
-template <typename _Tp, typename _Td>
-_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_sub(__cxx_atomic_lock_impl<_Tp>* __a, _Td __delta, memory_order) {
- __a->__lock();
- _Tp __old = __a->__a_value;
- __a->__a_value -= __delta;
- __a->__unlock();
- return __old;
-}
-
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI _Tp
-__cxx_atomic_fetch_and(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __pattern, memory_order) {
- __a->__lock();
- _Tp __old;
- __cxx_atomic_assign_volatile(__old, __a->__a_value);
- __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old & __pattern));
- __a->__unlock();
- return __old;
-}
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_and(__cxx_atomic_lock_impl<_Tp>* __a, _Tp __pattern, memory_order) {
- __a->__lock();
- _Tp __old = __a->__a_value;
- __a->__a_value &= __pattern;
- __a->__unlock();
- return __old;
-}
-
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI _Tp
-__cxx_atomic_fetch_or(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __pattern, memory_order) {
- __a->__lock();
- _Tp __old;
- __cxx_atomic_assign_volatile(__old, __a->__a_value);
- __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old | __pattern));
- __a->__unlock();
- return __old;
-}
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_or(__cxx_atomic_lock_impl<_Tp>* __a, _Tp __pattern, memory_order) {
- __a->__lock();
- _Tp __old = __a->__a_value;
- __a->__a_value |= __pattern;
- __a->__unlock();
- return __old;
-}
-
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI _Tp
-__cxx_atomic_fetch_xor(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __pattern, memory_order) {
- __a->__lock();
- _Tp __old;
- __cxx_atomic_assign_volatile(__old, __a->__a_value);
- __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old ^ __pattern));
- __a->__unlock();
- return __old;
-}
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_xor(__cxx_atomic_lock_impl<_Tp>* __a, _Tp __pattern, memory_order) {
- __a->__lock();
- _Tp __old = __a->__a_value;
- __a->__a_value ^= __pattern;
- __a->__unlock();
- return __old;
-}
-
-template <typename _Tp,
- typename _Base = typename conditional<__libcpp_is_always_lock_free<_Tp>::__value,
- __cxx_atomic_base_impl<_Tp>,
- __cxx_atomic_lock_impl<_Tp> >::type>
-#else
template <typename _Tp, typename _Base = __cxx_atomic_base_impl<_Tp> >
-#endif //_LIBCPP_ATOMIC_ONLY_USE_BUILTINS
struct __cxx_atomic_impl : public _Base {
static_assert(is_trivially_copyable<_Tp>::value, "std::atomic<T> requires that 'T' be a trivially copyable type");
diff --git a/libcxx/include/__availability b/libcxx/include/__availability
index c5069a027750..78438c55a3b7 100644
--- a/libcxx/include/__availability
+++ b/libcxx/include/__availability
@@ -101,12 +101,6 @@
# define _LIBCPP_AVAILABILITY_HAS_BAD_ANY_CAST 1
# define _LIBCPP_AVAILABILITY_BAD_ANY_CAST
-// These macros controls the availability of __cxa_init_primary_exception
-// in the built library, which std::make_exception_ptr might use
-// (see libcxx/include/__exception/exception_ptr.h).
-# define _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION 1
-# define _LIBCPP_AVAILABILITY_INIT_PRIMARY_EXCEPTION
-
// These macros control the availability of all parts of <filesystem> that
// depend on something in the dylib.
# define _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY 1
@@ -114,11 +108,6 @@
# define _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY_PUSH
# define _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY_POP
-// This controls the availability of floating-point std::to_chars functions.
-// These overloads were added later than the integer overloads.
-# define _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT 1
-# define _LIBCPP_AVAILABILITY_TO_CHARS_FLOATING_POINT
-
// This controls the availability of the C++20 synchronization library,
// which requires shared library support for various operations
// (see libcxx/src/atomic.cpp). This includes <barier>, <latch>,
@@ -126,6 +115,24 @@
# define _LIBCPP_AVAILABILITY_HAS_SYNC 1
# define _LIBCPP_AVAILABILITY_SYNC
+// Enable additional explicit instantiations of iostreams components. This
+// reduces the number of weak definitions generated in programs that use
+// iostreams by providing a single strong definition in the shared library.
+//
+// TODO: Enable additional explicit instantiations on GCC once it supports exclude_from_explicit_instantiation,
+// or once libc++ doesn't use the attribute anymore.
+// TODO: Enable them on Windows once https://llvm.org/PR41018 has been fixed.
+# if !defined(_LIBCPP_COMPILER_GCC) && !defined(_WIN32)
+# define _LIBCPP_AVAILABILITY_HAS_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1 1
+# else
+# define _LIBCPP_AVAILABILITY_HAS_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1 0
+# endif
+
+// This controls the availability of floating-point std::to_chars functions.
+// These overloads were added later than the integer overloads.
+# define _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT 1
+# define _LIBCPP_AVAILABILITY_TO_CHARS_FLOATING_POINT
+
// This controls whether the library claims to provide a default verbose
// termination function, and consequently whether the headers will try
// to use it when the mechanism isn't overriden at compile-time.
@@ -137,10 +144,11 @@
# define _LIBCPP_AVAILABILITY_HAS_PMR 1
# define _LIBCPP_AVAILABILITY_PMR
-// This controls the availability of the C++20 time zone database.
-// The parser code is built in the library.
-# define _LIBCPP_AVAILABILITY_HAS_TZDB 1
-# define _LIBCPP_AVAILABILITY_TZDB
+// These macros controls the availability of __cxa_init_primary_exception
+// in the built library, which std::make_exception_ptr might use
+// (see libcxx/include/__exception/exception_ptr.h).
+# define _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION 1
+# define _LIBCPP_AVAILABILITY_INIT_PRIMARY_EXCEPTION
// This controls the availability of C++23 <print>, which
// has a dependency on the built library (it needs access to
@@ -148,18 +156,10 @@
# define _LIBCPP_AVAILABILITY_HAS_PRINT 1
# define _LIBCPP_AVAILABILITY_PRINT
-// Enable additional explicit instantiations of iostreams components. This
-// reduces the number of weak definitions generated in programs that use
-// iostreams by providing a single strong definition in the shared library.
-//
-// TODO: Enable additional explicit instantiations on GCC once it supports exclude_from_explicit_instantiation,
-// or once libc++ doesn't use the attribute anymore.
-// TODO: Enable them on Windows once https://llvm.org/PR41018 has been fixed.
-# if !defined(_LIBCPP_COMPILER_GCC) && !defined(_WIN32)
-# define _LIBCPP_AVAILABILITY_HAS_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1 1
-# else
-# define _LIBCPP_AVAILABILITY_HAS_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1 0
-# endif
+// This controls the availability of the C++20 time zone database.
+// The parser code is built in the library.
+# define _LIBCPP_AVAILABILITY_HAS_TZDB 1
+# define _LIBCPP_AVAILABILITY_TZDB
#elif defined(__APPLE__)
diff --git a/libcxx/include/__config b/libcxx/include/__config
index 0797880cb2f5..942bbe7cbb93 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -1200,9 +1200,6 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c
# ifndef _LIBCPP_ATOMIC_FLAG_TYPE
# define _LIBCPP_ATOMIC_FLAG_TYPE bool
# endif
-# ifdef _LIBCPP_FREESTANDING
-# define _LIBCPP_ATOMIC_ONLY_USE_BUILTINS
-# endif
# endif
# if defined(__FreeBSD__) && defined(__clang__) && __has_attribute(__no_thread_safety_analysis__)
diff --git a/libcxx/include/scoped_allocator b/libcxx/include/scoped_allocator
index eff6fbdf6edd..fa6c6c5d20d8 100644
--- a/libcxx/include/scoped_allocator
+++ b/libcxx/include/scoped_allocator
@@ -334,12 +334,12 @@ struct __outermost<_Alloc, true> {
template <class _OuterAlloc, class... _InnerAllocs>
class _LIBCPP_TEMPLATE_VIS scoped_allocator_adaptor<_OuterAlloc, _InnerAllocs...>
: public __scoped_allocator_storage<_OuterAlloc, _InnerAllocs...> {
- typedef __scoped_allocator_storage<_OuterAlloc, _InnerAllocs...> base;
+ typedef __scoped_allocator_storage<_OuterAlloc, _InnerAllocs...> _Base;
typedef allocator_traits<_OuterAlloc> _OuterTraits;
public:
typedef _OuterAlloc outer_allocator_type;
- typedef typename base::inner_allocator_type inner_allocator_type;
+ typedef typename _Base::inner_allocator_type inner_allocator_type;
typedef typename _OuterTraits::size_type size_type;
typedef typename _OuterTraits::difference_type difference_type;
typedef typename _OuterTraits::pointer pointer;
@@ -365,29 +365,29 @@ public:
template <class _OuterA2, __enable_if_t<is_constructible<outer_allocator_type, _OuterA2>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI
scoped_allocator_adaptor(_OuterA2&& __outer_alloc, const _InnerAllocs&... __inner_allocs) _NOEXCEPT
- : base(std::forward<_OuterA2>(__outer_alloc), __inner_allocs...) {}
+ : _Base(std::forward<_OuterA2>(__outer_alloc), __inner_allocs...) {}
// scoped_allocator_adaptor(const scoped_allocator_adaptor& __other) = default;
template <class _OuterA2, __enable_if_t<is_constructible<outer_allocator_type, const _OuterA2&>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI
scoped_allocator_adaptor(const scoped_allocator_adaptor<_OuterA2, _InnerAllocs...>& __other) _NOEXCEPT
- : base(__other) {}
+ : _Base(__other) {}
template <class _OuterA2, __enable_if_t<is_constructible<outer_allocator_type, _OuterA2>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI
scoped_allocator_adaptor(scoped_allocator_adaptor<_OuterA2, _InnerAllocs...>&& __other) _NOEXCEPT
- : base(std::move(__other)) {}
+ : _Base(std::move(__other)) {}
// scoped_allocator_adaptor& operator=(const scoped_allocator_adaptor&) = default;
// scoped_allocator_adaptor& operator=(scoped_allocator_adaptor&&) = default;
// ~scoped_allocator_adaptor() = default;
- _LIBCPP_HIDE_FROM_ABI inner_allocator_type& inner_allocator() _NOEXCEPT { return base::inner_allocator(); }
+ _LIBCPP_HIDE_FROM_ABI inner_allocator_type& inner_allocator() _NOEXCEPT { return _Base::inner_allocator(); }
_LIBCPP_HIDE_FROM_ABI const inner_allocator_type& inner_allocator() const _NOEXCEPT {
- return base::inner_allocator();
+ return _Base::inner_allocator();
}
- _LIBCPP_HIDE_FROM_ABI outer_allocator_type& outer_allocator() _NOEXCEPT { return base::outer_allocator(); }
+ _LIBCPP_HIDE_FROM_ABI outer_allocator_type& outer_allocator() _NOEXCEPT { return _Base::outer_allocator(); }
_LIBCPP_HIDE_FROM_ABI const outer_allocator_type& outer_allocator() const _NOEXCEPT {
- return base::outer_allocator();
+ return _Base::outer_allocator();
}
_LIBCPP_NODISCARD_AFTER_CXX17 _LIBCPP_HIDE_FROM_ABI pointer allocate(size_type __n) {
@@ -472,12 +472,12 @@ public:
}
_LIBCPP_HIDE_FROM_ABI scoped_allocator_adaptor select_on_container_copy_construction() const _NOEXCEPT {
- return base::select_on_container_copy_construction();
+ return _Base::select_on_container_copy_construction();
}
private:
_LIBCPP_HIDE_FROM_ABI explicit scoped_allocator_adaptor(
- outer_allocator_type&& __o, inner_allocator_type&& __i) _NOEXCEPT : base(std::move(__o), std::move(__i)) {}
+ outer_allocator_type&& __o, inner_allocator_type&& __i) _NOEXCEPT : _Base(std::move(__o), std::move(__i)) {}
template <class _Tp, class... _Args>
_LIBCPP_HIDE_FROM_ABI void __construct(integral_constant<int, 0>, _Tp* __p, _Args&&... __args) {
diff --git a/libcxx/lib/abi/CHANGELOG.TXT b/libcxx/lib/abi/CHANGELOG.TXT
index 7ff604959f4d..7a8d5052a083 100644
--- a/libcxx/lib/abi/CHANGELOG.TXT
+++ b/libcxx/lib/abi/CHANGELOG.TXT
@@ -13,6 +13,71 @@ To generate a summary, re-generate the new ABI list using the
New entries should be added directly below the "Version" header.
------------
+Version 19.0
+------------
+
+* [libc++] Always keep libc++abi re-exports up-to-date
+
+ This patch makes sure that the set of libc++abi symbols re-exported from libc++
+ is up-to-date with the symbols that libc++abi re-exports. As such, it adds several
+ symbols that were left out of the libc++ re-exports list. Exporting new symbols
+ is not an ABI break.
+
+ <arch>-apple-darwin
+ -------------------
+ Symbol reexported: ___cxa_current_primary_exception
+ Symbol reexported: ___cxa_decrement_exception_refcount
+ Symbol reexported: ___cxa_increment_exception_refcount
+ Symbol reexported: ___cxa_new_handler
+ Symbol reexported: ___cxa_rethrow_primary_exception
+ Symbol reexported: ___cxa_terminate_handler
+ Symbol reexported: ___cxa_uncaught_exception
+ Symbol reexported: ___cxa_unexpected_handler
+ Symbol reexported: __ZTIDh
+ Symbol reexported: __ZTIDu
+ Symbol reexported: __ZTIg
+ Symbol reexported: __ZTIn
+ Symbol reexported: __ZTIN10__cxxabiv116__enum_type_infoE
+ Symbol reexported: __ZTIN10__cxxabiv116__shim_type_infoE
+ Symbol reexported: __ZTIN10__cxxabiv117__array_type_infoE
+ Symbol reexported: __ZTIN10__cxxabiv117__class_type_infoE
+ Symbol reexported: __ZTIN10__cxxabiv117__pbase_type_infoE
+ Symbol reexported: __ZTIN10__cxxabiv119__pointer_type_infoE
+ Symbol reexported: __ZTIN10__cxxabiv120__function_type_infoE
+ Symbol reexported: __ZTIN10__cxxabiv120__si_class_type_infoE
+ Symbol reexported: __ZTIN10__cxxabiv121__vmi_class_type_infoE
+ Symbol reexported: __ZTIN10__cxxabiv123__fundamental_type_infoE
+ Symbol reexported: __ZTIN10__cxxabiv129__pointer_to_member_type_infoE
+ Symbol reexported: __ZTIo
+ Symbol reexported: __ZTIPDh
+ Symbol reexported: __ZTIPDu
+ Symbol reexported: __ZTIPg
+ Symbol reexported: __ZTIPKDh
+ Symbol reexported: __ZTIPKDu
+ Symbol reexported: __ZTIPKg
+ Symbol reexported: __ZTIPKn
+ Symbol reexported: __ZTIPKo
+ Symbol reexported: __ZTIPn
+ Symbol reexported: __ZTIPo
+ Symbol reexported: __ZTSDh
+ Symbol reexported: __ZTSDu
+ Symbol reexported: __ZTSg
+ Symbol reexported: __ZTSn
+ Symbol reexported: __ZTSN10__cxxabiv116__shim_type_infoE
+ Symbol reexported: __ZTSo
+ Symbol reexported: __ZTSPDh
+ Symbol reexported: __ZTSPDu
+ Symbol reexported: __ZTSPg
+ Symbol reexported: __ZTSPKDh
+ Symbol reexported: __ZTSPKDu
+ Symbol reexported: __ZTSPKg
+ Symbol reexported: __ZTSPKn
+ Symbol reexported: __ZTSPKo
+ Symbol reexported: __ZTSPn
+ Symbol reexported: __ZTSPo
+ Symbol reexported: __ZTVN10__cxxabiv116__shim_type_infoE
+
+------------
Version 18.0
------------
diff --git a/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist
index 2064f45bf8c0..46353986f5d7 100644
--- a/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist
+++ b/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist
@@ -70,26 +70,46 @@
{'is_defined': False, 'name': '__ZSt15get_new_handlerv', 'type': 'U'}
{'is_defined': False, 'name': '__ZSt15set_new_handlerPFvvE', 'type': 'U'}
{'is_defined': False, 'name': '__ZSt9terminatev', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIDh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIDi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIDn', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIDs', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIDu', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIN10__cxxabiv116__enum_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIN10__cxxabiv116__shim_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIN10__cxxabiv117__array_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIN10__cxxabiv117__class_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIN10__cxxabiv117__pbase_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIN10__cxxabiv119__pointer_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIN10__cxxabiv120__function_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIN10__cxxabiv120__si_class_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIN10__cxxabiv121__vmi_class_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIN10__cxxabiv123__fundamental_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIN10__cxxabiv129__pointer_to_member_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIPDh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPDi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPDn', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPDs', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIPDu', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIPKDh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKDi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKDn', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKDs', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIPKDu', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKa', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKb', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKc', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKd', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKe', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKf', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIPKg', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKj', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKl', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKm', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIPKn', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIPKo', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKs', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKt', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKv', 'type': 'U'}
@@ -102,11 +122,14 @@
{'is_defined': False, 'name': '__ZTIPd', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPe', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPf', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIPg', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPj', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPl', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPm', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIPn', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIPo', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPs', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPt', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPv', 'type': 'U'}
@@ -135,21 +158,27 @@
{'is_defined': False, 'name': '__ZTId', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIe', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIf', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIg', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIj', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIl', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIm', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIn', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIo', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIs', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIt', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIv', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIw', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIx', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIy', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSDh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSDi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSDn', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSDs', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSDu', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSN10__cxxabiv116__enum_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSN10__cxxabiv116__shim_type_infoE', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSN10__cxxabiv117__array_type_infoE', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSN10__cxxabiv117__class_type_infoE', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSN10__cxxabiv117__pbase_type_infoE', 'type': 'U'}
@@ -159,23 +188,30 @@
{'is_defined': False, 'name': '__ZTSN10__cxxabiv121__vmi_class_type_infoE', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSN10__cxxabiv123__fundamental_type_infoE', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSN10__cxxabiv129__pointer_to_member_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSPDh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPDi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPDn', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPDs', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSPDu', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSPKDh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKDi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKDn', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKDs', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSPKDu', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKa', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKb', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKc', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKd', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKe', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKf', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSPKg', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKj', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKl', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKm', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSPKn', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSPKo', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKs', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKt', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKv', 'type': 'U'}
@@ -188,11 +224,14 @@
{'is_defined': False, 'name': '__ZTSPd', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPe', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPf', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSPg', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPj', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPl', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPm', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSPn', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSPo', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPs', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPt', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPv', 'type': 'U'}
@@ -221,11 +260,14 @@
{'is_defined': False, 'name': '__ZTSd', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSe', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSf', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSg', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSj', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSl', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSm', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSn', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSo', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSs', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSt', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSv', 'type': 'U'}
@@ -233,6 +275,7 @@
{'is_defined': False, 'name': '__ZTSx', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSy', 'type': 'U'}
{'is_defined': False, 'name': '__ZTVN10__cxxabiv116__enum_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTVN10__cxxabiv116__shim_type_infoE', 'type': 'U'}
{'is_defined': False, 'name': '__ZTVN10__cxxabiv117__array_type_infoE', 'type': 'U'}
{'is_defined': False, 'name': '__ZTVN10__cxxabiv117__class_type_infoE', 'type': 'U'}
{'is_defined': False, 'name': '__ZTVN10__cxxabiv117__pbase_type_infoE', 'type': 'U'}
@@ -301,12 +344,16 @@
{'is_defined': False, 'name': '___cxa_guard_release', 'type': 'U'}
{'is_defined': False, 'name': '___cxa_increment_exception_refcount', 'type': 'U'}
{'is_defined': False, 'name': '___cxa_init_primary_exception', 'type': 'U'}
+{'is_defined': False, 'name': '___cxa_new_handler', 'type': 'U'}
{'is_defined': False, 'name': '___cxa_pure_virtual', 'type': 'U'}
{'is_defined': False, 'name': '___cxa_rethrow', 'type': 'U'}
{'is_defined': False, 'name': '___cxa_rethrow_primary_exception', 'type': 'U'}
+{'is_defined': False, 'name': '___cxa_terminate_handler', 'type': 'U'}
{'is_defined': False, 'name': '___cxa_throw', 'type': 'U'}
{'is_defined': False, 'name': '___cxa_throw_bad_array_new_length', 'type': 'U'}
+{'is_defined': False, 'name': '___cxa_uncaught_exception', 'type': 'U'}
{'is_defined': False, 'name': '___cxa_uncaught_exceptions', 'type': 'U'}
+{'is_defined': False, 'name': '___cxa_unexpected_handler', 'type': 'U'}
{'is_defined': False, 'name': '___cxa_vec_cctor', 'type': 'U'}
{'is_defined': False, 'name': '___cxa_vec_cleanup', 'type': 'U'}
{'is_defined': False, 'name': '___cxa_vec_ctor', 'type': 'U'}
@@ -1945,9 +1992,22 @@
{'is_defined': True, 'name': '__ZTCNSt3__19strstreamE0_NS_13basic_istreamIcNS_11char_traitsIcEEEE', 'size': 0, 'type': 'OBJECT'}
{'is_defined': True, 'name': '__ZTCNSt3__19strstreamE0_NS_14basic_iostreamIcNS_11char_traitsIcEEEE', 'size': 0, 'type': 'OBJECT'}
{'is_defined': True, 'name': '__ZTCNSt3__19strstreamE16_NS_13basic_ostreamIcNS_11char_traitsIcEEEE', 'size': 0, 'type': 'OBJECT'}
+{'is_defined': True, 'name': '__ZTIDh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIDi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIDn', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIDs', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIDu', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIN10__cxxabiv116__enum_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIN10__cxxabiv116__shim_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIN10__cxxabiv117__array_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIN10__cxxabiv117__class_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIN10__cxxabiv117__pbase_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIN10__cxxabiv119__pointer_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIN10__cxxabiv120__function_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIN10__cxxabiv120__si_class_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIN10__cxxabiv121__vmi_class_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIN10__cxxabiv123__fundamental_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIN10__cxxabiv129__pointer_to_member_type_infoE', 'type': 'I'}
{'is_defined': True, 'name': '__ZTINSt12experimental15fundamentals_v112bad_any_castE', 'size': 0, 'type': 'OBJECT'}
{'is_defined': True, 'name': '__ZTINSt12experimental19bad_optional_accessE', 'size': 0, 'type': 'OBJECT'}
{'is_defined': True, 'name': '__ZTINSt3__110istrstreamE', 'size': 0, 'type': 'OBJECT'}
@@ -2055,23 +2115,30 @@
{'is_defined': True, 'name': '__ZTINSt3__19money_putIcNS_19ostreambuf_iteratorIcNS_11char_traitsIcEEEEEE', 'size': 0, 'type': 'OBJECT'}
{'is_defined': True, 'name': '__ZTINSt3__19money_putIwNS_19ostreambuf_iteratorIwNS_11char_traitsIwEEEEEE', 'size': 0, 'type': 'OBJECT'}
{'is_defined': True, 'name': '__ZTINSt3__19strstreamE', 'size': 0, 'type': 'OBJECT'}
+{'is_defined': True, 'name': '__ZTIPDh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPDi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPDn', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPDs', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIPDu', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIPKDh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKDi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKDn', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKDs', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIPKDu', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKa', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKb', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKc', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKd', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKe', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKf', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIPKg', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKj', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKl', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKm', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIPKn', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIPKo', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKs', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKt', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKv', 'type': 'I'}
@@ -2084,11 +2151,14 @@
{'is_defined': True, 'name': '__ZTIPd', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPe', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPf', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIPg', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPj', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPl', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPm', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIPn', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIPo', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPs', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPt', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPv', 'type': 'I'}
@@ -2121,21 +2191,27 @@
{'is_defined': True, 'name': '__ZTId', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIe', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIf', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIg', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIj', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIl', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIm', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIn', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIo', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIs', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIt', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIv', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIw', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIx', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIy', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSDh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSDi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSDn', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSDs', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSDu', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSN10__cxxabiv116__enum_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSN10__cxxabiv116__shim_type_infoE', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSN10__cxxabiv117__array_type_infoE', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSN10__cxxabiv117__class_type_infoE', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSN10__cxxabiv117__pbase_type_infoE', 'type': 'I'}
@@ -2227,23 +2303,30 @@
{'is_defined': True, 'name': '__ZTSNSt3__19money_putIcNS_19ostreambuf_iteratorIcNS_11char_traitsIcEEEEEE', 'size': 0, 'type': 'OBJECT'}
{'is_defined': True, 'name': '__ZTSNSt3__19money_putIwNS_19ostreambuf_iteratorIwNS_11char_traitsIwEEEEEE', 'size': 0, 'type': 'OBJECT'}
{'is_defined': True, 'name': '__ZTSNSt3__19strstreamE', 'size': 0, 'type': 'OBJECT'}
+{'is_defined': True, 'name': '__ZTSPDh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPDi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPDn', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPDs', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSPDu', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSPKDh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKDi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKDn', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKDs', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSPKDu', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKa', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKb', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKc', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKd', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKe', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKf', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSPKg', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKj', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKl', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKm', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSPKn', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSPKo', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKs', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKt', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKv', 'type': 'I'}
@@ -2256,11 +2339,14 @@
{'is_defined': True, 'name': '__ZTSPd', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPe', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPf', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSPg', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPj', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPl', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPm', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSPn', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSPo', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPs', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPt', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPv', 'type': 'I'}
@@ -2293,11 +2379,14 @@
{'is_defined': True, 'name': '__ZTSd', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSe', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSf', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSg', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSj', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSl', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSm', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSn', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSo', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSs', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSt', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSv', 'type': 'I'}
@@ -2318,6 +2407,7 @@
{'is_defined': True, 'name': '__ZTTNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'}
{'is_defined': True, 'name': '__ZTTNSt3__19strstreamE', 'size': 0, 'type': 'OBJECT'}
{'is_defined': True, 'name': '__ZTVN10__cxxabiv116__enum_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTVN10__cxxabiv116__shim_type_infoE', 'type': 'I'}
{'is_defined': True, 'name': '__ZTVN10__cxxabiv117__array_type_infoE', 'type': 'I'}
{'is_defined': True, 'name': '__ZTVN10__cxxabiv117__class_type_infoE', 'type': 'I'}
{'is_defined': True, 'name': '__ZTVN10__cxxabiv117__pbase_type_infoE', 'type': 'I'}
@@ -2501,6 +2591,8 @@
{'is_defined': True, 'name': '___cxa_begin_catch', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_call_unexpected', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_current_exception_type', 'type': 'I'}
+{'is_defined': True, 'name': '___cxa_current_primary_exception', 'type': 'I'}
+{'is_defined': True, 'name': '___cxa_decrement_exception_refcount', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_deleted_virtual', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_demangle', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_end_catch', 'type': 'I'}
@@ -2512,12 +2604,18 @@
{'is_defined': True, 'name': '___cxa_guard_abort', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_guard_acquire', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_guard_release', 'type': 'I'}
+{'is_defined': True, 'name': '___cxa_increment_exception_refcount', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_init_primary_exception', 'type': 'I'}
+{'is_defined': True, 'name': '___cxa_new_handler', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_pure_virtual', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_rethrow', 'type': 'I'}
+{'is_defined': True, 'name': '___cxa_rethrow_primary_exception', 'type': 'I'}
+{'is_defined': True, 'name': '___cxa_terminate_handler', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_throw', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_throw_bad_array_new_length', 'type': 'I'}
+{'is_defined': True, 'name': '___cxa_uncaught_exception', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_uncaught_exceptions', 'type': 'I'}
+{'is_defined': True, 'name': '___cxa_unexpected_handler', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_vec_cctor', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_vec_cleanup', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_vec_ctor', 'type': 'I'}
diff --git a/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist
index bced6b2ea81b..c169b4a99252 100644
--- a/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist
+++ b/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist
@@ -70,26 +70,46 @@
{'is_defined': False, 'name': '__ZSt15get_new_handlerv', 'type': 'U'}
{'is_defined': False, 'name': '__ZSt15set_new_handlerPFvvE', 'type': 'U'}
{'is_defined': False, 'name': '__ZSt9terminatev', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIDh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIDi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIDn', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIDs', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIDu', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIN10__cxxabiv116__enum_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIN10__cxxabiv116__shim_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIN10__cxxabiv117__array_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIN10__cxxabiv117__class_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIN10__cxxabiv117__pbase_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIN10__cxxabiv119__pointer_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIN10__cxxabiv120__function_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIN10__cxxabiv120__si_class_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIN10__cxxabiv121__vmi_class_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIN10__cxxabiv123__fundamental_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIN10__cxxabiv129__pointer_to_member_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIPDh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPDi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPDn', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPDs', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIPDu', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIPKDh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKDi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKDn', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKDs', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIPKDu', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKa', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKb', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKc', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKd', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKe', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKf', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIPKg', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKj', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKl', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKm', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIPKn', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIPKo', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKs', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKt', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPKv', 'type': 'U'}
@@ -102,11 +122,14 @@
{'is_defined': False, 'name': '__ZTIPd', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPe', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPf', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIPg', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPj', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPl', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPm', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIPn', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIPo', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPs', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPt', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIPv', 'type': 'U'}
@@ -135,21 +158,27 @@
{'is_defined': False, 'name': '__ZTId', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIe', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIf', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIg', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIj', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIl', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIm', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIn', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTIo', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIs', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIt', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIv', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIw', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIx', 'type': 'U'}
{'is_defined': False, 'name': '__ZTIy', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSDh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSDi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSDn', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSDs', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSDu', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSN10__cxxabiv116__enum_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSN10__cxxabiv116__shim_type_infoE', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSN10__cxxabiv117__array_type_infoE', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSN10__cxxabiv117__class_type_infoE', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSN10__cxxabiv117__pbase_type_infoE', 'type': 'U'}
@@ -159,23 +188,30 @@
{'is_defined': False, 'name': '__ZTSN10__cxxabiv121__vmi_class_type_infoE', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSN10__cxxabiv123__fundamental_type_infoE', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSN10__cxxabiv129__pointer_to_member_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSPDh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPDi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPDn', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPDs', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSPDu', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSPKDh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKDi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKDn', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKDs', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSPKDu', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKa', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKb', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKc', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKd', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKe', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKf', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSPKg', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKj', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKl', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKm', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSPKn', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSPKo', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKs', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKt', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPKv', 'type': 'U'}
@@ -188,11 +224,14 @@
{'is_defined': False, 'name': '__ZTSPd', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPe', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPf', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSPg', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPj', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPl', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPm', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSPn', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSPo', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPs', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPt', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSPv', 'type': 'U'}
@@ -221,11 +260,14 @@
{'is_defined': False, 'name': '__ZTSd', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSe', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSf', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSg', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSh', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSi', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSj', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSl', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSm', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSn', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTSo', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSs', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSt', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSv', 'type': 'U'}
@@ -233,6 +275,7 @@
{'is_defined': False, 'name': '__ZTSx', 'type': 'U'}
{'is_defined': False, 'name': '__ZTSy', 'type': 'U'}
{'is_defined': False, 'name': '__ZTVN10__cxxabiv116__enum_type_infoE', 'type': 'U'}
+{'is_defined': False, 'name': '__ZTVN10__cxxabiv116__shim_type_infoE', 'type': 'U'}
{'is_defined': False, 'name': '__ZTVN10__cxxabiv117__array_type_infoE', 'type': 'U'}
{'is_defined': False, 'name': '__ZTVN10__cxxabiv117__class_type_infoE', 'type': 'U'}
{'is_defined': False, 'name': '__ZTVN10__cxxabiv117__pbase_type_infoE', 'type': 'U'}
@@ -301,12 +344,16 @@
{'is_defined': False, 'name': '___cxa_guard_release', 'type': 'U'}
{'is_defined': False, 'name': '___cxa_increment_exception_refcount', 'type': 'U'}
{'is_defined': False, 'name': '___cxa_init_primary_exception', 'type': 'U'}
+{'is_defined': False, 'name': '___cxa_new_handler', 'type': 'U'}
{'is_defined': False, 'name': '___cxa_pure_virtual', 'type': 'U'}
{'is_defined': False, 'name': '___cxa_rethrow', 'type': 'U'}
{'is_defined': False, 'name': '___cxa_rethrow_primary_exception', 'type': 'U'}
+{'is_defined': False, 'name': '___cxa_terminate_handler', 'type': 'U'}
{'is_defined': False, 'name': '___cxa_throw', 'type': 'U'}
{'is_defined': False, 'name': '___cxa_throw_bad_array_new_length', 'type': 'U'}
+{'is_defined': False, 'name': '___cxa_uncaught_exception', 'type': 'U'}
{'is_defined': False, 'name': '___cxa_uncaught_exceptions', 'type': 'U'}
+{'is_defined': False, 'name': '___cxa_unexpected_handler', 'type': 'U'}
{'is_defined': False, 'name': '___cxa_vec_cctor', 'type': 'U'}
{'is_defined': False, 'name': '___cxa_vec_cleanup', 'type': 'U'}
{'is_defined': False, 'name': '___cxa_vec_ctor', 'type': 'U'}
@@ -1945,9 +1992,22 @@
{'is_defined': True, 'name': '__ZTCNSt3__19strstreamE0_NS_13basic_istreamIcNS_11char_traitsIcEEEE', 'size': 0, 'type': 'OBJECT'}
{'is_defined': True, 'name': '__ZTCNSt3__19strstreamE0_NS_14basic_iostreamIcNS_11char_traitsIcEEEE', 'size': 0, 'type': 'OBJECT'}
{'is_defined': True, 'name': '__ZTCNSt3__19strstreamE16_NS_13basic_ostreamIcNS_11char_traitsIcEEEE', 'size': 0, 'type': 'OBJECT'}
+{'is_defined': True, 'name': '__ZTIDh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIDi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIDn', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIDs', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIDu', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIN10__cxxabiv116__enum_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIN10__cxxabiv116__shim_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIN10__cxxabiv117__array_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIN10__cxxabiv117__class_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIN10__cxxabiv117__pbase_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIN10__cxxabiv119__pointer_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIN10__cxxabiv120__function_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIN10__cxxabiv120__si_class_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIN10__cxxabiv121__vmi_class_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIN10__cxxabiv123__fundamental_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIN10__cxxabiv129__pointer_to_member_type_infoE', 'type': 'I'}
{'is_defined': True, 'name': '__ZTINSt12experimental15fundamentals_v112bad_any_castE', 'size': 0, 'type': 'OBJECT'}
{'is_defined': True, 'name': '__ZTINSt12experimental19bad_optional_accessE', 'size': 0, 'type': 'OBJECT'}
{'is_defined': True, 'name': '__ZTINSt3__110__time_getE', 'size': 0, 'type': 'OBJECT'}
@@ -2078,23 +2138,30 @@
{'is_defined': True, 'name': '__ZTINSt3__19money_putIwNS_19ostreambuf_iteratorIwNS_11char_traitsIwEEEEEE', 'size': 0, 'type': 'OBJECT'}
{'is_defined': True, 'name': '__ZTINSt3__19strstreamE', 'size': 0, 'type': 'OBJECT'}
{'is_defined': True, 'name': '__ZTINSt3__19time_baseE', 'size': 0, 'type': 'OBJECT'}
+{'is_defined': True, 'name': '__ZTIPDh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPDi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPDn', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPDs', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIPDu', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIPKDh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKDi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKDn', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKDs', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIPKDu', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKa', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKb', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKc', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKd', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKe', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKf', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIPKg', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKj', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKl', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKm', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIPKn', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIPKo', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKs', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKt', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPKv', 'type': 'I'}
@@ -2107,11 +2174,14 @@
{'is_defined': True, 'name': '__ZTIPd', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPe', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPf', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIPg', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPj', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPl', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPm', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIPn', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIPo', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPs', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPt', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIPv', 'type': 'I'}
@@ -2144,21 +2214,27 @@
{'is_defined': True, 'name': '__ZTId', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIe', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIf', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIg', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIj', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIl', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIm', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIn', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTIo', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIs', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIt', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIv', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIw', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIx', 'type': 'I'}
{'is_defined': True, 'name': '__ZTIy', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSDh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSDi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSDn', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSDs', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSDu', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSN10__cxxabiv116__enum_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSN10__cxxabiv116__shim_type_infoE', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSN10__cxxabiv117__array_type_infoE', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSN10__cxxabiv117__class_type_infoE', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSN10__cxxabiv117__pbase_type_infoE', 'type': 'I'}
@@ -2261,23 +2337,30 @@
{'is_defined': True, 'name': '__ZTSNSt3__19money_putIwNS_19ostreambuf_iteratorIwNS_11char_traitsIwEEEEEE', 'size': 0, 'type': 'OBJECT'}
{'is_defined': True, 'name': '__ZTSNSt3__19strstreamE', 'size': 0, 'type': 'OBJECT'}
{'is_defined': True, 'name': '__ZTSNSt3__19time_baseE', 'size': 0, 'type': 'OBJECT'}
+{'is_defined': True, 'name': '__ZTSPDh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPDi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPDn', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPDs', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSPDu', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSPKDh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKDi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKDn', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKDs', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSPKDu', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKa', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKb', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKc', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKd', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKe', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKf', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSPKg', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKj', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKl', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKm', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSPKn', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSPKo', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKs', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKt', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPKv', 'type': 'I'}
@@ -2290,11 +2373,14 @@
{'is_defined': True, 'name': '__ZTSPd', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPe', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPf', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSPg', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPj', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPl', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPm', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSPn', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSPo', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPs', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPt', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSPv', 'type': 'I'}
@@ -2327,11 +2413,14 @@
{'is_defined': True, 'name': '__ZTSd', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSe', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSf', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSg', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSh', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSi', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSj', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSl', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSm', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSn', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTSo', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSs', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSt', 'type': 'I'}
{'is_defined': True, 'name': '__ZTSv', 'type': 'I'}
@@ -2352,6 +2441,7 @@
{'is_defined': True, 'name': '__ZTTNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'}
{'is_defined': True, 'name': '__ZTTNSt3__19strstreamE', 'size': 0, 'type': 'OBJECT'}
{'is_defined': True, 'name': '__ZTVN10__cxxabiv116__enum_type_infoE', 'type': 'I'}
+{'is_defined': True, 'name': '__ZTVN10__cxxabiv116__shim_type_infoE', 'type': 'I'}
{'is_defined': True, 'name': '__ZTVN10__cxxabiv117__array_type_infoE', 'type': 'I'}
{'is_defined': True, 'name': '__ZTVN10__cxxabiv117__class_type_infoE', 'type': 'I'}
{'is_defined': True, 'name': '__ZTVN10__cxxabiv117__pbase_type_infoE', 'type': 'I'}
@@ -2535,6 +2625,8 @@
{'is_defined': True, 'name': '___cxa_begin_catch', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_call_unexpected', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_current_exception_type', 'type': 'I'}
+{'is_defined': True, 'name': '___cxa_current_primary_exception', 'type': 'I'}
+{'is_defined': True, 'name': '___cxa_decrement_exception_refcount', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_deleted_virtual', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_demangle', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_end_catch', 'type': 'I'}
@@ -2546,12 +2638,18 @@
{'is_defined': True, 'name': '___cxa_guard_abort', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_guard_acquire', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_guard_release', 'type': 'I'}
+{'is_defined': True, 'name': '___cxa_increment_exception_refcount', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_init_primary_exception', 'type': 'I'}
+{'is_defined': True, 'name': '___cxa_new_handler', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_pure_virtual', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_rethrow', 'type': 'I'}
+{'is_defined': True, 'name': '___cxa_rethrow_primary_exception', 'type': 'I'}
+{'is_defined': True, 'name': '___cxa_terminate_handler', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_throw', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_throw_bad_array_new_length', 'type': 'I'}
+{'is_defined': True, 'name': '___cxa_uncaught_exception', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_uncaught_exceptions', 'type': 'I'}
+{'is_defined': True, 'name': '___cxa_unexpected_handler', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_vec_cctor', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_vec_cleanup', 'type': 'I'}
{'is_defined': True, 'name': '___cxa_vec_ctor', 'type': 'I'}
diff --git a/libcxx/modules/modules.json.in b/libcxx/modules/modules.json.in
index ddc377f28f91..759ac92d81f1 100644
--- a/libcxx/modules/modules.json.in
+++ b/libcxx/modules/modules.json.in
@@ -5,7 +5,7 @@
{
"logical-name": "std",
"source-path": "@LIBCXX_MODULE_RELATIVE_PATH@/std.cppm",
- "is-standard-library": true,
+ "is-std-library": true,
"local-arguments": {
"system-include-directories": [
"@LIBCXX_MODULE_RELATIVE_PATH@"
diff --git a/libcxx/modules/std/atomic.inc b/libcxx/modules/std/atomic.inc
index 88b31ccdb208..2b54cef863e5 100644
--- a/libcxx/modules/std/atomic.inc
+++ b/libcxx/modules/std/atomic.inc
@@ -111,8 +111,10 @@ export namespace std {
using std::atomic_uintmax_t;
using std::atomic_uintptr_t;
+#ifndef _LIBCPP_NO_LOCK_FREE_TYPES
using std::atomic_signed_lock_free;
using std::atomic_unsigned_lock_free;
+#endif
// [atomics.flag], flag type and operations
using std::atomic_flag;
diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt
index cc6954a7bac3..07ffc8bfdaae 100644
--- a/libcxx/src/CMakeLists.txt
+++ b/libcxx/src/CMakeLists.txt
@@ -231,19 +231,14 @@ if (LIBCXX_ENABLE_SHARED)
# In particular, we don't re-export the symbols if libc++abi is merged statically
# into libc++ because in that case there's no dylib to re-export from.
if (APPLE AND LIBCXX_CXX_ABI MATCHES "libcxxabi$"
- AND NOT DEFINED LIBCXX_OSX_REEXPORT_LIBCXXABI_SYMBOLS
AND NOT LIBCXX_STATICALLY_LINK_ABI_IN_SHARED_LIBRARY)
- set(LIBCXX_OSX_REEXPORT_LIBCXXABI_SYMBOLS ON)
- endif()
+ target_link_libraries(cxx_shared PRIVATE cxxabi-reexports)
- if (LIBCXX_OSX_REEXPORT_LIBCXXABI_SYMBOLS)
+ # TODO: These exports controls should not be tied to whether we re-export libc++abi symbols
target_link_libraries(cxx_shared PRIVATE
"-Wl,-unexported_symbols_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/libc++unexp.exp"
- "-Wl,-reexported_symbols_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/libc++abi.exp"
"-Wl,-force_symbols_not_weak_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/notweak.exp"
"-Wl,-force_symbols_weak_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/weak.exp")
-
- target_link_libraries(cxx_shared PRIVATE $<TARGET_NAME_IF_EXISTS:cxxabi-reexports>)
endif()
# Generate a linker script in place of a libc++.so symlink.
diff --git a/libcxx/test/CMakeLists.txt b/libcxx/test/CMakeLists.txt
index 52620fc55fee..e0d3a0dbc400 100644
--- a/libcxx/test/CMakeLists.txt
+++ b/libcxx/test/CMakeLists.txt
@@ -10,11 +10,6 @@ endif()
set(AUTO_GEN_COMMENT "## Autogenerated by libcxx configuration.\n# Do not edit!")
set(SERIALIZED_LIT_PARAMS "# Lit parameters serialized here for llvm-lit to pick them up\n")
-if (LIBCXX_EXECUTOR)
- message(DEPRECATION "LIBCXX_EXECUTOR is deprecated, please add executor=... to LIBCXX_TEST_PARAMS")
- serialize_lit_string_param(SERIALIZED_LIT_PARAMS executor "${LIBCXX_EXECUTOR}")
-endif()
-
if (NOT LIBCXX_ENABLE_EXCEPTIONS)
serialize_lit_param(SERIALIZED_LIT_PARAMS enable_exceptions False)
endif()
diff --git a/libcxx/test/std/utilities/allocator.adaptor/base-is-uglified.compile.pass.cpp b/libcxx/test/std/utilities/allocator.adaptor/base-is-uglified.compile.pass.cpp
new file mode 100644
index 000000000000..2581ac079dc5
--- /dev/null
+++ b/libcxx/test/std/utilities/allocator.adaptor/base-is-uglified.compile.pass.cpp
@@ -0,0 +1,27 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03
+
+// <memory>
+
+// This test ensures that we don't use a non-uglified name 'base' in the
+// implementation of scoped_allocator_adaptor.
+//
+// See https://github.com/llvm/llvm-project/issues/78754.
+
+#include <memory>
+#include <scoped_allocator>
+
+using ScopedAlloc = std::scoped_allocator_adaptor<std::allocator<int>, std::allocator<int>>;
+struct MyBase {
+ using base = MyBase;
+};
+struct MyDerived : ScopedAlloc, MyBase {};
+
+using T = MyDerived::base; // Should be well-formed
diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot
index ddfe53d2d8bd..2905745355b6 100755
--- a/libcxx/utils/ci/run-buildbot
+++ b/libcxx/utils/ci/run-buildbot
@@ -758,7 +758,7 @@ android-ndk-*)
# level. When tests are run against a device with a newer API level, test
# programs can be built for any supported API level, but building for the
# newest API (i.e. the system image's API) is probably the most interesting.
- PARAMS="target_triple=$(triple_of_arch ${ARCH})$(api_of_emu_img ${ANDROID_EMU_IMG})"
+ PARAMS="executor=${MONOREPO_ROOT}/libcxx/utils/adb_run.py;target_triple=$(triple_of_arch ${ARCH})$(api_of_emu_img ${ANDROID_EMU_IMG})"
generate-cmake-android -C "${MONOREPO_ROOT}/runtimes/cmake/android/Arch-${ARCH}.cmake" \
-C "${MONOREPO_ROOT}/libcxx/cmake/caches/AndroidNDK.cmake" \
-DCMAKE_SYSROOT=/opt/android/ndk/sysroot \
diff --git a/libcxxabi/lib/cxxabiv1.exp b/libcxxabi/lib/cxxabiv1.exp
new file mode 100644
index 000000000000..b1bab45ef334
--- /dev/null
+++ b/libcxxabi/lib/cxxabiv1.exp
@@ -0,0 +1,38 @@
+# Typeinfos for types from libc++abi
+__ZTIN10__cxxabiv116__enum_type_infoE
+__ZTIN10__cxxabiv116__shim_type_infoE
+__ZTIN10__cxxabiv117__array_type_infoE
+__ZTIN10__cxxabiv117__class_type_infoE
+__ZTIN10__cxxabiv117__pbase_type_infoE
+__ZTIN10__cxxabiv119__pointer_type_infoE
+__ZTIN10__cxxabiv120__function_type_infoE
+__ZTIN10__cxxabiv120__si_class_type_infoE
+__ZTIN10__cxxabiv121__vmi_class_type_infoE
+__ZTIN10__cxxabiv123__fundamental_type_infoE
+__ZTIN10__cxxabiv129__pointer_to_member_type_infoE
+
+# Typeinfo names for types from libc++abi
+__ZTSN10__cxxabiv116__enum_type_infoE
+__ZTSN10__cxxabiv116__shim_type_infoE
+__ZTSN10__cxxabiv117__array_type_infoE
+__ZTSN10__cxxabiv117__class_type_infoE
+__ZTSN10__cxxabiv117__pbase_type_infoE
+__ZTSN10__cxxabiv119__pointer_type_infoE
+__ZTSN10__cxxabiv120__function_type_infoE
+__ZTSN10__cxxabiv120__si_class_type_infoE
+__ZTSN10__cxxabiv121__vmi_class_type_infoE
+__ZTSN10__cxxabiv123__fundamental_type_infoE
+__ZTSN10__cxxabiv129__pointer_to_member_type_infoE
+
+# Vtables for libc++abi types
+__ZTVN10__cxxabiv116__enum_type_infoE
+__ZTVN10__cxxabiv116__shim_type_infoE
+__ZTVN10__cxxabiv117__array_type_infoE
+__ZTVN10__cxxabiv117__class_type_infoE
+__ZTVN10__cxxabiv117__pbase_type_infoE
+__ZTVN10__cxxabiv119__pointer_type_infoE
+__ZTVN10__cxxabiv120__function_type_infoE
+__ZTVN10__cxxabiv120__si_class_type_infoE
+__ZTVN10__cxxabiv121__vmi_class_type_infoE
+__ZTVN10__cxxabiv123__fundamental_type_infoE
+__ZTVN10__cxxabiv129__pointer_to_member_type_infoE
diff --git a/libcxxabi/lib/fundamental-types.exp b/libcxxabi/lib/fundamental-types.exp
new file mode 100644
index 000000000000..fe21022328fe
--- /dev/null
+++ b/libcxxabi/lib/fundamental-types.exp
@@ -0,0 +1,153 @@
+# Typeinfos for fundamental types
+__ZTIa
+__ZTIb
+__ZTIc
+__ZTId
+__ZTIDh
+__ZTIDi
+__ZTIDn
+__ZTIDs
+__ZTIDu
+__ZTIe
+__ZTIf
+__ZTIg
+__ZTIh
+__ZTIi
+__ZTIj
+__ZTIl
+__ZTIm
+__ZTIn
+__ZTIo
+__ZTIPa
+__ZTIPb
+__ZTIPc
+__ZTIPd
+__ZTIPDh
+__ZTIPDi
+__ZTIPDn
+__ZTIPDs
+__ZTIPDu
+__ZTIPe
+__ZTIPf
+__ZTIPg
+__ZTIPh
+__ZTIPi
+__ZTIPj
+__ZTIPKa
+__ZTIPKb
+__ZTIPKc
+__ZTIPKd
+__ZTIPKDh
+__ZTIPKDi
+__ZTIPKDn
+__ZTIPKDs
+__ZTIPKDu
+__ZTIPKe
+__ZTIPKf
+__ZTIPKg
+__ZTIPKh
+__ZTIPKi
+__ZTIPKj
+__ZTIPKl
+__ZTIPKm
+__ZTIPKn
+__ZTIPKo
+__ZTIPKs
+__ZTIPKt
+__ZTIPKv
+__ZTIPKw
+__ZTIPKx
+__ZTIPKy
+__ZTIPl
+__ZTIPm
+__ZTIPn
+__ZTIPo
+__ZTIPs
+__ZTIPt
+__ZTIPv
+__ZTIPw
+__ZTIPx
+__ZTIPy
+__ZTIs
+__ZTIt
+__ZTIv
+__ZTIw
+__ZTIx
+__ZTIy
+
+# Typeinfo names for fundamental types
+__ZTSa
+__ZTSb
+__ZTSc
+__ZTSd
+__ZTSDh
+__ZTSDi
+__ZTSDn
+__ZTSDs
+__ZTSDu
+__ZTSe
+__ZTSf
+__ZTSg
+__ZTSh
+__ZTSi
+__ZTSj
+__ZTSl
+__ZTSm
+__ZTSn
+__ZTSo
+__ZTSPa
+__ZTSPb
+__ZTSPc
+__ZTSPd
+__ZTSPDh
+__ZTSPDi
+__ZTSPDn
+__ZTSPDs
+__ZTSPDu
+__ZTSPe
+__ZTSPf
+__ZTSPg
+__ZTSPh
+__ZTSPi
+__ZTSPj
+__ZTSPKa
+__ZTSPKb
+__ZTSPKc
+__ZTSPKd
+__ZTSPKDh
+__ZTSPKDi
+__ZTSPKDn
+__ZTSPKDs
+__ZTSPKDu
+__ZTSPKe
+__ZTSPKf
+__ZTSPKg
+__ZTSPKh
+__ZTSPKi
+__ZTSPKj
+__ZTSPKl
+__ZTSPKm
+__ZTSPKn
+__ZTSPKo
+__ZTSPKs
+__ZTSPKt
+__ZTSPKv
+__ZTSPKw
+__ZTSPKx
+__ZTSPKy
+__ZTSPl
+__ZTSPm
+__ZTSPn
+__ZTSPo
+__ZTSPs
+__ZTSPt
+__ZTSPv
+__ZTSPw
+__ZTSPx
+__ZTSPy
+__ZTSs
+__ZTSt
+__ZTSv
+__ZTSw
+__ZTSx
+__ZTSy
diff --git a/libcxxabi/lib/itanium-base.exp b/libcxxabi/lib/itanium-base.exp
index 1b14cb8fba17..002e062df423 100644
--- a/libcxxabi/lib/itanium-base.exp
+++ b/libcxxabi/lib/itanium-base.exp
@@ -1,250 +1,3 @@
-# Typeinfos for fundamental types
-__ZTIa
-__ZTIb
-__ZTIc
-__ZTId
-__ZTIDh
-__ZTIDi
-__ZTIDn
-__ZTIDs
-__ZTIDu
-__ZTIe
-__ZTIf
-__ZTIg
-__ZTIh
-__ZTIi
-__ZTIj
-__ZTIl
-__ZTIm
-__ZTIn
-__ZTIo
-__ZTIPa
-__ZTIPb
-__ZTIPc
-__ZTIPd
-__ZTIPDh
-__ZTIPDi
-__ZTIPDn
-__ZTIPDs
-__ZTIPDu
-__ZTIPe
-__ZTIPf
-__ZTIPg
-__ZTIPh
-__ZTIPi
-__ZTIPj
-__ZTIPKa
-__ZTIPKb
-__ZTIPKc
-__ZTIPKd
-__ZTIPKDh
-__ZTIPKDi
-__ZTIPKDn
-__ZTIPKDs
-__ZTIPKDu
-__ZTIPKe
-__ZTIPKf
-__ZTIPKg
-__ZTIPKh
-__ZTIPKi
-__ZTIPKj
-__ZTIPKl
-__ZTIPKm
-__ZTIPKn
-__ZTIPKo
-__ZTIPKs
-__ZTIPKt
-__ZTIPKv
-__ZTIPKw
-__ZTIPKx
-__ZTIPKy
-__ZTIPl
-__ZTIPm
-__ZTIPn
-__ZTIPo
-__ZTIPs
-__ZTIPt
-__ZTIPv
-__ZTIPw
-__ZTIPx
-__ZTIPy
-__ZTIs
-__ZTIt
-__ZTIv
-__ZTIw
-__ZTIx
-__ZTIy
-
-# Typeinfo names for fundamental types
-__ZTSa
-__ZTSb
-__ZTSc
-__ZTSd
-__ZTSDh
-__ZTSDi
-__ZTSDn
-__ZTSDs
-__ZTSDu
-__ZTSe
-__ZTSf
-__ZTSg
-__ZTSh
-__ZTSi
-__ZTSj
-__ZTSl
-__ZTSm
-__ZTSn
-__ZTSo
-__ZTSPa
-__ZTSPb
-__ZTSPc
-__ZTSPd
-__ZTSPDh
-__ZTSPDi
-__ZTSPDn
-__ZTSPDs
-__ZTSPDu
-__ZTSPe
-__ZTSPf
-__ZTSPg
-__ZTSPh
-__ZTSPi
-__ZTSPj
-__ZTSPKa
-__ZTSPKb
-__ZTSPKc
-__ZTSPKd
-__ZTSPKDh
-__ZTSPKDi
-__ZTSPKDn
-__ZTSPKDs
-__ZTSPKDu
-__ZTSPKe
-__ZTSPKf
-__ZTSPKg
-__ZTSPKh
-__ZTSPKi
-__ZTSPKj
-__ZTSPKl
-__ZTSPKm
-__ZTSPKn
-__ZTSPKo
-__ZTSPKs
-__ZTSPKt
-__ZTSPKv
-__ZTSPKw
-__ZTSPKx
-__ZTSPKy
-__ZTSPl
-__ZTSPm
-__ZTSPn
-__ZTSPo
-__ZTSPs
-__ZTSPt
-__ZTSPv
-__ZTSPw
-__ZTSPx
-__ZTSPy
-__ZTSs
-__ZTSt
-__ZTSv
-__ZTSw
-__ZTSx
-__ZTSy
-
-# Typeinfos for types from libc++abi
-__ZTIN10__cxxabiv116__enum_type_infoE
-__ZTIN10__cxxabiv116__shim_type_infoE
-__ZTIN10__cxxabiv117__array_type_infoE
-__ZTIN10__cxxabiv117__class_type_infoE
-__ZTIN10__cxxabiv117__pbase_type_infoE
-__ZTIN10__cxxabiv119__pointer_type_infoE
-__ZTIN10__cxxabiv120__function_type_infoE
-__ZTIN10__cxxabiv120__si_class_type_infoE
-__ZTIN10__cxxabiv121__vmi_class_type_infoE
-__ZTIN10__cxxabiv123__fundamental_type_infoE
-__ZTIN10__cxxabiv129__pointer_to_member_type_infoE
-
-# Typeinfo names for types from libc++abi
-__ZTSN10__cxxabiv116__enum_type_infoE
-__ZTSN10__cxxabiv116__shim_type_infoE
-__ZTSN10__cxxabiv117__array_type_infoE
-__ZTSN10__cxxabiv117__class_type_infoE
-__ZTSN10__cxxabiv117__pbase_type_infoE
-__ZTSN10__cxxabiv119__pointer_type_infoE
-__ZTSN10__cxxabiv120__function_type_infoE
-__ZTSN10__cxxabiv120__si_class_type_infoE
-__ZTSN10__cxxabiv121__vmi_class_type_infoE
-__ZTSN10__cxxabiv123__fundamental_type_infoE
-__ZTSN10__cxxabiv129__pointer_to_member_type_infoE
-
-# Typeinfos for std:: exception types
-__ZTISt10bad_typeid
-__ZTISt11logic_error
-__ZTISt11range_error
-__ZTISt12domain_error
-__ZTISt12length_error
-__ZTISt12out_of_range
-__ZTISt13bad_exception
-__ZTISt13runtime_error
-__ZTISt14overflow_error
-__ZTISt15underflow_error
-__ZTISt16invalid_argument
-__ZTISt20bad_array_new_length
-__ZTISt8bad_cast
-__ZTISt9bad_alloc
-__ZTISt9exception
-__ZTISt9type_info
-
-# Typeinfo names for std:: exception types
-__ZTSSt10bad_typeid
-__ZTSSt11logic_error
-__ZTSSt11range_error
-__ZTSSt12domain_error
-__ZTSSt12length_error
-__ZTSSt12out_of_range
-__ZTSSt13bad_exception
-__ZTSSt13runtime_error
-__ZTSSt14overflow_error
-__ZTSSt15underflow_error
-__ZTSSt16invalid_argument
-__ZTSSt20bad_array_new_length
-__ZTSSt8bad_cast
-__ZTSSt9bad_alloc
-__ZTSSt9exception
-__ZTSSt9type_info
-
-# Vtables for libc++abi types
-__ZTVN10__cxxabiv116__enum_type_infoE
-__ZTVN10__cxxabiv116__shim_type_infoE
-__ZTVN10__cxxabiv117__array_type_infoE
-__ZTVN10__cxxabiv117__class_type_infoE
-__ZTVN10__cxxabiv117__pbase_type_infoE
-__ZTVN10__cxxabiv119__pointer_type_infoE
-__ZTVN10__cxxabiv120__function_type_infoE
-__ZTVN10__cxxabiv120__si_class_type_infoE
-__ZTVN10__cxxabiv121__vmi_class_type_infoE
-__ZTVN10__cxxabiv123__fundamental_type_infoE
-__ZTVN10__cxxabiv129__pointer_to_member_type_infoE
-
-# Vtables for std:: exception types
-__ZTVSt10bad_typeid
-__ZTVSt11logic_error
-__ZTVSt11range_error
-__ZTVSt12domain_error
-__ZTVSt12length_error
-__ZTVSt12out_of_range
-__ZTVSt13bad_exception
-__ZTVSt13runtime_error
-__ZTVSt14overflow_error
-__ZTVSt15underflow_error
-__ZTVSt16invalid_argument
-__ZTVSt20bad_array_new_length
-__ZTVSt8bad_cast
-__ZTVSt9bad_alloc
-__ZTVSt9exception
-__ZTVSt9type_info
-
# Itanium C++ ABI requirements (minus most exception support)
___cxa_bad_cast
___cxa_bad_typeid
@@ -277,81 +30,3 @@ ___dynamic_cast
___cxa_terminate_handler
___cxa_unexpected_handler
___cxa_new_handler
-
-# ::what() functions for std:: exception types
-__ZNKSt10bad_typeid4whatEv
-__ZNKSt11logic_error4whatEv
-__ZNKSt13bad_exception4whatEv
-__ZNKSt13runtime_error4whatEv
-__ZNKSt20bad_array_new_length4whatEv
-__ZNKSt8bad_cast4whatEv
-__ZNKSt9bad_alloc4whatEv
-__ZNKSt9exception4whatEv
-
-# Default constructors and destructors for std:: exception types
-__ZNSt10bad_typeidC1Ev
-__ZNSt10bad_typeidC2Ev
-__ZNSt10bad_typeidD0Ev
-__ZNSt10bad_typeidD1Ev
-__ZNSt10bad_typeidD2Ev
-__ZNSt11logic_errorD0Ev
-__ZNSt11logic_errorD1Ev
-__ZNSt11logic_errorD2Ev
-__ZNSt11range_errorD0Ev
-__ZNSt11range_errorD1Ev
-__ZNSt11range_errorD2Ev
-__ZNSt12domain_errorD0Ev
-__ZNSt12domain_errorD1Ev
-__ZNSt12domain_errorD2Ev
-__ZNSt12length_errorD0Ev
-__ZNSt12length_errorD1Ev
-__ZNSt12length_errorD2Ev
-__ZNSt12out_of_rangeD0Ev
-__ZNSt12out_of_rangeD1Ev
-__ZNSt12out_of_rangeD2Ev
-__ZNSt13bad_exceptionD0Ev
-__ZNSt13bad_exceptionD1Ev
-__ZNSt13bad_exceptionD2Ev
-__ZNSt13runtime_errorD0Ev
-__ZNSt13runtime_errorD1Ev
-__ZNSt13runtime_errorD2Ev
-__ZNSt14overflow_errorD0Ev
-__ZNSt14overflow_errorD1Ev
-__ZNSt14overflow_errorD2Ev
-__ZNSt15underflow_errorD0Ev
-__ZNSt15underflow_errorD1Ev
-__ZNSt15underflow_errorD2Ev
-__ZNSt16invalid_argumentD0Ev
-__ZNSt16invalid_argumentD1Ev
-__ZNSt16invalid_argumentD2Ev
-__ZNSt20bad_array_new_lengthC1Ev
-__ZNSt20bad_array_new_lengthC2Ev
-__ZNSt20bad_array_new_lengthD0Ev
-__ZNSt20bad_array_new_lengthD1Ev
-__ZNSt20bad_array_new_lengthD2Ev
-__ZNSt8bad_castC1Ev
-__ZNSt8bad_castC2Ev
-__ZNSt8bad_castD0Ev
-__ZNSt8bad_castD1Ev
-__ZNSt8bad_castD2Ev
-__ZNSt9bad_allocC1Ev
-__ZNSt9bad_allocC2Ev
-__ZNSt9bad_allocD0Ev
-__ZNSt9bad_allocD1Ev
-__ZNSt9bad_allocD2Ev
-__ZNSt9exceptionD0Ev
-__ZNSt9exceptionD1Ev
-__ZNSt9exceptionD2Ev
-__ZNSt9type_infoD0Ev
-__ZNSt9type_infoD1Ev
-__ZNSt9type_infoD2Ev
-
-# Other std:: functions implemented in libc++abi
-__ZSt10unexpectedv
-__ZSt13get_terminatev
-__ZSt13set_terminatePFvvE
-__ZSt14get_unexpectedv
-__ZSt14set_unexpectedPFvvE
-__ZSt15get_new_handlerv
-__ZSt15set_new_handlerPFvvE
-__ZSt9terminatev
diff --git a/libcxxabi/lib/exceptions.exp b/libcxxabi/lib/itanium-exceptions.exp
index 9dcfbdbd3598..21b19edb92df 100644
--- a/libcxxabi/lib/exceptions.exp
+++ b/libcxxabi/lib/itanium-exceptions.exp
@@ -1,3 +1,4 @@
+# Itanium C++ ABI requirements related to exceptions
___cxa_allocate_dependent_exception
___cxa_allocate_exception
___cxa_begin_catch
diff --git a/libcxxabi/lib/new-delete.exp b/libcxxabi/lib/new-delete.exp
index 086d2fec24ea..4f97de7e8bb6 100644
--- a/libcxxabi/lib/new-delete.exp
+++ b/libcxxabi/lib/new-delete.exp
@@ -1,3 +1,4 @@
+# Symbols for all variants of global operator new and delete
__Znwm
__ZnwmRKSt9nothrow_t
__ZnwmSt11align_val_t
diff --git a/libcxx/lib/libc++abi.exp b/libcxxabi/lib/std-exceptions.exp
index 6a3e6b9f0222..bb4a2b2e93ea 100644
--- a/libcxx/lib/libc++abi.exp
+++ b/libcxxabi/lib/std-exceptions.exp
@@ -1,295 +1,121 @@
-___cxa_demangle
-___cxa_get_globals
-___cxa_get_globals_fast
-___cxa_guard_abort
-___cxa_guard_acquire
-___cxa_guard_release
-___cxa_pure_virtual
-___cxa_deleted_virtual
-___cxa_throw_bad_array_new_length
-___cxa_uncaught_exceptions
-___cxa_vec_cctor
-___cxa_vec_cleanup
-___cxa_vec_ctor
-___cxa_vec_delete
-___cxa_vec_delete2
-___cxa_vec_delete3
-___cxa_vec_dtor
-___cxa_vec_new
-___cxa_vec_new2
-___cxa_vec_new3
-___dynamic_cast
-__ZTIDi
-__ZTIDn
-__ZTIDs
-__ZTIPDi
-__ZTIPDn
-__ZTIPDs
-__ZTIPKDi
-__ZTIPKDn
-__ZTIPKDs
-__ZTSPm
-__ZTSPl
-__ZTSPj
-__ZTSPi
-__ZTSPh
-__ZTSPf
-__ZTSPe
-__ZTSPd
-__ZTSPc
-__ZTSPb
-__ZTSPa
-__ZTSPKc
-__ZTSPKy
-__ZTSPKx
-__ZTSPKw
-__ZTSPKv
-__ZTSPKt
-__ZTSPKs
-__ZTSPKm
-__ZTSPKl
-__ZTSPKi
-__ZTSPKh
-__ZTSPs
-__ZTSPt
-__ZTSPv
-__ZTSPw
-__ZTSPKa
-__ZTSPx
-__ZTSPy
-__ZTSPKd
-__ZTSPKe
-__ZTSPKj
-__ZTSPKb
-__ZTSPKf
-__ZTSv
-__ZTSt
-__ZTSs
-__ZTSm
-__ZTSl
-__ZTSj
-__ZTSi
-__ZTSh
-__ZTSf
-__ZTSe
-__ZTSd
-__ZTSc
-__ZTSw
-__ZTSx
-__ZTSy
-__ZTSb
-__ZTSa
-__ZTIPKh
-__ZTIPKf
-__ZTIPKe
-__ZTIPKd
-__ZTIPKc
-__ZTIPKb
-__ZTIPKa
-__ZTIPy
-__ZTIPx
-__ZTIPw
-__ZTIPv
-__ZTIPt
-__ZTIPs
-__ZTIPm
-__ZTIPl
-__ZTIPj
-__ZTIPi
-__ZTIPKi
-__ZTIPKj
-__ZTIPKl
-__ZTIPKm
-__ZTIPKs
-__ZTIPKt
-__ZTIPKv
-__ZTIPKw
-__ZTIPKx
-__ZTIPKy
-__ZTIPa
-__ZTIPb
-__ZTIPc
-__ZTIPd
-__ZTIPe
-__ZTIPf
-__ZTIPh
-__ZTVN10__cxxabiv129__pointer_to_member_type_infoE
-__ZTVN10__cxxabiv116__enum_type_infoE
-__ZTVN10__cxxabiv117__array_type_infoE
-__ZTVN10__cxxabiv117__class_type_infoE
-__ZTVN10__cxxabiv117__pbase_type_infoE
-__ZTVN10__cxxabiv119__pointer_type_infoE
-__ZTVN10__cxxabiv120__function_type_infoE
-__ZTVN10__cxxabiv120__si_class_type_infoE
-__ZTVN10__cxxabiv121__vmi_class_type_infoE
-__ZTVN10__cxxabiv123__fundamental_type_infoE
-__ZTIa
-__ZTIb
-__ZTIc
-__ZTId
-__ZTIe
-__ZTIf
-__ZTIh
-__ZTIi
-__ZTIj
-__ZTIl
-__ZTIm
-__ZTIs
-__ZTIt
-__ZTSN10__cxxabiv129__pointer_to_member_type_infoE
-__ZTSN10__cxxabiv123__fundamental_type_infoE
-__ZTSN10__cxxabiv121__vmi_class_type_infoE
-__ZTSN10__cxxabiv120__si_class_type_infoE
-__ZTSN10__cxxabiv120__function_type_infoE
-__ZTSN10__cxxabiv119__pointer_type_infoE
-__ZTSN10__cxxabiv117__pbase_type_infoE
-__ZTSN10__cxxabiv117__class_type_infoE
-__ZTSN10__cxxabiv117__array_type_infoE
-__ZTSN10__cxxabiv116__enum_type_infoE
-__ZTIy
-__ZTIx
-__ZTIw
-__ZTIv
-__ZSt13get_terminatev
-__ZSt13set_terminatePFvvE
-__ZSt14get_unexpectedv
-__ZSt14set_unexpectedPFvvE
-__ZSt15get_new_handlerv
-__ZSt15set_new_handlerPFvvE
-__ZSt9terminatev
-__ZNSt9bad_allocD1Ev
-__ZTISt9bad_alloc
-__ZNSt9bad_allocC1Ev
+# Typeinfos for std:: exception types
+__ZTISt10bad_typeid
+__ZTISt11logic_error
+__ZTISt11range_error
+__ZTISt12domain_error
+__ZTISt12length_error
+__ZTISt12out_of_range
__ZTISt13bad_exception
+__ZTISt13runtime_error
+__ZTISt14overflow_error
+__ZTISt15underflow_error
+__ZTISt16invalid_argument
+__ZTISt20bad_array_new_length
+__ZTISt8bad_cast
+__ZTISt9bad_alloc
+__ZTISt9exception
+__ZTISt9type_info
+
+# Typeinfo names for std:: exception types
+__ZTSSt10bad_typeid
+__ZTSSt11logic_error
+__ZTSSt11range_error
+__ZTSSt12domain_error
+__ZTSSt12length_error
+__ZTSSt12out_of_range
+__ZTSSt13bad_exception
+__ZTSSt13runtime_error
+__ZTSSt14overflow_error
+__ZTSSt15underflow_error
+__ZTSSt16invalid_argument
+__ZTSSt20bad_array_new_length
+__ZTSSt8bad_cast
+__ZTSSt9bad_alloc
+__ZTSSt9exception
+__ZTSSt9type_info
+
+# Vtables for std:: exception types
__ZTVSt10bad_typeid
-__ZTVSt9exception
-__ZNSt10bad_typeidC1Ev
-__ZNSt10bad_typeidC1Ev
-__ZNKSt10bad_typeid4whatEv
-__ZNSt10bad_typeidD1Ev
+__ZTVSt11logic_error
+__ZTVSt11range_error
+__ZTVSt12domain_error
+__ZTVSt12length_error
+__ZTVSt12out_of_range
+__ZTVSt13bad_exception
+__ZTVSt13runtime_error
+__ZTVSt14overflow_error
+__ZTVSt15underflow_error
+__ZTVSt16invalid_argument
+__ZTVSt20bad_array_new_length
__ZTVSt8bad_cast
-__ZNSt8bad_castC1Ev
-__ZNSt8bad_castC2Ev
-__ZNSt8bad_castD0Ev
-__ZNKSt8bad_cast4whatEv
-__ZNSt8bad_castD1Ev
-__ZNSt8bad_castD2Ev
__ZTVSt9bad_alloc
-__ZTVSt20bad_array_new_length
-__ZTVSt13bad_exception
-__ZNKSt9exception4whatEv
-__ZNKSt9bad_alloc4whatEv
-__ZNSt9bad_allocC2Ev
-__ZNSt9bad_allocD0Ev
-__ZNSt9bad_allocD2Ev
-__ZNSt9exceptionD0Ev
-__ZNSt20bad_array_new_lengthC1Ev
+__ZTVSt9exception
+__ZTVSt9type_info
+
+# ::what() functions for std:: exception types
+__ZNKSt10bad_typeid4whatEv
+__ZNKSt11logic_error4whatEv
__ZNKSt13bad_exception4whatEv
-__ZNSt9exceptionD1Ev
+__ZNKSt13runtime_error4whatEv
__ZNKSt20bad_array_new_length4whatEv
-__ZNSt13bad_exceptionD1Ev
-__ZNSt20bad_array_new_lengthD1Ev
-__ZNSt9exceptionD2Ev
-__ZNSt9type_infoD0Ev
-__ZNSt9type_infoD1Ev
-__ZNSt9type_infoD2Ev
+__ZNKSt8bad_cast4whatEv
+__ZNKSt9bad_alloc4whatEv
+__ZNKSt9exception4whatEv
+
+# Default constructors and destructors for std:: exception types
+__ZNSt10bad_typeidC1Ev
__ZNSt10bad_typeidC2Ev
__ZNSt10bad_typeidD0Ev
+__ZNSt10bad_typeidD1Ev
__ZNSt10bad_typeidD2Ev
-__ZNSt13bad_exceptionD0Ev
-__ZNSt13bad_exceptionD2Ev
-__ZNSt20bad_array_new_lengthC2Ev
-__ZNSt20bad_array_new_lengthD0Ev
-__ZNSt20bad_array_new_lengthD2Ev
-__ZSt10unexpectedv
-__ZTISt10bad_typeid
-__ZTISt8bad_cast
-___cxa_bad_typeid
-___cxa_bad_cast
-__ZTISt9exception
-__ZTISt9type_info
-__ZTISt20bad_array_new_length
-
-__ZNKSt11logic_error4whatEv
__ZNSt11logic_errorD0Ev
__ZNSt11logic_errorD1Ev
__ZNSt11logic_errorD2Ev
-__ZTISt11logic_error
-__ZTSSt11logic_error
-__ZTVSt11logic_error
-
-__ZNKSt13runtime_error4whatEv
-__ZNSt13runtime_errorD0Ev
-__ZNSt13runtime_errorD1Ev
-__ZNSt13runtime_errorD2Ev
-__ZTISt13runtime_error
-__ZTSSt13runtime_error
-__ZTVSt13runtime_error
-
__ZNSt11range_errorD0Ev
__ZNSt11range_errorD1Ev
__ZNSt11range_errorD2Ev
-__ZTISt11range_error
-__ZTSSt11range_error
-__ZTVSt11range_error
-
__ZNSt12domain_errorD0Ev
__ZNSt12domain_errorD1Ev
__ZNSt12domain_errorD2Ev
-__ZTISt12domain_error
-__ZTSSt12domain_error
-__ZTVSt12domain_error
-
__ZNSt12length_errorD0Ev
__ZNSt12length_errorD1Ev
__ZNSt12length_errorD2Ev
-__ZTISt12length_error
-__ZTSSt12length_error
-__ZTVSt12length_error
-
__ZNSt12out_of_rangeD0Ev
__ZNSt12out_of_rangeD1Ev
__ZNSt12out_of_rangeD2Ev
-__ZTISt12out_of_range
-__ZTSSt12out_of_range
-__ZTVSt12out_of_range
-
+__ZNSt13bad_exceptionD0Ev
+__ZNSt13bad_exceptionD1Ev
+__ZNSt13bad_exceptionD2Ev
+__ZNSt13runtime_errorD0Ev
+__ZNSt13runtime_errorD1Ev
+__ZNSt13runtime_errorD2Ev
__ZNSt14overflow_errorD0Ev
__ZNSt14overflow_errorD1Ev
__ZNSt14overflow_errorD2Ev
-__ZTISt14overflow_error
-__ZTSSt14overflow_error
-__ZTVSt14overflow_error
-
__ZNSt15underflow_errorD0Ev
__ZNSt15underflow_errorD1Ev
__ZNSt15underflow_errorD2Ev
-__ZTISt15underflow_error
-__ZTSSt15underflow_error
-__ZTVSt15underflow_error
-
__ZNSt16invalid_argumentD0Ev
__ZNSt16invalid_argumentD1Ev
__ZNSt16invalid_argumentD2Ev
-__ZTISt16invalid_argument
-__ZTSSt16invalid_argument
-__ZTVSt16invalid_argument
-
-__ZTSDi
-__ZTSDn
-__ZTSDs
-__ZTSPDi
-__ZTSPDn
-__ZTSPDs
-__ZTSPKDi
-__ZTSPKDn
-__ZTSPKDs
-
-__ZTSSt8bad_cast
-__ZTSSt9bad_alloc
-__ZTSSt9exception
-__ZTSSt9type_info
-__ZTSSt10bad_typeid
-__ZTSSt13bad_exception
-__ZTSSt20bad_array_new_length
-__ZTVSt9type_info
+__ZNSt20bad_array_new_lengthC1Ev
+__ZNSt20bad_array_new_lengthC2Ev
+__ZNSt20bad_array_new_lengthD0Ev
+__ZNSt20bad_array_new_lengthD1Ev
+__ZNSt20bad_array_new_lengthD2Ev
+__ZNSt8bad_castC1Ev
+__ZNSt8bad_castC2Ev
+__ZNSt8bad_castD0Ev
+__ZNSt8bad_castD1Ev
+__ZNSt8bad_castD2Ev
+__ZNSt9bad_allocC1Ev
+__ZNSt9bad_allocC2Ev
+__ZNSt9bad_allocD0Ev
+__ZNSt9bad_allocD1Ev
+__ZNSt9bad_allocD2Ev
+__ZNSt9exceptionD0Ev
+__ZNSt9exceptionD1Ev
+__ZNSt9exceptionD2Ev
+__ZNSt9type_infoD0Ev
+__ZNSt9type_infoD1Ev
+__ZNSt9type_infoD2Ev
diff --git a/libcxxabi/lib/std-misc.exp b/libcxxabi/lib/std-misc.exp
new file mode 100644
index 000000000000..0ce6a2944398
--- /dev/null
+++ b/libcxxabi/lib/std-misc.exp
@@ -0,0 +1,9 @@
+# Other std:: functions implemented in libc++abi
+__ZSt10unexpectedv
+__ZSt13get_terminatev
+__ZSt13set_terminatePFvvE
+__ZSt14get_unexpectedv
+__ZSt14set_unexpectedPFvvE
+__ZSt15get_new_handlerv
+__ZSt15set_new_handlerPFvvE
+__ZSt9terminatev
diff --git a/libcxxabi/src/CMakeLists.txt b/libcxxabi/src/CMakeLists.txt
index 4198827203fc..0f17ea9184c8 100644
--- a/libcxxabi/src/CMakeLists.txt
+++ b/libcxxabi/src/CMakeLists.txt
@@ -213,31 +213,31 @@ if (LIBCXXABI_ENABLE_SHARED)
endif()
add_library(cxxabi-reexports INTERFACE)
-
- # -exported_symbols_list is only available on Apple platforms
- if (APPLE)
- function(export_symbols file)
+ function(reexport_symbols file)
+ # -exported_symbols_list is only available on Apple platforms
+ if (APPLE)
target_link_libraries(cxxabi_shared PRIVATE "-Wl,-exported_symbols_list,${file}")
- endfunction()
- function(reexport_symbols file)
- export_symbols("${file}")
target_link_libraries(cxxabi-reexports INTERFACE "-Wl,-reexported_symbols_list,${file}")
- endfunction()
+ endif()
+ endfunction()
- export_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/itanium-base.exp")
+ reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/cxxabiv1.exp")
+ reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/fundamental-types.exp")
+ reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/itanium-base.exp")
+ reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/std-misc.exp")
- if (LIBCXXABI_ENABLE_NEW_DELETE_DEFINITIONS)
- reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/new-delete.exp")
- endif()
+ if (LIBCXXABI_ENABLE_NEW_DELETE_DEFINITIONS)
+ reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/new-delete.exp")
+ endif()
- if (LIBCXXABI_ENABLE_EXCEPTIONS)
- reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/exceptions.exp")
+ if (LIBCXXABI_ENABLE_EXCEPTIONS)
+ reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/itanium-exceptions.exp")
+ reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/std-exceptions.exp")
- if ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "^(armv6|armv7|armv7s)$")
- reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/personality-sjlj.exp")
- else()
- reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/personality-v0.exp")
- endif()
+ if ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "^(armv6|armv7|armv7s)$")
+ reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/personality-sjlj.exp")
+ else()
+ reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/personality-v0.exp")
endif()
endif()
endif()
diff --git a/libcxxabi/src/demangle/ItaniumDemangle.h b/libcxxabi/src/demangle/ItaniumDemangle.h
index 04755e2be3c5..4a0444d407ea 100644
--- a/libcxxabi/src/demangle/ItaniumDemangle.h
+++ b/libcxxabi/src/demangle/ItaniumDemangle.h
@@ -5541,7 +5541,7 @@ Node *AbstractManglingParser<Alloc, Derived>::parseFloatingLiteral() {
return nullptr;
std::string_view Data(First, N);
for (char C : Data)
- if (!std::isxdigit(C))
+ if (!(C >= '0' && C <= '9') && !(C >= 'a' && C <= 'f'))
return nullptr;
First += N;
if (!consumeIf('E'))
diff --git a/libcxxabi/test/CMakeLists.txt b/libcxxabi/test/CMakeLists.txt
index 9f95c736d63f..586927189cf1 100644
--- a/libcxxabi/test/CMakeLists.txt
+++ b/libcxxabi/test/CMakeLists.txt
@@ -24,11 +24,6 @@ endif()
set(AUTO_GEN_COMMENT "## Autogenerated by libcxxabi configuration.\n# Do not edit!")
set(SERIALIZED_LIT_PARAMS "# Lit parameters serialized here for llvm-lit to pick them up\n")
-if (LIBCXXABI_EXECUTOR)
- message(DEPRECATION "LIBCXXABI_EXECUTOR is deprecated, please add executor=... to LIBCXXABI_TEST_PARAMS")
- serialize_lit_string_param(SERIALIZED_LIT_PARAMS executor "${LIBCXXABI_EXECUTOR}")
-endif()
-
if (NOT LIBCXXABI_ENABLE_EXCEPTIONS)
serialize_lit_param(SERIALIZED_LIT_PARAMS enable_exceptions False)
endif()
diff --git a/libcxxabi/test/test_demangle.pass.cpp b/libcxxabi/test/test_demangle.pass.cpp
index b7e41099ebfc..88637b84de01 100644
--- a/libcxxabi/test/test_demangle.pass.cpp
+++ b/libcxxabi/test/test_demangle.pass.cpp
@@ -30222,9 +30222,8 @@ struct FPLiteralCase {
}},
#endif
#if LDBL_FP128
- // This was found by libFuzzer+HWASan on aarch64 Android.
- {"1\006ILeeeEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE",
- {"\x6<-0x1.cecececececececececececececep+11983L>"}},
+ // A 32-character FP literal of long double type
+ {"3FooILeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeEE", {"Foo<-0x1.eeeeeeeeeeeeeeeeeeeeeeeeeeeep+12015L>"}},
#endif
};
const unsigned NF = sizeof(fp_literal_cases) / sizeof(fp_literal_cases[0]);
@@ -30238,6 +30237,8 @@ const char* invalid_cases[] =
"NSoERj5E=Y1[uM:ga",
"Aon_PmKVPDk7?fg4XP5smMUL6;<WsI_mgbf23cCgsHbT<l8EE\0uVRkNOoXDrgdA4[8IU>Vl<>IL8ayHpiVDDDXTY;^o9;i",
"_ZNSt16allocator_traitsISaIN4llvm3sys2fs18directory_iteratorEEE9constructIS3_IS3_EEEDTcl12_S_constructfp_fp0_spcl7forwardIT0_Efp1_EEERS4_PT_DpOS7_",
+ "3FooILdaaaaaaaaaaAAAAaaEE",
+ "3FooILdaaaaaaaaaaaaaaEE",
#if !LDBL_FP80
"_ZN5test01hIfEEvRAcvjplstT_Le4001a000000000000000E_c",
#endif
diff --git a/libcxxabi/test/uncaught_exception.pass.cpp b/libcxxabi/test/uncaught_exception.pass.cpp
index eb80fcc14f45..9087059aeba5 100644
--- a/libcxxabi/test/uncaught_exception.pass.cpp
+++ b/libcxxabi/test/uncaught_exception.pass.cpp
@@ -11,6 +11,13 @@
// This tests that libc++abi still provides __cxa_uncaught_exception() for
// ABI compatibility, even though the Standard doesn't require it to.
+// __cxa_uncaught_exception was not re-exported from libc++ previously. This leads
+// to undefined symbols when linking against a libc++ that re-exports the symbols,
+// but running against a libc++ that doesn't. Fortunately, usage of __cxa_uncaught_exception()
+// in the wild seems to be close to non-existent.
+// XFAIL: stdlib=apple-libc++ && target={{.+}}-apple-macosx10.{{9|10|11|12|13|14|15}}
+// XFAIL: stdlib=apple-libc++ && target={{.+}}-apple-macosx{{(11|12|13|14)([.][0-9]+)?}}
+
#include <cxxabi.h>
#include <cassert>
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 0bbf43ddf694..a9292b3b1a22 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -2057,8 +2057,9 @@ template <class ELFT> void Writer<ELFT>::finalizeSections() {
if (sym->dsoDefined)
continue;
if (sym->isUndefined() && !sym->isWeak()) {
- diagnose("undefined reference due to --no-allow-shlib-undefined: " +
- toString(*sym) + "\n>>> referenced by " + toString(file));
+ diagnose("undefined reference: " + toString(*sym) +
+ "\n>>> referenced by " + toString(file) +
+ " (disallowed by --no-allow-shlib-undefined)");
} else if (sym->isDefined() && sym->computeBinding() == STB_LOCAL) {
diagnose("non-exported symbol '" + toString(*sym) + "' in '" +
toString(sym->file) + "' is referenced by DSO '" +
diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index a57f60c5eed3..018ceec97f20 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -691,6 +691,8 @@ static PlatformVersion parsePlatformVersion(const Arg *arg) {
.Cases("tvos-simulator", "8", PLATFORM_TVOSSIMULATOR)
.Cases("watchos-simulator", "9", PLATFORM_WATCHOSSIMULATOR)
.Cases("driverkit", "10", PLATFORM_DRIVERKIT)
+ .Cases("xros", "11", PLATFORM_XROS)
+ .Cases("xros-simulator", "12", PLATFORM_XROS_SIMULATOR)
.Default(PLATFORM_UNKNOWN);
if (platformVersion.platform == PLATFORM_UNKNOWN)
error(Twine("malformed platform: ") + platformStr);
@@ -985,6 +987,8 @@ PlatformType macho::removeSimulator(PlatformType platform) {
return PLATFORM_TVOS;
case PLATFORM_WATCHOSSIMULATOR:
return PLATFORM_WATCHOS;
+ case PLATFORM_XROS_SIMULATOR:
+ return PLATFORM_XROS;
default:
return platform;
}
@@ -1001,15 +1005,17 @@ static bool shouldAdhocSignByDefault(Architecture arch, PlatformType platform) {
return platform == PLATFORM_MACOS || platform == PLATFORM_IOSSIMULATOR ||
platform == PLATFORM_TVOSSIMULATOR ||
- platform == PLATFORM_WATCHOSSIMULATOR;
+ platform == PLATFORM_WATCHOSSIMULATOR ||
+ platform == PLATFORM_XROS_SIMULATOR;
}
static bool dataConstDefault(const InputArgList &args) {
- static const std::array<std::pair<PlatformType, VersionTuple>, 5> minVersion =
+ static const std::array<std::pair<PlatformType, VersionTuple>, 6> minVersion =
{{{PLATFORM_MACOS, VersionTuple(10, 15)},
{PLATFORM_IOS, VersionTuple(13, 0)},
{PLATFORM_TVOS, VersionTuple(13, 0)},
{PLATFORM_WATCHOS, VersionTuple(6, 0)},
+ {PLATFORM_XROS, VersionTuple(1, 0)},
{PLATFORM_BRIDGEOS, VersionTuple(4, 0)}}};
PlatformType platform = removeSimulator(config->platformInfo.target.Platform);
auto it = llvm::find_if(minVersion,
@@ -1045,11 +1051,12 @@ static bool shouldEmitChainedFixups(const InputArgList &args) {
bool isRequested = arg != nullptr;
// Version numbers taken from the Xcode 13.3 release notes.
- static const std::array<std::pair<PlatformType, VersionTuple>, 4> minVersion =
+ static const std::array<std::pair<PlatformType, VersionTuple>, 5> minVersion =
{{{PLATFORM_MACOS, VersionTuple(11, 0)},
{PLATFORM_IOS, VersionTuple(13, 4)},
{PLATFORM_TVOS, VersionTuple(14, 0)},
- {PLATFORM_WATCHOS, VersionTuple(7, 0)}}};
+ {PLATFORM_WATCHOS, VersionTuple(7, 0)},
+ {PLATFORM_XROS, VersionTuple(1, 0)}}};
PlatformType platform = removeSimulator(config->platformInfo.target.Platform);
auto it = llvm::find_if(minVersion,
[&](const auto &p) { return p.first == platform; });
@@ -1688,8 +1695,8 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
if (args.getLastArg(OPT_reproducible))
config->zeroModTime = true;
- std::array<PlatformType, 3> encryptablePlatforms{
- PLATFORM_IOS, PLATFORM_WATCHOS, PLATFORM_TVOS};
+ std::array<PlatformType, 4> encryptablePlatforms{
+ PLATFORM_IOS, PLATFORM_WATCHOS, PLATFORM_TVOS, PLATFORM_XROS};
config->emitEncryptionInfo =
args.hasFlag(OPT_encryptable, OPT_no_encryption,
is_contained(encryptablePlatforms, config->platform()));
diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td
index 01e73b789f9a..a524e4a4c508 100644
--- a/lld/MachO/Options.td
+++ b/lld/MachO/Options.td
@@ -377,7 +377,7 @@ def grp_version : OptionGroup<"version">, HelpText<"VERSION TARGETING">;
def platform_version : MultiArg<["-"], "platform_version", 3>,
MetaVarName<"<platform> <min_version> <sdk_version>">,
- HelpText<"Platform (e.g., macos, ios, tvos, watchos, bridgeos, mac-catalyst, ios-sim, tvos-sim, watchos-sim, driverkit) and version numbers">,
+ HelpText<"Platform (e.g., macos, ios, tvos, watchos, xros, bridgeos, mac-catalyst, ios-sim, tvos-sim, watchos-sim, xros-sim, driverkit) and version numbers">,
Group<grp_version>;
def sdk_version : Separate<["-"], "sdk_version">,
HelpText<"This option is undocumented in ld64">,
diff --git a/lld/test/ELF/allow-shlib-undefined.s b/lld/test/ELF/allow-shlib-undefined.s
index 969e87b69eb8..4b7151c8bc0d 100644
--- a/lld/test/ELF/allow-shlib-undefined.s
+++ b/lld/test/ELF/allow-shlib-undefined.s
@@ -43,19 +43,19 @@
# RUN: ld.lld --gc-sections main.o a.so def.so def-hidden.o --fatal-warnings -o /dev/null
# CHECK-NOT: error:
-# CHECK: error: undefined reference due to --no-allow-shlib-undefined: x1{{$}}
-# CHECK-NEXT: >>> referenced by a.so{{$}}
+# CHECK: error: undefined reference: x1{{$}}
+# CHECK-NEXT: >>> referenced by a.so (disallowed by --no-allow-shlib-undefined){{$}}
# CHECK-NOT: {{.}}
# CHECK2-NOT: error:
-# CHECK2: error: undefined reference due to --no-allow-shlib-undefined: x1
-# CHECK2-NEXT: >>> referenced by a.so
-# CHECK2: error: undefined reference due to --no-allow-shlib-undefined: x1
-# CHECK2-NEXT: >>> referenced by b.so
+# CHECK2: error: undefined reference: x1
+# CHECK2-NEXT: >>> referenced by a.so (disallowed by --no-allow-shlib-undefined)
+# CHECK2: error: undefined reference: x1
+# CHECK2-NEXT: >>> referenced by b.so (disallowed by --no-allow-shlib-undefined)
# CHECK2-NOT: {{.}}
-# WARN: warning: undefined reference due to --no-allow-shlib-undefined: x1
-# WARN-NEXT: >>> referenced by a.so
+# WARN: warning: undefined reference: x1
+# WARN-NEXT: >>> referenced by a.so (disallowed by --no-allow-shlib-undefined)
# NONEXPORTED-NOT: error:
# NONEXPORTED: error: non-exported symbol 'x1' in 'def-hidden.o' is referenced by DSO 'a.so'
diff --git a/lld/test/ELF/unresolved-symbols.s b/lld/test/ELF/unresolved-symbols.s
index 68fe1e7065a8..91194d376ca8 100644
--- a/lld/test/ELF/unresolved-symbols.s
+++ b/lld/test/ELF/unresolved-symbols.s
@@ -36,7 +36,7 @@
## --unresolved-symbols overrides a previous --allow-shlib-undefined.
# RUN: not ld.lld %t1.o %t.so -o /dev/null --allow-shlib-undefined --unresolved-symbols=ignore-in-object-files 2>&1 | FileCheck %s --check-prefix=SHLIB
-# SHLIB: error: undefined reference due to --no-allow-shlib-undefined: undef
+# SHLIB: error: undefined reference: undef
## Ignoring undefines in shared should produce error for symbol from object.
# RUN: not ld.lld %t2.o -o /dev/null --unresolved-symbols=ignore-in-shared-libs 2>&1 | \
diff --git a/lld/test/ELF/wrap-shlib-undefined.s b/lld/test/ELF/wrap-shlib-undefined.s
index 96c4629cb497..0692ae30e889 100644
--- a/lld/test/ELF/wrap-shlib-undefined.s
+++ b/lld/test/ELF/wrap-shlib-undefined.s
@@ -17,7 +17,7 @@
## --no-allow-shlib-undefined errors because __real_foo is not defined.
# RUN: not ld.lld %t/main.o %t/bar.so -o /dev/null 2>&1 | FileCheck --check-prefix=ERR %s
-# ERR: error: undefined reference due to --no-allow-shlib-undefined: __real_foo
+# ERR: error: undefined reference: __real_foo
## --wrap=foo defines __real_foo.
# RUN: ld.lld %t/main.o %t/bar.so --wrap=foo -o %t2
diff --git a/lld/test/MachO/lc-build-version.s b/lld/test/MachO/lc-build-version.s
index 7b78f803428a..1fd7078919b1 100644
--- a/lld/test/MachO/lc-build-version.s
+++ b/lld/test/MachO/lc-build-version.s
@@ -64,6 +64,13 @@
# WATCHOS-4-0: cmd LC_VERSION_MIN_WATCHOS
+# RUN: %no-arg-lld -arch x86_64 -platform_version xros 1.0 1.1 -o %t.xros-1-0 %t.o
+# RUN: llvm-objdump --macho --all-headers %t.xros-1-0 | FileCheck %s --check-prefix=XROS-1-0
+# RUN: %no-arg-lld -arch x86_64 -platform_version xros-simulator 1.0 1.1 -o %t.xros-sim-1-0 %t.o
+# RUN: llvm-objdump --macho --all-headers %t.xros-sim-1-0 | FileCheck %s --check-prefix=XROS-1-0
+
+# XROS-1-0: cmd LC_BUILD_VERSION
+
.text
.global _main
_main:
diff --git a/lld/test/MachO/platform-version.s b/lld/test/MachO/platform-version.s
index 047aea02fcde..57fbae62b2ff 100644
--- a/lld/test/MachO/platform-version.s
+++ b/lld/test/MachO/platform-version.s
@@ -55,7 +55,7 @@
# RUN: -platform_version 0 1 5 \
# RUN: | FileCheck --check-prefix=FAIL-PLATFORM %s
# RUN: not %no-arg-lld -arch x86_64 -o %t %t.o 2>&1 \
-# RUN: -platform_version 11 1 5 \
+# RUN: -platform_version 13 1 5 \
# RUN: | FileCheck --check-prefix=FAIL-PLATFORM %s
# FAIL-PLATFORM: malformed platform: {{.*}}
# FAIL-PLATFORM-NOT: malformed {{minimum|sdk}} version: {{.*}}
diff --git a/lldb/bindings/interface/SBTargetExtensions.i b/lldb/bindings/interface/SBTargetExtensions.i
index c80dadfc0c5c..d756a351a810 100644
--- a/lldb/bindings/interface/SBTargetExtensions.i
+++ b/lldb/bindings/interface/SBTargetExtensions.i
@@ -172,7 +172,7 @@ STRING_EXTENSION_LEVEL_OUTSIDE(SBTarget, lldb::eDescriptionLevelBrief)
'''An accessor function that returns a list() that contains all watchpoints in a lldb.SBtarget object.'''
watchpoints = []
for idx in range(self.GetNumWatchpoints()):
- bkpts.append(self.GetWatchpointAtIndex(idx))
+ watchpoints.append(self.GetWatchpointAtIndex(idx))
return watchpoints
modules = property(get_modules_array, None, doc='''A read only property that returns a list() of lldb.SBModule objects contained in this target. This list is a list all modules that the target currently is tracking (the main executable and all dependent shared libraries).''')
diff --git a/lldb/include/lldb/Interpreter/CommandObject.h b/lldb/include/lldb/Interpreter/CommandObject.h
index a326c6dc38a3..a641a468b49d 100644
--- a/lldb/include/lldb/Interpreter/CommandObject.h
+++ b/lldb/include/lldb/Interpreter/CommandObject.h
@@ -207,6 +207,20 @@ public:
static const ArgumentTableEntry *
FindArgumentDataByType(lldb::CommandArgumentType arg_type);
+ // Sets the argument list for this command to one homogenous argument type,
+ // with the repeat specified.
+ void AddSimpleArgumentList(
+ lldb::CommandArgumentType arg_type,
+ ArgumentRepetitionType repetition_type = eArgRepeatPlain);
+
+ // Helper function to set BP IDs or ID ranges as the command argument data
+ // for this command.
+ // This used to just populate an entry you could add to, but that was never
+ // used. If we ever need that we can take optional extra args here.
+ // Use this to define a simple argument list:
+ enum IDType { eBreakpointArgs = 0, eWatchpointArgs = 1 };
+ void AddIDsArgumentData(IDType type);
+
int GetNumArgumentEntries();
CommandArgumentEntry *GetArgumentEntryAtIndex(int idx);
@@ -391,12 +405,6 @@ protected:
lldb_private::CommandOverrideCallbackWithResult m_command_override_callback;
void *m_command_override_baton;
bool m_is_user_command = false;
-
- // Helper function to populate IDs or ID ranges as the command argument data
- // to the specified command argument entry.
- static void AddIDsArgumentData(CommandArgumentEntry &arg,
- lldb::CommandArgumentType ID,
- lldb::CommandArgumentType IDRange);
};
class CommandObjectParsed : public CommandObject {
diff --git a/lldb/include/lldb/Interpreter/Options.h b/lldb/include/lldb/Interpreter/Options.h
index 18a87e49deee..9a6a17c2793f 100644
--- a/lldb/include/lldb/Interpreter/Options.h
+++ b/lldb/include/lldb/Interpreter/Options.h
@@ -368,6 +368,8 @@ static constexpr llvm::StringLiteral g_bool_parsing_error_message =
"Failed to parse as boolean";
static constexpr llvm::StringLiteral g_int_parsing_error_message =
"Failed to parse as integer";
+static constexpr llvm::StringLiteral g_language_parsing_error_message =
+ "Unknown language";
} // namespace lldb_private
diff --git a/lldb/include/lldb/Utility/ArchSpec.h b/lldb/include/lldb/Utility/ArchSpec.h
index a226a3a5a9b7..50830b889b91 100644
--- a/lldb/include/lldb/Utility/ArchSpec.h
+++ b/lldb/include/lldb/Utility/ArchSpec.h
@@ -505,11 +505,6 @@ public:
bool IsFullySpecifiedTriple() const;
- void PiecewiseTripleCompare(const ArchSpec &other, bool &arch_different,
- bool &vendor_different, bool &os_different,
- bool &os_version_different,
- bool &env_different) const;
-
/// Detect whether this architecture uses thumb code exclusively
///
/// Some embedded ARM chips (e.g. the ARM Cortex M0-7 line) can only execute
diff --git a/lldb/source/Commands/CommandObjectApropos.cpp b/lldb/source/Commands/CommandObjectApropos.cpp
index 88c214d4fc56..d663f2bd923f 100644
--- a/lldb/source/Commands/CommandObjectApropos.cpp
+++ b/lldb/source/Commands/CommandObjectApropos.cpp
@@ -21,19 +21,7 @@ CommandObjectApropos::CommandObjectApropos(CommandInterpreter &interpreter)
: CommandObjectParsed(
interpreter, "apropos",
"List debugger commands related to a word or subject.", nullptr) {
- CommandArgumentEntry arg;
- CommandArgumentData search_word_arg;
-
- // Define the first (and only) variant of this arg.
- search_word_arg.arg_type = eArgTypeSearchWord;
- search_word_arg.arg_repetition = eArgRepeatPlain;
-
- // There is only one variant this argument could be; put it into the argument
- // entry.
- arg.push_back(search_word_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeSearchWord);
}
CommandObjectApropos::~CommandObjectApropos() = default;
diff --git a/lldb/source/Commands/CommandObjectBreakpoint.cpp b/lldb/source/Commands/CommandObjectBreakpoint.cpp
index fc2217608a0b..fbece865f113 100644
--- a/lldb/source/Commands/CommandObjectBreakpoint.cpp
+++ b/lldb/source/Commands/CommandObjectBreakpoint.cpp
@@ -266,6 +266,8 @@ public:
Status error;
const int short_option =
g_breakpoint_set_options[option_idx].short_option;
+ const char *long_option =
+ g_breakpoint_set_options[option_idx].long_option;
switch (short_option) {
case 'a': {
@@ -284,13 +286,15 @@ public:
case 'u':
if (option_arg.getAsInteger(0, m_column))
- error.SetErrorStringWithFormat("invalid column number: %s",
- option_arg.str().c_str());
+ error =
+ CreateOptionParsingError(option_arg, short_option, long_option,
+ g_int_parsing_error_message);
break;
case 'E': {
LanguageType language = Language::GetLanguageTypeFromString(option_arg);
+ llvm::StringRef error_context;
switch (language) {
case eLanguageTypeC89:
case eLanguageTypeC:
@@ -308,19 +312,18 @@ public:
m_exception_language = eLanguageTypeObjC;
break;
case eLanguageTypeObjC_plus_plus:
- error.SetErrorStringWithFormat(
- "Set exception breakpoints separately for c++ and objective-c");
+ error_context =
+ "Set exception breakpoints separately for c++ and objective-c";
break;
case eLanguageTypeUnknown:
- error.SetErrorStringWithFormat(
- "Unknown language type: '%s' for exception breakpoint",
- option_arg.str().c_str());
+ error_context = "Unknown language type for exception breakpoint";
break;
default:
- error.SetErrorStringWithFormat(
- "Unsupported language type: '%s' for exception breakpoint",
- option_arg.str().c_str());
+ error_context = "Unsupported language type for exception breakpoint";
}
+ if (!error_context.empty())
+ error = CreateOptionParsingError(option_arg, short_option,
+ long_option, error_context);
} break;
case 'f':
@@ -336,9 +339,9 @@ public:
bool success;
m_catch_bp = OptionArgParser::ToBoolean(option_arg, true, &success);
if (!success)
- error.SetErrorStringWithFormat(
- "Invalid boolean value for on-catch option: '%s'",
- option_arg.str().c_str());
+ error =
+ CreateOptionParsingError(option_arg, short_option, long_option,
+ g_bool_parsing_error_message);
} break;
case 'H':
@@ -355,23 +358,24 @@ public:
m_skip_prologue = eLazyBoolNo;
if (!success)
- error.SetErrorStringWithFormat(
- "Invalid boolean value for skip prologue option: '%s'",
- option_arg.str().c_str());
+ error =
+ CreateOptionParsingError(option_arg, short_option, long_option,
+ g_bool_parsing_error_message);
} break;
case 'l':
if (option_arg.getAsInteger(0, m_line_num))
- error.SetErrorStringWithFormat("invalid line number: %s.",
- option_arg.str().c_str());
+ error =
+ CreateOptionParsingError(option_arg, short_option, long_option,
+ g_int_parsing_error_message);
break;
case 'L':
m_language = Language::GetLanguageTypeFromString(option_arg);
if (m_language == eLanguageTypeUnknown)
- error.SetErrorStringWithFormat(
- "Unknown language type: '%s' for breakpoint",
- option_arg.str().c_str());
+ error =
+ CreateOptionParsingError(option_arg, short_option, long_option,
+ g_language_parsing_error_message);
break;
case 'm': {
@@ -384,9 +388,9 @@ public:
m_move_to_nearest_code = eLazyBoolNo;
if (!success)
- error.SetErrorStringWithFormat(
- "Invalid boolean value for move-to-nearest-code option: '%s'",
- option_arg.str().c_str());
+ error =
+ CreateOptionParsingError(option_arg, short_option, long_option,
+ g_bool_parsing_error_message);
break;
}
@@ -404,8 +408,8 @@ public:
if (BreakpointID::StringIsBreakpointName(option_arg, error))
m_breakpoint_names.push_back(std::string(option_arg));
else
- error.SetErrorStringWithFormat("Invalid breakpoint name: %s",
- option_arg.str().c_str());
+ error = CreateOptionParsingError(
+ option_arg, short_option, long_option, "Invalid breakpoint name");
break;
}
@@ -443,9 +447,9 @@ public:
bool success;
m_throw_bp = OptionArgParser::ToBoolean(option_arg, true, &success);
if (!success)
- error.SetErrorStringWithFormat(
- "Invalid boolean value for on-throw option: '%s'",
- option_arg.str().c_str());
+ error =
+ CreateOptionParsingError(option_arg, short_option, long_option,
+ g_bool_parsing_error_message);
} break;
case 'X':
@@ -457,9 +461,8 @@ public:
OptionValueFileColonLine value;
Status fcl_err = value.SetValueFromString(option_arg);
if (!fcl_err.Success()) {
- error.SetErrorStringWithFormat(
- "Invalid value for file:line specifier: %s",
- fcl_err.AsCString());
+ error = CreateOptionParsingError(option_arg, short_option,
+ long_option, fcl_err.AsCString());
} else {
m_filenames.AppendIfUnique(value.GetFileSpec());
m_line_num = value.GetLineNumber();
@@ -810,12 +813,7 @@ public:
"With the exception of -e, -d and -i, passing an "
"empty argument clears the modification.",
nullptr) {
- CommandArgumentEntry arg;
- CommandObject::AddIDsArgumentData(arg, eArgTypeBreakpointID,
- eArgTypeBreakpointIDRange);
- // Add the entry for the first argument for this command to the object's
- // arguments vector.
- m_arguments.push_back(arg);
+ CommandObject::AddIDsArgumentData(eBreakpointArgs);
m_options.Append(&m_bp_opts,
LLDB_OPT_SET_1 | LLDB_OPT_SET_2 | LLDB_OPT_SET_3,
@@ -887,12 +885,7 @@ public:
"Enable the specified disabled breakpoint(s). If "
"no breakpoints are specified, enable all of them.",
nullptr) {
- CommandArgumentEntry arg;
- CommandObject::AddIDsArgumentData(arg, eArgTypeBreakpointID,
- eArgTypeBreakpointIDRange);
- // Add the entry for the first argument for this command to the object's
- // arguments vector.
- m_arguments.push_back(arg);
+ CommandObject::AddIDsArgumentData(eBreakpointArgs);
}
~CommandObjectBreakpointEnable() override = default;
@@ -999,12 +992,7 @@ execution will NOT stop at location 1.1. To achieve that, type:
"The first command disables all locations for breakpoint 1, \
the second re-enables the first location.");
- CommandArgumentEntry arg;
- CommandObject::AddIDsArgumentData(arg, eArgTypeBreakpointID,
- eArgTypeBreakpointIDRange);
- // Add the entry for the first argument for this command to the object's
- // arguments vector.
- m_arguments.push_back(arg);
+ CommandObject::AddIDsArgumentData(eBreakpointArgs);
}
~CommandObjectBreakpointDisable() override = default;
@@ -1095,15 +1083,7 @@ public:
CommandArgumentData bp_id_arg;
// Define the first (and only) variant of this arg.
- bp_id_arg.arg_type = eArgTypeBreakpointID;
- bp_id_arg.arg_repetition = eArgRepeatOptional;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(bp_id_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeBreakpointID, eArgRepeatOptional);
}
~CommandObjectBreakpointList() override = default;
@@ -1369,12 +1349,7 @@ public:
"Delete the specified breakpoint(s). If no "
"breakpoints are specified, delete them all.",
nullptr) {
- CommandArgumentEntry arg;
- CommandObject::AddIDsArgumentData(arg, eArgTypeBreakpointID,
- eArgTypeBreakpointIDRange);
- // Add the entry for the first argument for this command to the object's
- // arguments vector.
- m_arguments.push_back(arg);
+ CommandObject::AddIDsArgumentData(eBreakpointArgs);
}
~CommandObjectBreakpointDelete() override = default;
@@ -1557,6 +1532,7 @@ public:
ExecutionContext *execution_context) override {
Status error;
const int short_option = g_breakpoint_name_options[option_idx].short_option;
+ const char *long_option = g_breakpoint_name_options[option_idx].long_option;
switch (short_option) {
case 'N':
@@ -1566,15 +1542,13 @@ public:
break;
case 'B':
if (m_breakpoint.SetValueFromString(option_arg).Fail())
- error.SetErrorStringWithFormat(
- "unrecognized value \"%s\" for breakpoint",
- option_arg.str().c_str());
+ error = CreateOptionParsingError(option_arg, short_option, long_option,
+ g_int_parsing_error_message);
break;
case 'D':
if (m_use_dummy.SetValueFromString(option_arg).Fail())
- error.SetErrorStringWithFormat(
- "unrecognized value \"%s\" for use-dummy",
- option_arg.str().c_str());
+ error = CreateOptionParsingError(option_arg, short_option, long_option,
+ g_bool_parsing_error_message);
break;
case 'H':
m_help_string.SetValueFromString(option_arg);
@@ -1617,6 +1591,8 @@ public:
Status error;
const int short_option =
g_breakpoint_access_options[option_idx].short_option;
+ const char *long_option =
+ g_breakpoint_access_options[option_idx].long_option;
switch (short_option) {
case 'L': {
@@ -1625,9 +1601,8 @@ public:
if (success) {
m_permissions.SetAllowList(value);
} else
- error.SetErrorStringWithFormat(
- "invalid boolean value '%s' passed for -L option",
- option_arg.str().c_str());
+ error = CreateOptionParsingError(option_arg, short_option, long_option,
+ g_bool_parsing_error_message);
} break;
case 'A': {
bool value, success;
@@ -1635,9 +1610,8 @@ public:
if (success) {
m_permissions.SetAllowDisable(value);
} else
- error.SetErrorStringWithFormat(
- "invalid boolean value '%s' passed for -L option",
- option_arg.str().c_str());
+ error = CreateOptionParsingError(option_arg, short_option, long_option,
+ g_bool_parsing_error_message);
} break;
case 'D': {
bool value, success;
@@ -1645,9 +1619,8 @@ public:
if (success) {
m_permissions.SetAllowDelete(value);
} else
- error.SetErrorStringWithFormat(
- "invalid boolean value '%s' passed for -L option",
- option_arg.str().c_str());
+ error = CreateOptionParsingError(option_arg, short_option, long_option,
+ g_bool_parsing_error_message);
} break;
default:
llvm_unreachable("Unimplemented option");
@@ -1676,14 +1649,7 @@ public:
"on the name.",
"breakpoint name configure <command-options> "
"<breakpoint-name-list>") {
- // Create the first variant for the first (and only) argument for this
- // command.
- CommandArgumentEntry arg1;
- CommandArgumentData id_arg;
- id_arg.arg_type = eArgTypeBreakpointName;
- id_arg.arg_repetition = eArgRepeatOptional;
- arg1.push_back(id_arg);
- m_arguments.push_back(arg1);
+ AddSimpleArgumentList(eArgTypeBreakpointName, eArgRepeatOptional);
m_option_group.Append(&m_bp_opts, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1);
m_option_group.Append(&m_access_options, LLDB_OPT_SET_ALL,
@@ -1769,14 +1735,7 @@ public:
: CommandObjectParsed(
interpreter, "add", "Add a name to the breakpoints provided.",
"breakpoint name add <command-options> <breakpoint-id-list>") {
- // Create the first variant for the first (and only) argument for this
- // command.
- CommandArgumentEntry arg1;
- CommandArgumentData id_arg;
- id_arg.arg_type = eArgTypeBreakpointID;
- id_arg.arg_repetition = eArgRepeatOptional;
- arg1.push_back(id_arg);
- m_arguments.push_back(arg1);
+ AddSimpleArgumentList(eArgTypeBreakpointID, eArgRepeatOptional);
m_option_group.Append(&m_name_options, LLDB_OPT_SET_1, LLDB_OPT_SET_ALL);
m_option_group.Finalize();
@@ -1850,14 +1809,7 @@ public:
interpreter, "delete",
"Delete a name from the breakpoints provided.",
"breakpoint name delete <command-options> <breakpoint-id-list>") {
- // Create the first variant for the first (and only) argument for this
- // command.
- CommandArgumentEntry arg1;
- CommandArgumentData id_arg;
- id_arg.arg_type = eArgTypeBreakpointID;
- id_arg.arg_repetition = eArgRepeatOptional;
- arg1.push_back(id_arg);
- m_arguments.push_back(arg1);
+ AddSimpleArgumentList(eArgTypeBreakpointID, eArgRepeatOptional);
m_option_group.Append(&m_name_options, LLDB_OPT_SET_1, LLDB_OPT_SET_ALL);
m_option_group.Finalize();
@@ -2141,6 +2093,8 @@ public:
ExecutionContext *execution_context) override {
Status error;
const int short_option = m_getopt_table[option_idx].val;
+ const char *long_option =
+ m_getopt_table[option_idx].definition->long_option;
switch (short_option) {
case 'f':
@@ -2150,8 +2104,8 @@ public:
Status name_error;
if (!BreakpointID::StringIsBreakpointName(llvm::StringRef(option_arg),
name_error)) {
- error.SetErrorStringWithFormat("Invalid breakpoint name: %s",
- name_error.AsCString());
+ error = CreateOptionParsingError(option_arg, short_option,
+ long_option, name_error.AsCString());
}
m_names.push_back(std::string(option_arg));
break;
@@ -2305,12 +2259,7 @@ public:
"be read in with \"breakpoint read\". "
"If given no arguments, writes all breakpoints.",
nullptr) {
- CommandArgumentEntry arg;
- CommandObject::AddIDsArgumentData(arg, eArgTypeBreakpointID,
- eArgTypeBreakpointIDRange);
- // Add the entry for the first argument for this command to the object's
- // arguments vector.
- m_arguments.push_back(arg);
+ CommandObject::AddIDsArgumentData(eBreakpointArgs);
}
~CommandObjectBreakpointWrite() override = default;
diff --git a/lldb/source/Commands/CommandObjectBreakpointCommand.cpp b/lldb/source/Commands/CommandObjectBreakpointCommand.cpp
index fefafcd94546..6ebe6e8a3557 100644
--- a/lldb/source/Commands/CommandObjectBreakpointCommand.cpp
+++ b/lldb/source/Commands/CommandObjectBreakpointCommand.cpp
@@ -185,19 +185,7 @@ are no syntax errors may indicate that a function was declared but never called.
LLDB_OPT_SET_2);
m_all_options.Finalize();
- CommandArgumentEntry arg;
- CommandArgumentData bp_id_arg;
-
- // Define the first (and only) variant of this arg.
- bp_id_arg.arg_type = eArgTypeBreakpointID;
- bp_id_arg.arg_repetition = eArgRepeatOptional;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(bp_id_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeBreakpointID, eArgRepeatOptional);
}
~CommandObjectBreakpointCommandAdd() override = default;
@@ -449,19 +437,7 @@ public:
: CommandObjectParsed(interpreter, "delete",
"Delete the set of commands from a breakpoint.",
nullptr) {
- CommandArgumentEntry arg;
- CommandArgumentData bp_id_arg;
-
- // Define the first (and only) variant of this arg.
- bp_id_arg.arg_type = eArgTypeBreakpointID;
- bp_id_arg.arg_repetition = eArgRepeatPlain;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(bp_id_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeBreakpointID);
}
~CommandObjectBreakpointCommandDelete() override = default;
@@ -565,19 +541,7 @@ public:
"List the script or set of commands to be "
"executed when the breakpoint is hit.",
nullptr, eCommandRequiresTarget) {
- CommandArgumentEntry arg;
- CommandArgumentData bp_id_arg;
-
- // Define the first (and only) variant of this arg.
- bp_id_arg.arg_type = eArgTypeBreakpointID;
- bp_id_arg.arg_repetition = eArgRepeatPlain;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(bp_id_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeBreakpointID);
}
~CommandObjectBreakpointCommandList() override = default;
diff --git a/lldb/source/Commands/CommandObjectCommands.cpp b/lldb/source/Commands/CommandObjectCommands.cpp
index 7c459bdaf380..f4903e373b08 100644
--- a/lldb/source/Commands/CommandObjectCommands.cpp
+++ b/lldb/source/Commands/CommandObjectCommands.cpp
@@ -41,19 +41,7 @@ public:
interpreter, "command source",
"Read and execute LLDB commands from the file <filename>.",
nullptr) {
- CommandArgumentEntry arg;
- CommandArgumentData file_arg;
-
- // Define the first (and only) variant of this arg.
- file_arg.arg_type = eArgTypeFilename;
- file_arg.arg_repetition = eArgRepeatPlain;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(file_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeFilename);
}
~CommandObjectCommandsSource() override = default;
@@ -614,19 +602,7 @@ public:
interpreter, "command unalias",
"Delete one or more custom commands defined by 'command alias'.",
nullptr) {
- CommandArgumentEntry arg;
- CommandArgumentData alias_arg;
-
- // Define the first (and only) variant of this arg.
- alias_arg.arg_type = eArgTypeAliasName;
- alias_arg.arg_repetition = eArgRepeatPlain;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(alias_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeAliasName);
}
~CommandObjectCommandsUnalias() override = default;
@@ -701,19 +677,7 @@ public:
interpreter, "command delete",
"Delete one or more custom commands defined by 'command regex'.",
nullptr) {
- CommandArgumentEntry arg;
- CommandArgumentData alias_arg;
-
- // Define the first (and only) variant of this arg.
- alias_arg.arg_type = eArgTypeCommandName;
- alias_arg.arg_repetition = eArgRepeatPlain;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(alias_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeCommandName);
}
~CommandObjectCommandsDelete() override = default;
@@ -815,8 +779,7 @@ a number follows 'f':"
R"(
(lldb) command regex f s/^$/finish/ 's/([0-9]+)/frame select %1/')");
- CommandArgumentData thread_arg{eArgTypeSEDStylePair, eArgRepeatOptional};
- m_arguments.push_back({thread_arg});
+ AddSimpleArgumentList(eArgTypeSEDStylePair, eArgRepeatOptional);
}
~CommandObjectCommandsAddRegex() override = default;
@@ -1944,19 +1907,7 @@ public:
CommandObjectCommandsScriptImport(CommandInterpreter &interpreter)
: CommandObjectParsed(interpreter, "command script import",
"Import a scripting module in LLDB.", nullptr) {
- CommandArgumentEntry arg1;
- CommandArgumentData cmd_arg;
-
- // Define the first (and only) variant of this arg.
- cmd_arg.arg_type = eArgTypeFilename;
- cmd_arg.arg_repetition = eArgRepeatPlus;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg1.push_back(cmd_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg1);
+ AddSimpleArgumentList(eArgTypeFilename, eArgRepeatPlus);
}
~CommandObjectCommandsScriptImport() override = default;
@@ -2066,20 +2017,7 @@ public:
"command, and the last element will be the new "
"command name."),
IOHandlerDelegateMultiline("DONE") {
- CommandArgumentEntry arg1;
- CommandArgumentData cmd_arg;
-
- // This is one or more command names, which form the path to the command
- // you want to add.
- cmd_arg.arg_type = eArgTypeCommand;
- cmd_arg.arg_repetition = eArgRepeatPlus;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg1.push_back(cmd_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg1);
+ AddSimpleArgumentList(eArgTypeCommand, eArgRepeatPlus);
}
~CommandObjectCommandsScriptAdd() override = default;
@@ -2400,20 +2338,7 @@ public:
interpreter, "command script delete",
"Delete a scripted command by specifying the path to the command.",
nullptr) {
- CommandArgumentEntry arg1;
- CommandArgumentData cmd_arg;
-
- // This is a list of command names forming the path to the command
- // to be deleted.
- cmd_arg.arg_type = eArgTypeCommand;
- cmd_arg.arg_repetition = eArgRepeatPlus;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg1.push_back(cmd_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg1);
+ AddSimpleArgumentList(eArgTypeCommand, eArgRepeatPlus);
}
~CommandObjectCommandsScriptDelete() override = default;
@@ -2549,20 +2474,7 @@ public:
"Add a container command to lldb. Adding to built-"
"in container commands is not allowed.",
"command container add [[path1]...] container-name") {
- CommandArgumentEntry arg1;
- CommandArgumentData cmd_arg;
-
- // This is one or more command names, which form the path to the command
- // you want to add.
- cmd_arg.arg_type = eArgTypeCommand;
- cmd_arg.arg_repetition = eArgRepeatPlus;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg1.push_back(cmd_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg1);
+ AddSimpleArgumentList(eArgTypeCommand, eArgRepeatPlus);
}
~CommandObjectCommandsContainerAdd() override = default;
@@ -2690,20 +2602,7 @@ public:
"Delete a container command previously added to "
"lldb.",
"command container delete [[path1] ...] container-cmd") {
- CommandArgumentEntry arg1;
- CommandArgumentData cmd_arg;
-
- // This is one or more command names, which form the path to the command
- // you want to add.
- cmd_arg.arg_type = eArgTypeCommand;
- cmd_arg.arg_repetition = eArgRepeatPlus;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg1.push_back(cmd_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg1);
+ AddSimpleArgumentList(eArgTypeCommand, eArgRepeatPlus);
}
~CommandObjectCommandsContainerDelete() override = default;
diff --git a/lldb/source/Commands/CommandObjectDWIMPrint.cpp b/lldb/source/Commands/CommandObjectDWIMPrint.cpp
index fb2cc106ffd2..b183cb423111 100644
--- a/lldb/source/Commands/CommandObjectDWIMPrint.cpp
+++ b/lldb/source/Commands/CommandObjectDWIMPrint.cpp
@@ -37,8 +37,7 @@ CommandObjectDWIMPrint::CommandObjectDWIMPrint(CommandInterpreter &interpreter)
"dwim-print [<variable-name> | <expression>]",
eCommandProcessMustBePaused | eCommandTryTargetAPILock) {
- CommandArgumentData var_name_arg(eArgTypeVarName, eArgRepeatPlain);
- m_arguments.push_back({var_name_arg});
+ AddSimpleArgumentList(eArgTypeVarName);
m_option_group.Append(&m_format_options,
OptionGroupFormat::OPTION_GROUP_FORMAT |
diff --git a/lldb/source/Commands/CommandObjectExpression.cpp b/lldb/source/Commands/CommandObjectExpression.cpp
index 3a2dc11e1e71..2319ddd3c80a 100644
--- a/lldb/source/Commands/CommandObjectExpression.cpp
+++ b/lldb/source/Commands/CommandObjectExpression.cpp
@@ -311,19 +311,7 @@ Examples:
expr unsigned int $foo = 5
expr char c[] = \"foo\"; c[0])");
- CommandArgumentEntry arg;
- CommandArgumentData expression_arg;
-
- // Define the first (and only) variant of this arg.
- expression_arg.arg_type = eArgTypeExpression;
- expression_arg.arg_repetition = eArgRepeatPlain;
-
- // There is only one variant this argument could be; put it into the argument
- // entry.
- arg.push_back(expression_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeExpression);
// Add the "--format" and "--gdb-format"
m_option_group.Append(&m_format_options,
diff --git a/lldb/source/Commands/CommandObjectFrame.cpp b/lldb/source/Commands/CommandObjectFrame.cpp
index f092d54ffe99..b1d060b3c6cf 100644
--- a/lldb/source/Commands/CommandObjectFrame.cpp
+++ b/lldb/source/Commands/CommandObjectFrame.cpp
@@ -113,19 +113,7 @@ public:
eCommandRequiresThread | eCommandTryTargetAPILock |
eCommandProcessMustBeLaunched |
eCommandProcessMustBePaused) {
- CommandArgumentEntry arg;
- CommandArgumentData index_arg;
-
- // Define the first (and only) variant of this arg.
- index_arg.arg_type = eArgTypeFrameIndex;
- index_arg.arg_repetition = eArgRepeatOptional;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(index_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeFrameIndex, eArgRepeatOptional);
}
~CommandObjectFrameDiagnose() override = default;
@@ -269,19 +257,7 @@ public:
eCommandRequiresThread | eCommandTryTargetAPILock |
eCommandProcessMustBeLaunched |
eCommandProcessMustBePaused) {
- CommandArgumentEntry arg;
- CommandArgumentData index_arg;
-
- // Define the first (and only) variant of this arg.
- index_arg.arg_type = eArgTypeFrameIndex;
- index_arg.arg_repetition = eArgRepeatOptional;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(index_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeFrameIndex, eArgRepeatOptional);
}
~CommandObjectFrameSelect() override = default;
@@ -409,19 +385,7 @@ However, 'frame variable' is more efficient, since it uses debug information and
memory reads directly, rather than parsing and evaluating an expression, which
may even involve JITing and running code in the target program.)");
- CommandArgumentEntry arg;
- CommandArgumentData var_name_arg;
-
- // Define the first (and only) variant of this arg.
- var_name_arg.arg_type = eArgTypeVarName;
- var_name_arg.arg_repetition = eArgRepeatStar;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(var_name_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeVarName, eArgRepeatStar);
m_option_group.Append(&m_option_variable, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1);
m_option_group.Append(&m_option_format,
@@ -939,8 +903,7 @@ public:
: CommandObjectParsed(interpreter, "frame recognizer delete",
"Delete an existing frame recognizer by id.",
nullptr) {
- CommandArgumentData thread_arg{eArgTypeRecognizerID, eArgRepeatPlain};
- m_arguments.push_back({thread_arg});
+ AddSimpleArgumentList(eArgTypeRecognizerID);
}
~CommandObjectFrameRecognizerDelete() override = default;
@@ -1065,19 +1028,7 @@ public:
interpreter, "frame recognizer info",
"Show which frame recognizer is applied a stack frame (if any).",
nullptr) {
- CommandArgumentEntry arg;
- CommandArgumentData index_arg;
-
- // Define the first (and only) variant of this arg.
- index_arg.arg_type = eArgTypeFrameIndex;
- index_arg.arg_repetition = eArgRepeatPlain;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(index_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeFrameIndex);
}
~CommandObjectFrameRecognizerInfo() override = default;
diff --git a/lldb/source/Commands/CommandObjectHelp.cpp b/lldb/source/Commands/CommandObjectHelp.cpp
index ddb006e52d2c..f1dbd03fe97c 100644
--- a/lldb/source/Commands/CommandObjectHelp.cpp
+++ b/lldb/source/Commands/CommandObjectHelp.cpp
@@ -48,20 +48,9 @@ CommandObjectHelp::CommandObjectHelp(CommandInterpreter &interpreter)
"commands, or give details "
"about a specific command.",
"help [<cmd-name>]") {
- CommandArgumentEntry arg;
- CommandArgumentData command_arg;
-
// A list of command names forming a path to the command we want help on.
// No names is allowed - in which case we dump the top-level help.
- command_arg.arg_type = eArgTypeCommand;
- command_arg.arg_repetition = eArgRepeatStar;
-
- // There is only one variant this argument could be; put it into the argument
- // entry.
- arg.push_back(command_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeCommand, eArgRepeatStar);
}
CommandObjectHelp::~CommandObjectHelp() = default;
diff --git a/lldb/source/Commands/CommandObjectLog.cpp b/lldb/source/Commands/CommandObjectLog.cpp
index 6bfbf98078e6..48dfd9456a66 100644
--- a/lldb/source/Commands/CommandObjectLog.cpp
+++ b/lldb/source/Commands/CommandObjectLog.cpp
@@ -288,19 +288,7 @@ public:
"List the log categories for one or more log "
"channels. If none specified, lists them all.",
nullptr) {
- CommandArgumentEntry arg;
- CommandArgumentData channel_arg;
-
- // Define the first (and only) variant of this arg.
- channel_arg.arg_type = eArgTypeLogChannel;
- channel_arg.arg_repetition = eArgRepeatStar;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(channel_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeLogChannel, eArgRepeatStar);
}
~CommandObjectLogList() override = default;
@@ -335,19 +323,7 @@ public:
CommandObjectLogDump(CommandInterpreter &interpreter)
: CommandObjectParsed(interpreter, "log dump",
"dump circular buffer logs", nullptr) {
- CommandArgumentEntry arg1;
- CommandArgumentData channel_arg;
-
- // Define the first (and only) variant of this arg.
- channel_arg.arg_type = eArgTypeLogChannel;
- channel_arg.arg_repetition = eArgRepeatPlain;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg1.push_back(channel_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg1);
+ AddSimpleArgumentList(eArgTypeLogChannel);
}
~CommandObjectLogDump() override = default;
@@ -444,19 +420,7 @@ public:
: CommandObjectParsed(interpreter, "log timers enable",
"enable LLDB internal performance timers",
"log timers enable <depth>") {
- CommandArgumentEntry arg;
- CommandArgumentData depth_arg;
-
- // Define the first (and only) variant of this arg.
- depth_arg.arg_type = eArgTypeCount;
- depth_arg.arg_repetition = eArgRepeatOptional;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(depth_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeCount, eArgRepeatOptional);
}
~CommandObjectLogTimerEnable() override = default;
@@ -559,19 +523,7 @@ public:
: CommandObjectParsed(interpreter, "log timers increment",
"increment LLDB internal performance timers",
"log timers increment <bool>") {
- CommandArgumentEntry arg;
- CommandArgumentData bool_arg;
-
- // Define the first (and only) variant of this arg.
- bool_arg.arg_type = eArgTypeBoolean;
- bool_arg.arg_repetition = eArgRepeatPlain;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(bool_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeBoolean);
}
~CommandObjectLogTimerIncrement() override = default;
diff --git a/lldb/source/Commands/CommandObjectPlatform.cpp b/lldb/source/Commands/CommandObjectPlatform.cpp
index b25c391bd4fa..5b18f2b60e92 100644
--- a/lldb/source/Commands/CommandObjectPlatform.cpp
+++ b/lldb/source/Commands/CommandObjectPlatform.cpp
@@ -155,8 +155,7 @@ public:
{
m_option_group.Append(&m_platform_options, LLDB_OPT_SET_ALL, 1);
m_option_group.Finalize();
- CommandArgumentData platform_arg{eArgTypePlatform, eArgRepeatPlain};
- m_arguments.push_back({platform_arg});
+ AddSimpleArgumentList(eArgTypePlatform);
}
~CommandObjectPlatformSelect() override = default;
@@ -276,8 +275,7 @@ public:
interpreter, "platform connect",
"Select the current platform by providing a connection URL.",
"platform connect <connect-url>", 0) {
- CommandArgumentData platform_arg{eArgTypeConnectURL, eArgRepeatPlain};
- m_arguments.push_back({platform_arg});
+ AddSimpleArgumentList(eArgTypeConnectURL);
}
~CommandObjectPlatformConnect() override = default;
@@ -418,8 +416,7 @@ public:
: CommandObjectParsed(interpreter, "platform mkdir",
"Make a new directory on the remote end.", nullptr,
0) {
- CommandArgumentData thread_arg{eArgTypeRemotePath, eArgRepeatPlain};
- m_arguments.push_back({thread_arg});
+ AddSimpleArgumentList(eArgTypeRemotePath);
}
~CommandObjectPlatformMkDir() override = default;
@@ -467,8 +464,7 @@ public:
CommandObjectPlatformFOpen(CommandInterpreter &interpreter)
: CommandObjectParsed(interpreter, "platform file open",
"Open a file on the remote end.", nullptr, 0) {
- CommandArgumentData path_arg{eArgTypeRemotePath, eArgRepeatPlain};
- m_arguments.push_back({path_arg});
+ AddSimpleArgumentList(eArgTypeRemotePath);
}
~CommandObjectPlatformFOpen() override = default;
@@ -521,8 +517,7 @@ public:
CommandObjectPlatformFClose(CommandInterpreter &interpreter)
: CommandObjectParsed(interpreter, "platform file close",
"Close a file on the remote end.", nullptr, 0) {
- CommandArgumentData path_arg{eArgTypeUnsignedInteger, eArgRepeatPlain};
- m_arguments.push_back({path_arg});
+ AddSimpleArgumentList(eArgTypeUnsignedInteger);
}
~CommandObjectPlatformFClose() override = default;
@@ -564,8 +559,7 @@ public:
: CommandObjectParsed(interpreter, "platform file read",
"Read data from a file on the remote end.", nullptr,
0) {
- CommandArgumentData path_arg{eArgTypeUnsignedInteger, eArgRepeatPlain};
- m_arguments.push_back({path_arg});
+ AddSimpleArgumentList(eArgTypeUnsignedInteger);
}
~CommandObjectPlatformFRead() override = default;
@@ -659,8 +653,7 @@ public:
: CommandObjectParsed(interpreter, "platform file write",
"Write data to a file on the remote end.", nullptr,
0) {
- CommandArgumentData path_arg{eArgTypeUnsignedInteger, eArgRepeatPlain};
- m_arguments.push_back({path_arg});
+ AddSimpleArgumentList(eArgTypeUnsignedInteger);
}
~CommandObjectPlatformFWrite() override = default;
@@ -863,18 +856,7 @@ public:
Get the file size from the remote end with path /the/remote/file/path.)");
- CommandArgumentEntry arg1;
- CommandArgumentData file_arg_remote;
-
- // Define the first (and only) variant of this arg.
- file_arg_remote.arg_type = eArgTypeRemoteFilename;
- file_arg_remote.arg_repetition = eArgRepeatPlain;
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg1.push_back(file_arg_remote);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg1);
+ AddSimpleArgumentList(eArgTypeRemoteFilename);
}
~CommandObjectPlatformGetSize() override = default;
@@ -922,18 +904,7 @@ public:
Get the file permissions from the remote end with path /the/remote/file/path.)");
- CommandArgumentEntry arg1;
- CommandArgumentData file_arg_remote;
-
- // Define the first (and only) variant of this arg.
- file_arg_remote.arg_type = eArgTypeRemoteFilename;
- file_arg_remote.arg_repetition = eArgRepeatPlain;
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg1.push_back(file_arg_remote);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg1);
+ AddSimpleArgumentList(eArgTypeRemoteFilename);
}
~CommandObjectPlatformGetPermissions() override = default;
@@ -980,18 +951,7 @@ public:
Check if /the/remote/file/path exists on the remote end.)");
- CommandArgumentEntry arg1;
- CommandArgumentData file_arg_remote;
-
- // Define the first (and only) variant of this arg.
- file_arg_remote.arg_type = eArgTypeRemoteFilename;
- file_arg_remote.arg_repetition = eArgRepeatPlain;
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg1.push_back(file_arg_remote);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg1);
+ AddSimpleArgumentList(eArgTypeRemoteFilename);
}
~CommandObjectPlatformFileExists() override = default;
@@ -1093,8 +1053,7 @@ public:
m_all_options.Append(&m_class_options, LLDB_OPT_SET_1 | LLDB_OPT_SET_2,
LLDB_OPT_SET_ALL);
m_all_options.Finalize();
- CommandArgumentData run_arg_arg{eArgTypeRunArgs, eArgRepeatStar};
- m_arguments.push_back({run_arg_arg});
+ AddSimpleArgumentList(eArgTypeRunArgs, eArgRepeatStar);
}
void
@@ -1503,19 +1462,7 @@ public:
interpreter, "platform process info",
"Get detailed information for one or more process by process ID.",
"platform process info <pid> [<pid> <pid> ...]", 0) {
- CommandArgumentEntry arg;
- CommandArgumentData pid_args;
-
- // Define the first (and only) variant of this arg.
- pid_args.arg_type = eArgTypePid;
- pid_args.arg_repetition = eArgRepeatStar;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(pid_args);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypePid, eArgRepeatStar);
}
~CommandObjectPlatformProcessInfo() override = default;
@@ -1721,8 +1668,7 @@ public:
: CommandObjectRaw(interpreter, "platform shell",
"Run a shell command on the current platform.",
"platform shell <shell-command>", 0) {
- CommandArgumentData thread_arg{eArgTypeNone, eArgRepeatStar};
- m_arguments.push_back({thread_arg});
+ AddSimpleArgumentList(eArgTypeNone, eArgRepeatStar);
}
~CommandObjectPlatformShell() override = default;
diff --git a/lldb/source/Commands/CommandObjectPlugin.cpp b/lldb/source/Commands/CommandObjectPlugin.cpp
index da3b5f0518a6..f3108b8a768d 100644
--- a/lldb/source/Commands/CommandObjectPlugin.cpp
+++ b/lldb/source/Commands/CommandObjectPlugin.cpp
@@ -19,19 +19,7 @@ public:
: CommandObjectParsed(interpreter, "plugin load",
"Import a dylib that implements an LLDB plugin.",
nullptr) {
- CommandArgumentEntry arg1;
- CommandArgumentData cmd_arg;
-
- // Define the first (and only) variant of this arg.
- cmd_arg.arg_type = eArgTypeFilename;
- cmd_arg.arg_repetition = eArgRepeatPlain;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg1.push_back(cmd_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg1);
+ AddSimpleArgumentList(eArgTypeFilename);
}
~CommandObjectPluginLoad() override = default;
diff --git a/lldb/source/Commands/CommandObjectProcess.cpp b/lldb/source/Commands/CommandObjectProcess.cpp
index 7cd5ad656f1b..9ac97eb66b62 100644
--- a/lldb/source/Commands/CommandObjectProcess.cpp
+++ b/lldb/source/Commands/CommandObjectProcess.cpp
@@ -126,19 +126,7 @@ public:
LLDB_OPT_SET_ALL);
m_all_options.Finalize();
- CommandArgumentEntry arg;
- CommandArgumentData run_args_arg;
-
- // Define the first (and only) variant of this arg.
- run_args_arg.arg_type = eArgTypeRunArgs;
- run_args_arg.arg_repetition = eArgRepeatOptional;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(run_args_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeRunArgs, eArgRepeatOptional);
}
~CommandObjectProcessLaunch() override = default;
@@ -870,8 +858,7 @@ public:
: CommandObjectParsed(interpreter, "process connect",
"Connect to a remote debug service.",
"process connect <remote-url>", 0) {
- CommandArgumentData connect_arg{eArgTypeConnectURL, eArgRepeatPlain};
- m_arguments.push_back({connect_arg});
+ AddSimpleArgumentList(eArgTypeConnectURL);
}
~CommandObjectProcessConnect() override = default;
@@ -996,8 +983,7 @@ public:
eCommandRequiresProcess | eCommandTryTargetAPILock |
eCommandProcessMustBeLaunched |
eCommandProcessMustBePaused) {
- CommandArgumentData file_arg{eArgTypePath, eArgRepeatPlus};
- m_arguments.push_back({file_arg});
+ AddSimpleArgumentList(eArgTypePath, eArgRepeatPlus);
}
~CommandObjectProcessLoad() override = default;
@@ -1070,8 +1056,7 @@ public:
"process unload <index>",
eCommandRequiresProcess | eCommandTryTargetAPILock |
eCommandProcessMustBeLaunched | eCommandProcessMustBePaused) {
- CommandArgumentData load_idx_arg{eArgTypeUnsignedInteger, eArgRepeatPlain};
- m_arguments.push_back({load_idx_arg});
+ AddSimpleArgumentList(eArgTypeUnsignedInteger);
}
~CommandObjectProcessUnload() override = default;
@@ -1131,19 +1116,7 @@ public:
interpreter, "process signal",
"Send a UNIX signal to the current target process.", nullptr,
eCommandRequiresProcess | eCommandTryTargetAPILock) {
- CommandArgumentEntry arg;
- CommandArgumentData signal_arg;
-
- // Define the first (and only) variant of this arg.
- signal_arg.arg_type = eArgTypeUnixSignal;
- signal_arg.arg_repetition = eArgRepeatPlain;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(signal_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeUnixSignal);
}
~CommandObjectProcessSignal() override = default;
@@ -1274,8 +1247,7 @@ public:
"process save-core [-s corefile-style -p plugin-name] FILE",
eCommandRequiresProcess | eCommandTryTargetAPILock |
eCommandProcessMustBeLaunched) {
- CommandArgumentData file_arg{eArgTypePath, eArgRepeatPlain};
- m_arguments.push_back({file_arg});
+ AddSimpleArgumentList(eArgTypePath);
}
~CommandObjectProcessSaveCore() override = default;
@@ -1559,15 +1531,7 @@ public:
"by passing the -t option."
"\nYou can also clear the target modification for a signal"
"by passing the -c option");
- CommandArgumentEntry arg;
- CommandArgumentData signal_arg;
-
- signal_arg.arg_type = eArgTypeUnixSignal;
- signal_arg.arg_repetition = eArgRepeatStar;
-
- arg.push_back(signal_arg);
-
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeUnixSignal, eArgRepeatStar);
}
~CommandObjectProcessHandle() override = default;
diff --git a/lldb/source/Commands/CommandObjectQuit.cpp b/lldb/source/Commands/CommandObjectQuit.cpp
index d7caf1546fb5..8e7830b2afc6 100644
--- a/lldb/source/Commands/CommandObjectQuit.cpp
+++ b/lldb/source/Commands/CommandObjectQuit.cpp
@@ -21,8 +21,7 @@ using namespace lldb_private;
CommandObjectQuit::CommandObjectQuit(CommandInterpreter &interpreter)
: CommandObjectParsed(interpreter, "quit", "Quit the LLDB debugger.",
"quit [exit-code]") {
- CommandArgumentData exit_code_arg{eArgTypeUnsignedInteger, eArgRepeatPlain};
- m_arguments.push_back({exit_code_arg});
+ AddSimpleArgumentList(eArgTypeUnsignedInteger);
}
CommandObjectQuit::~CommandObjectQuit() = default;
diff --git a/lldb/source/Commands/CommandObjectRegister.cpp b/lldb/source/Commands/CommandObjectRegister.cpp
index 4ffdde1ee09f..4e047ccbc10b 100644
--- a/lldb/source/Commands/CommandObjectRegister.cpp
+++ b/lldb/source/Commands/CommandObjectRegister.cpp
@@ -50,19 +50,7 @@ public:
{{CommandArgumentType::eArgTypeFormat,
"Specify a format to be used for display. If this "
"is set, register fields will not be displayed."}}) {
- CommandArgumentEntry arg;
- CommandArgumentData register_arg;
-
- // Define the first (and only) variant of this arg.
- register_arg.arg_type = eArgTypeRegisterName;
- register_arg.arg_repetition = eArgRepeatStar;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(register_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeRegisterName, eArgRepeatStar);
// Add the "--format"
m_option_group.Append(&m_format_options,
@@ -422,13 +410,7 @@ Fields (*) A table of the names and bit positions of the values contained
Fields marked with (*) may not always be present. Some information may be
different for the same register when connected to different debug servers.)");
- CommandArgumentData register_arg;
- register_arg.arg_type = eArgTypeRegisterName;
- register_arg.arg_repetition = eArgRepeatPlain;
-
- CommandArgumentEntry arg1;
- arg1.push_back(register_arg);
- m_arguments.push_back(arg1);
+ AddSimpleArgumentList(eArgTypeRegisterName);
}
~CommandObjectRegisterInfo() override = default;
diff --git a/lldb/source/Commands/CommandObjectSession.cpp b/lldb/source/Commands/CommandObjectSession.cpp
index 28506d6c5951..c381ba4f74f1 100644
--- a/lldb/source/Commands/CommandObjectSession.cpp
+++ b/lldb/source/Commands/CommandObjectSession.cpp
@@ -21,9 +21,7 @@ public:
"If no file if specified, transcripts will be "
"saved to a temporary file.",
"session save [file]") {
- CommandArgumentEntry arg1;
- arg1.emplace_back(eArgTypePath, eArgRepeatOptional);
- m_arguments.push_back(arg1);
+ AddSimpleArgumentList(eArgTypePath, eArgRepeatOptional);
}
~CommandObjectSessionSave() override = default;
diff --git a/lldb/source/Commands/CommandObjectSettings.cpp b/lldb/source/Commands/CommandObjectSettings.cpp
index 0cf3d1daf7f5..7bbb0dd567ab 100644
--- a/lldb/source/Commands/CommandObjectSettings.cpp
+++ b/lldb/source/Commands/CommandObjectSettings.cpp
@@ -245,19 +245,7 @@ public:
"Show matching debugger settings and their current "
"values. Defaults to showing all settings.",
nullptr) {
- CommandArgumentEntry arg1;
- CommandArgumentData var_name_arg;
-
- // Define the first (and only) variant of this arg.
- var_name_arg.arg_type = eArgTypeSettingVariableName;
- var_name_arg.arg_repetition = eArgRepeatOptional;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg1.push_back(var_name_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg1);
+ AddSimpleArgumentList(eArgTypeSettingVariableName, eArgRepeatOptional);
}
~CommandObjectSettingsShow() override = default;
@@ -297,19 +285,7 @@ public:
"current values to a file that can be read in with "
"\"settings read\". Defaults to writing all settings.",
nullptr) {
- CommandArgumentEntry arg1;
- CommandArgumentData var_name_arg;
-
- // Define the first (and only) variant of this arg.
- var_name_arg.arg_type = eArgTypeSettingVariableName;
- var_name_arg.arg_repetition = eArgRepeatOptional;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg1.push_back(var_name_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg1);
+ AddSimpleArgumentList(eArgTypeSettingVariableName, eArgRepeatOptional);
}
~CommandObjectSettingsWrite() override = default;
@@ -997,19 +973,7 @@ public:
interpreter, "settings clear",
"Clear a debugger setting array, dictionary, or string. "
"If '-a' option is specified, it clears all settings.", nullptr) {
- CommandArgumentEntry arg;
- CommandArgumentData var_name_arg;
-
- // Define the first (and only) variant of this arg.
- var_name_arg.arg_type = eArgTypeSettingVariableName;
- var_name_arg.arg_repetition = eArgRepeatPlain;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(var_name_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeSettingVariableName);
}
~CommandObjectSettingsClear() override = default;
diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp
index 4e006e4bb0e0..45265577e8b6 100644
--- a/lldb/source/Commands/CommandObjectTarget.cpp
+++ b/lldb/source/Commands/CommandObjectTarget.cpp
@@ -229,19 +229,8 @@ public:
m_remote_file(
LLDB_OPT_SET_1, false, "remote-file", 'r', 0, eArgTypeFilename,
"Fullpath to the file on the remote host if debugging remotely.") {
- CommandArgumentEntry arg;
- CommandArgumentData file_arg;
-
- // Define the first (and only) variant of this arg.
- file_arg.arg_type = eArgTypeFilename;
- file_arg.arg_repetition = eArgRepeatPlain;
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(file_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeFilename);
m_option_group.Append(&m_arch_option, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1);
m_option_group.Append(&m_platform_options, LLDB_OPT_SET_ALL, 1);
@@ -503,8 +492,7 @@ public:
: CommandObjectParsed(
interpreter, "target select",
"Select a target as the current target by target index.", nullptr) {
- CommandArgumentData target_arg{eArgTypeTargetID, eArgRepeatPlain};
- m_arguments.push_back({target_arg});
+ AddSimpleArgumentList(eArgTypeTargetID);
}
~CommandObjectTargetSelect() override = default;
@@ -586,8 +574,7 @@ public:
m_option_group.Append(&m_all_option, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1);
m_option_group.Append(&m_cleanup_option, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1);
m_option_group.Finalize();
- CommandArgumentData target_arg{eArgTypeTargetID, eArgRepeatStar};
- m_arguments.push_back({target_arg});
+ AddSimpleArgumentList(eArgTypeTargetID, eArgRepeatStar);
}
~CommandObjectTargetDelete() override = default;
@@ -729,19 +716,7 @@ public:
"A basename or fullpath to a shared library to use in the search "
"for global "
"variables. This option can be specified multiple times.") {
- CommandArgumentEntry arg;
- CommandArgumentData var_name_arg;
-
- // Define the first (and only) variant of this arg.
- var_name_arg.arg_type = eArgTypeVarName;
- var_name_arg.arg_repetition = eArgRepeatPlus;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(var_name_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeVarName, eArgRepeatPlus);
m_option_group.Append(&m_varobj_options, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1);
m_option_group.Append(&m_option_variable, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1);
@@ -1243,19 +1218,7 @@ public:
interpreter, "target modules search-paths query",
"Transform a path using the first applicable image search path.",
nullptr, eCommandRequiresTarget) {
- CommandArgumentEntry arg;
- CommandArgumentData path_arg;
-
- // Define the first (and only) variant of this arg.
- path_arg.arg_type = eArgTypeDirectoryName;
- path_arg.arg_repetition = eArgRepeatPlain;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(path_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeDirectoryName);
}
~CommandObjectTargetModulesSearchPathsQuery() override = default;
@@ -1881,19 +1844,7 @@ public:
const char *syntax,
uint32_t flags = 0)
: CommandObjectParsed(interpreter, name, help, syntax, flags) {
- CommandArgumentEntry arg;
- CommandArgumentData file_arg;
-
- // Define the first (and only) variant of this arg.
- file_arg.arg_type = eArgTypeFilename;
- file_arg.arg_repetition = eArgRepeatStar;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(file_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeFilename, eArgRepeatStar);
}
~CommandObjectTargetModulesModuleAutoComplete() override = default;
@@ -1918,19 +1869,7 @@ public:
CommandInterpreter &interpreter, const char *name, const char *help,
const char *syntax, uint32_t flags)
: CommandObjectParsed(interpreter, name, help, syntax, flags) {
- CommandArgumentEntry arg;
- CommandArgumentData source_file_arg;
-
- // Define the first (and only) variant of this arg.
- source_file_arg.arg_type = eArgTypeSourceFile;
- source_file_arg.arg_repetition = eArgRepeatPlus;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(source_file_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeSourceFile, eArgRepeatPlus);
}
~CommandObjectTargetModulesSourceFileAutoComplete() override = default;
@@ -2234,8 +2173,7 @@ public:
interpreter, "target modules dump pcm-info",
"Dump information about the given clang module (pcm).") {
// Take a single file argument.
- CommandArgumentData arg{eArgTypeFilename, eArgRepeatPlain};
- m_arguments.push_back({arg});
+ AddSimpleArgumentList(eArgTypeFilename);
}
~CommandObjectTargetModulesDumpClangPCMInfo() override = default;
@@ -2774,8 +2712,7 @@ public:
LLDB_OPT_SET_1);
m_option_group.Append(&m_symbol_file, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1);
m_option_group.Finalize();
- CommandArgumentData module_arg{eArgTypePath, eArgRepeatStar};
- m_arguments.push_back({module_arg});
+ AddSimpleArgumentList(eArgTypePath, eArgRepeatStar);
}
~CommandObjectTargetModulesAdd() override = default;
@@ -3219,8 +3156,7 @@ public:
: CommandObjectParsed(
interpreter, "target modules list",
"List current executable and dependent shared library images.") {
- CommandArgumentData module_arg{eArgTypeModule, eArgRepeatStar};
- m_arguments.push_back({module_arg});
+ AddSimpleArgumentList(eArgTypeModule, eArgRepeatStar);
}
~CommandObjectTargetModulesList() override = default;
@@ -3992,19 +3928,7 @@ public:
"Look up information within executable and "
"dependent shared library images.",
nullptr, eCommandRequiresTarget) {
- CommandArgumentEntry arg;
- CommandArgumentData file_arg;
-
- // Define the first (and only) variant of this arg.
- file_arg.arg_type = eArgTypeFilename;
- file_arg.arg_repetition = eArgRepeatStar;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(file_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeFilename, eArgRepeatStar);
}
~CommandObjectTargetModulesLookup() override = default;
@@ -4323,8 +4247,7 @@ public:
m_option_group.Append(&m_current_stack_option, LLDB_OPT_SET_2,
LLDB_OPT_SET_2);
m_option_group.Finalize();
- CommandArgumentData module_arg{eArgTypeShlibName, eArgRepeatPlain};
- m_arguments.push_back({module_arg});
+ AddSimpleArgumentList(eArgTypeShlibName);
}
~CommandObjectTargetSymbolsAdd() override = default;
@@ -5163,8 +5086,7 @@ public:
: CommandObjectParsed(interpreter, "target stop-hook delete",
"Delete a stop-hook.",
"target stop-hook delete [<idx>]") {
- CommandArgumentData hook_arg{eArgTypeStopHookID, eArgRepeatStar};
- m_arguments.push_back({hook_arg});
+ AddSimpleArgumentList(eArgTypeStopHookID, eArgRepeatStar);
}
~CommandObjectTargetStopHookDelete() override = default;
@@ -5218,8 +5140,7 @@ public:
bool enable, const char *name,
const char *help, const char *syntax)
: CommandObjectParsed(interpreter, name, help, syntax), m_enable(enable) {
- CommandArgumentData hook_arg{eArgTypeStopHookID, eArgRepeatStar};
- m_arguments.push_back({hook_arg});
+ AddSimpleArgumentList(eArgTypeStopHookID, eArgRepeatStar);
}
~CommandObjectTargetStopHookEnableDisable() override = default;
diff --git a/lldb/source/Commands/CommandObjectThread.cpp b/lldb/source/Commands/CommandObjectThread.cpp
index 52e493b13c61..9cfff059d6bf 100644
--- a/lldb/source/Commands/CommandObjectThread.cpp
+++ b/lldb/source/Commands/CommandObjectThread.cpp
@@ -374,19 +374,7 @@ public:
eCommandProcessMustBePaused),
m_step_type(step_type), m_step_scope(step_scope),
m_class_options("scripted step") {
- CommandArgumentEntry arg;
- CommandArgumentData thread_id_arg;
-
- // Define the first (and only) variant of this arg.
- thread_id_arg.arg_type = eArgTypeThreadID;
- thread_id_arg.arg_repetition = eArgRepeatOptional;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(thread_id_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeThreadID, eArgRepeatOptional);
if (step_type == eStepTypeScripted) {
m_all_options.Append(&m_class_options, LLDB_OPT_SET_1 | LLDB_OPT_SET_2,
@@ -643,19 +631,7 @@ public:
nullptr,
eCommandRequiresThread | eCommandTryTargetAPILock |
eCommandProcessMustBeLaunched | eCommandProcessMustBePaused) {
- CommandArgumentEntry arg;
- CommandArgumentData thread_idx_arg;
-
- // Define the first (and only) variant of this arg.
- thread_idx_arg.arg_type = eArgTypeThreadIndex;
- thread_idx_arg.arg_repetition = eArgRepeatPlus;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(thread_idx_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeThreadIndex, eArgRepeatPlus);
}
~CommandObjectThreadContinue() override = default;
@@ -886,19 +862,7 @@ public:
nullptr,
eCommandRequiresThread | eCommandTryTargetAPILock |
eCommandProcessMustBeLaunched | eCommandProcessMustBePaused) {
- CommandArgumentEntry arg;
- CommandArgumentData line_num_arg;
-
- // Define the first (and only) variant of this arg.
- line_num_arg.arg_type = eArgTypeLineNum;
- line_num_arg.arg_repetition = eArgRepeatPlain;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(line_num_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeLineNum);
}
~CommandObjectThreadUntil() override = default;
@@ -1539,19 +1503,7 @@ public:
eCommandRequiresFrame | eCommandTryTargetAPILock |
eCommandProcessMustBeLaunched |
eCommandProcessMustBePaused) {
- CommandArgumentEntry arg;
- CommandArgumentData expression_arg;
-
- // Define the first (and only) variant of this arg.
- expression_arg.arg_type = eArgTypeExpression;
- expression_arg.arg_repetition = eArgRepeatOptional;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(expression_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeExpression, eArgRepeatOptional);
}
~CommandObjectThreadReturn() override = default;
@@ -1919,19 +1871,7 @@ public:
eCommandTryTargetAPILock |
eCommandProcessMustBeLaunched |
eCommandProcessMustBePaused) {
- CommandArgumentEntry arg;
- CommandArgumentData plan_index_arg;
-
- // Define the first (and only) variant of this arg.
- plan_index_arg.arg_type = eArgTypeUnsignedInteger;
- plan_index_arg.arg_repetition = eArgRepeatPlain;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(plan_index_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeUnsignedInteger);
}
~CommandObjectThreadPlanDiscard() override = default;
@@ -1992,19 +1932,7 @@ public:
eCommandTryTargetAPILock |
eCommandProcessMustBeLaunched |
eCommandProcessMustBePaused) {
- CommandArgumentEntry arg;
- CommandArgumentData tid_arg;
-
- // Define the first (and only) variant of this arg.
- tid_arg.arg_type = eArgTypeThreadID;
- tid_arg.arg_repetition = eArgRepeatStar;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(tid_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeThreadID, eArgRepeatStar);
}
~CommandObjectThreadPlanPrune() override = default;
@@ -2221,8 +2149,7 @@ public:
eCommandRequiresProcess | eCommandRequiresThread |
eCommandTryTargetAPILock | eCommandProcessMustBeLaunched |
eCommandProcessMustBePaused | eCommandProcessMustBeTraced) {
- CommandArgumentData thread_arg{eArgTypeThreadIndex, eArgRepeatOptional};
- m_arguments.push_back({thread_arg});
+ AddSimpleArgumentList(eArgTypeThreadIndex, eArgRepeatOptional);
}
~CommandObjectTraceDumpFunctionCalls() override = default;
@@ -2395,8 +2322,7 @@ public:
eCommandRequiresProcess | eCommandRequiresThread |
eCommandTryTargetAPILock | eCommandProcessMustBeLaunched |
eCommandProcessMustBePaused | eCommandProcessMustBeTraced) {
- CommandArgumentData thread_arg{eArgTypeThreadIndex, eArgRepeatOptional};
- m_arguments.push_back({thread_arg});
+ AddSimpleArgumentList(eArgTypeThreadIndex, eArgRepeatOptional);
}
~CommandObjectTraceDumpInstructions() override = default;
diff --git a/lldb/source/Commands/CommandObjectThreadUtil.cpp b/lldb/source/Commands/CommandObjectThreadUtil.cpp
index d7fa4190a245..cdc5946547f4 100644
--- a/lldb/source/Commands/CommandObjectThreadUtil.cpp
+++ b/lldb/source/Commands/CommandObjectThreadUtil.cpp
@@ -21,8 +21,7 @@ CommandObjectIterateOverThreads::CommandObjectIterateOverThreads(
const char *syntax, uint32_t flags)
: CommandObjectParsed(interpreter, name, help, syntax, flags) {
// These commands all take thread ID's as arguments.
- CommandArgumentData thread_arg{eArgTypeThreadIndex, eArgRepeatStar};
- m_arguments.push_back({thread_arg});
+ AddSimpleArgumentList(eArgTypeThreadIndex, eArgRepeatStar);
}
CommandObjectMultipleThreads::CommandObjectMultipleThreads(
@@ -30,8 +29,7 @@ CommandObjectMultipleThreads::CommandObjectMultipleThreads(
const char *syntax, uint32_t flags)
: CommandObjectParsed(interpreter, name, help, syntax, flags) {
// These commands all take thread ID's as arguments.
- CommandArgumentData thread_arg{eArgTypeThreadIndex, eArgRepeatStar};
- m_arguments.push_back({thread_arg});
+ AddSimpleArgumentList(eArgTypeThreadIndex, eArgRepeatStar);
}
void CommandObjectIterateOverThreads::DoExecute(Args &command,
diff --git a/lldb/source/Commands/CommandObjectTrace.cpp b/lldb/source/Commands/CommandObjectTrace.cpp
index e0c74e29aaa6..5bcbc236301c 100644
--- a/lldb/source/Commands/CommandObjectTrace.cpp
+++ b/lldb/source/Commands/CommandObjectTrace.cpp
@@ -89,8 +89,7 @@ public:
eCommandRequiresProcess | eCommandTryTargetAPILock |
eCommandProcessMustBeLaunched | eCommandProcessMustBePaused |
eCommandProcessMustBeTraced) {
- CommandArgumentData bundle_dir{eArgTypeDirectoryName, eArgRepeatPlain};
- m_arguments.push_back({bundle_dir});
+ AddSimpleArgumentList(eArgTypeDirectoryName);
}
void
@@ -176,8 +175,7 @@ public:
interpreter, "trace load",
"Load a post-mortem processor trace session from a trace bundle.",
"trace load <trace_description_file>") {
- CommandArgumentData session_file_arg{eArgTypeFilename, eArgRepeatPlain};
- m_arguments.push_back({session_file_arg});
+ AddSimpleArgumentList(eArgTypeFilename);
}
void
@@ -332,8 +330,7 @@ public:
"Show the schema of the given trace plugin.",
"trace schema <plug-in>. Use the plug-in name "
"\"all\" to see all schemas.\n") {
- CommandArgumentData plugin_arg{eArgTypeNone, eArgRepeatPlain};
- m_arguments.push_back({plugin_arg});
+ AddSimpleArgumentList(eArgTypeNone);
}
~CommandObjectTraceSchema() override = default;
diff --git a/lldb/source/Commands/CommandObjectType.cpp b/lldb/source/Commands/CommandObjectType.cpp
index 036b8e9d9def..97489bdc2d9c 100644
--- a/lldb/source/Commands/CommandObjectType.cpp
+++ b/lldb/source/Commands/CommandObjectType.cpp
@@ -589,15 +589,7 @@ public:
: CommandObjectParsed(interpreter, "type format add",
"Add a new formatting style for a type.", nullptr),
m_format_options(eFormatInvalid) {
- CommandArgumentEntry type_arg;
- CommandArgumentData type_style_arg;
-
- type_style_arg.arg_type = eArgTypeName;
- type_style_arg.arg_repetition = eArgRepeatPlus;
-
- type_arg.push_back(type_style_arg);
-
- m_arguments.push_back(type_arg);
+ AddSimpleArgumentList(eArgTypeName, eArgRepeatPlus);
SetHelpLong(
R"(
@@ -784,15 +776,7 @@ public:
: CommandObjectParsed(interpreter,
FormatCategoryToString(formatter_kind, false)),
m_formatter_kind(formatter_kind) {
- CommandArgumentEntry type_arg;
- CommandArgumentData type_style_arg;
-
- type_style_arg.arg_type = eArgTypeName;
- type_style_arg.arg_repetition = eArgRepeatPlain;
-
- type_arg.push_back(type_style_arg);
-
- m_arguments.push_back(type_arg);
+ AddSimpleArgumentList(eArgTypeName);
const char *kind = FormatCategoryToString(formatter_kind, true);
const char *short_kind = FormatCategoryToString(formatter_kind, false);
@@ -929,8 +913,7 @@ public:
const char *name, const char *help)
: CommandObjectParsed(interpreter, name, help, nullptr),
m_formatter_kind(formatter_kind) {
- CommandArgumentData category_arg{eArgTypeName, eArgRepeatOptional};
- m_arguments.push_back({category_arg});
+ AddSimpleArgumentList(eArgTypeName, eArgRepeatOptional);
}
~CommandObjectTypeFormatterClear() override = default;
@@ -1045,15 +1028,7 @@ public:
CommandObjectTypeFormatterList(CommandInterpreter &interpreter,
const char *name, const char *help)
: CommandObjectParsed(interpreter, name, help, nullptr), m_options() {
- CommandArgumentEntry type_arg;
- CommandArgumentData type_style_arg;
-
- type_style_arg.arg_type = eArgTypeName;
- type_style_arg.arg_repetition = eArgRepeatOptional;
-
- type_arg.push_back(type_style_arg);
-
- m_arguments.push_back(type_arg);
+ AddSimpleArgumentList(eArgTypeName, eArgRepeatOptional);
}
~CommandObjectTypeFormatterList() override = default;
@@ -1445,15 +1420,7 @@ CommandObjectTypeSummaryAdd::CommandObjectTypeSummaryAdd(
: CommandObjectParsed(interpreter, "type summary add",
"Add a new summary style for a type.", nullptr),
IOHandlerDelegateMultiline("DONE"), m_options(interpreter) {
- CommandArgumentEntry type_arg;
- CommandArgumentData type_style_arg;
-
- type_style_arg.arg_type = eArgTypeName;
- type_style_arg.arg_repetition = eArgRepeatPlus;
-
- type_arg.push_back(type_style_arg);
-
- m_arguments.push_back(type_arg);
+ AddSimpleArgumentList(eArgTypeName, eArgRepeatPlus);
SetHelpLong(
R"(
@@ -1745,15 +1712,7 @@ public:
: CommandObjectParsed(interpreter, "type category define",
"Define a new category as a source of formatters.",
nullptr) {
- CommandArgumentEntry type_arg;
- CommandArgumentData type_style_arg;
-
- type_style_arg.arg_type = eArgTypeName;
- type_style_arg.arg_repetition = eArgRepeatPlus;
-
- type_arg.push_back(type_style_arg);
-
- m_arguments.push_back(type_arg);
+ AddSimpleArgumentList(eArgTypeName, eArgRepeatPlus);
}
~CommandObjectTypeCategoryDefine() override = default;
@@ -1838,15 +1797,7 @@ public:
: CommandObjectParsed(interpreter, "type category enable",
"Enable a category as a source of formatters.",
nullptr) {
- CommandArgumentEntry type_arg;
- CommandArgumentData type_style_arg;
-
- type_style_arg.arg_type = eArgTypeName;
- type_style_arg.arg_repetition = eArgRepeatPlus;
-
- type_arg.push_back(type_style_arg);
-
- m_arguments.push_back(type_arg);
+ AddSimpleArgumentList(eArgTypeName, eArgRepeatPlus);
}
~CommandObjectTypeCategoryEnable() override = default;
@@ -1897,15 +1848,7 @@ public:
: CommandObjectParsed(interpreter, "type category delete",
"Delete a category and all associated formatters.",
nullptr) {
- CommandArgumentEntry type_arg;
- CommandArgumentData type_style_arg;
-
- type_style_arg.arg_type = eArgTypeName;
- type_style_arg.arg_repetition = eArgRepeatPlus;
-
- type_arg.push_back(type_style_arg);
-
- m_arguments.push_back(type_arg);
+ AddSimpleArgumentList(eArgTypeName, eArgRepeatPlus);
}
~CommandObjectTypeCategoryDelete() override = default;
@@ -1996,15 +1939,7 @@ public:
: CommandObjectParsed(interpreter, "type category disable",
"Disable a category as a source of formatters.",
nullptr) {
- CommandArgumentEntry type_arg;
- CommandArgumentData type_style_arg;
-
- type_style_arg.arg_type = eArgTypeName;
- type_style_arg.arg_repetition = eArgRepeatPlus;
-
- type_arg.push_back(type_style_arg);
-
- m_arguments.push_back(type_arg);
+ AddSimpleArgumentList(eArgTypeName, eArgRepeatPlus);
}
~CommandObjectTypeCategoryDisable() override = default;
@@ -2050,15 +1985,7 @@ public:
: CommandObjectParsed(interpreter, "type category list",
"Provide a list of all existing categories.",
nullptr) {
- CommandArgumentEntry type_arg;
- CommandArgumentData type_style_arg;
-
- type_style_arg.arg_type = eArgTypeName;
- type_style_arg.arg_repetition = eArgRepeatOptional;
-
- type_arg.push_back(type_style_arg);
-
- m_arguments.push_back(type_arg);
+ AddSimpleArgumentList(eArgTypeName, eArgRepeatOptional);
}
~CommandObjectTypeCategoryList() override = default;
@@ -2271,15 +2198,7 @@ CommandObjectTypeSynthAdd::CommandObjectTypeSynthAdd(
: CommandObjectParsed(interpreter, "type synthetic add",
"Add a new synthetic provider for a type.", nullptr),
IOHandlerDelegateMultiline("DONE"), m_options() {
- CommandArgumentEntry type_arg;
- CommandArgumentData type_style_arg;
-
- type_style_arg.arg_type = eArgTypeName;
- type_style_arg.arg_repetition = eArgRepeatPlus;
-
- type_arg.push_back(type_style_arg);
-
- m_arguments.push_back(type_arg);
+ AddSimpleArgumentList(eArgTypeName, eArgRepeatPlus);
}
bool CommandObjectTypeSynthAdd::AddSynth(ConstString type_name,
@@ -2476,15 +2395,7 @@ public:
CommandObjectTypeFilterAdd(CommandInterpreter &interpreter)
: CommandObjectParsed(interpreter, "type filter add",
"Add a new filter for a type.", nullptr) {
- CommandArgumentEntry type_arg;
- CommandArgumentData type_style_arg;
-
- type_style_arg.arg_type = eArgTypeName;
- type_style_arg.arg_repetition = eArgRepeatPlus;
-
- type_arg.push_back(type_style_arg);
-
- m_arguments.push_back(type_arg);
+ AddSimpleArgumentList(eArgTypeName, eArgRepeatPlus);
SetHelpLong(
R"(
diff --git a/lldb/source/Commands/CommandObjectWatchpoint.cpp b/lldb/source/Commands/CommandObjectWatchpoint.cpp
index 5b74b1ae43ac..f123211e7237 100644
--- a/lldb/source/Commands/CommandObjectWatchpoint.cpp
+++ b/lldb/source/Commands/CommandObjectWatchpoint.cpp
@@ -153,12 +153,7 @@ public:
interpreter, "watchpoint list",
"List all watchpoints at configurable levels of detail.", nullptr,
eCommandRequiresTarget) {
- CommandArgumentEntry arg;
- CommandObject::AddIDsArgumentData(arg, eArgTypeWatchpointID,
- eArgTypeWatchpointIDRange);
- // Add the entry for the first argument for this command to the object's
- // arguments vector.
- m_arguments.push_back(arg);
+ CommandObject::AddIDsArgumentData(eWatchpointArgs);
}
~CommandObjectWatchpointList() override = default;
@@ -276,12 +271,7 @@ public:
"Enable the specified disabled watchpoint(s). If "
"no watchpoints are specified, enable all of them.",
nullptr, eCommandRequiresTarget) {
- CommandArgumentEntry arg;
- CommandObject::AddIDsArgumentData(arg, eArgTypeWatchpointID,
- eArgTypeWatchpointIDRange);
- // Add the entry for the first argument for this command to the object's
- // arguments vector.
- m_arguments.push_back(arg);
+ CommandObject::AddIDsArgumentData(eWatchpointArgs);
}
~CommandObjectWatchpointEnable() override = default;
@@ -350,12 +340,7 @@ public:
"removing it/them. If no watchpoints are "
"specified, disable them all.",
nullptr, eCommandRequiresTarget) {
- CommandArgumentEntry arg;
- CommandObject::AddIDsArgumentData(arg, eArgTypeWatchpointID,
- eArgTypeWatchpointIDRange);
- // Add the entry for the first argument for this command to the object's
- // arguments vector.
- m_arguments.push_back(arg);
+ CommandObject::AddIDsArgumentData(eWatchpointArgs);
}
~CommandObjectWatchpointDisable() override = default;
@@ -429,12 +414,7 @@ public:
"Delete the specified watchpoint(s). If no "
"watchpoints are specified, delete them all.",
nullptr, eCommandRequiresTarget) {
- CommandArgumentEntry arg;
- CommandObject::AddIDsArgumentData(arg, eArgTypeWatchpointID,
- eArgTypeWatchpointIDRange);
- // Add the entry for the first argument for this command to the object's
- // arguments vector.
- m_arguments.push_back(arg);
+ CommandObject::AddIDsArgumentData(eWatchpointArgs);
}
~CommandObjectWatchpointDelete() override = default;
@@ -550,12 +530,7 @@ public:
"Set ignore count on the specified watchpoint(s). "
"If no watchpoints are specified, set them all.",
nullptr, eCommandRequiresTarget) {
- CommandArgumentEntry arg;
- CommandObject::AddIDsArgumentData(arg, eArgTypeWatchpointID,
- eArgTypeWatchpointIDRange);
- // Add the entry for the first argument for this command to the object's
- // arguments vector.
- m_arguments.push_back(arg);
+ CommandObject::AddIDsArgumentData(eWatchpointArgs);
}
~CommandObjectWatchpointIgnore() override = default;
@@ -673,12 +648,7 @@ public:
"watchpoint. "
"Passing an empty argument clears the modification.",
nullptr, eCommandRequiresTarget) {
- CommandArgumentEntry arg;
- CommandObject::AddIDsArgumentData(arg, eArgTypeWatchpointID,
- eArgTypeWatchpointIDRange);
- // Add the entry for the first argument for this command to the object's
- // arguments vector.
- m_arguments.push_back(arg);
+ CommandObject::AddIDsArgumentData(eWatchpointArgs);
}
~CommandObjectWatchpointModify() override = default;
@@ -811,18 +781,7 @@ Examples:
" Watches my_global_var for read/write access, with the region to watch \
corresponding to the byte size of the data type.");
- CommandArgumentEntry arg;
- CommandArgumentData var_name_arg;
-
- // Define the only variant of this arg.
- var_name_arg.arg_type = eArgTypeVarName;
- var_name_arg.arg_repetition = eArgRepeatPlain;
-
- // Push the variant into the argument entry.
- arg.push_back(var_name_arg);
-
- // Push the data for the only argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeVarName);
// Absorb the '-w' and '-s' options into our option group.
m_option_group.Append(&m_option_watchpoint, LLDB_OPT_SET_1, LLDB_OPT_SET_1);
@@ -1009,18 +968,7 @@ Examples:
Watches write access for the 1-byte region pointed to by the address 'foo + 32')");
- CommandArgumentEntry arg;
- CommandArgumentData expression_arg;
-
- // Define the only variant of this arg.
- expression_arg.arg_type = eArgTypeExpression;
- expression_arg.arg_repetition = eArgRepeatPlain;
-
- // Push the only variant into the argument entry.
- arg.push_back(expression_arg);
-
- // Push the data for the only argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeExpression);
// Absorb the '-w' and '-s' options into our option group.
m_option_group.Append(&m_option_watchpoint, LLDB_OPT_SET_ALL,
diff --git a/lldb/source/Commands/CommandObjectWatchpointCommand.cpp b/lldb/source/Commands/CommandObjectWatchpointCommand.cpp
index b1629ceab270..aaf14540cb28 100644
--- a/lldb/source/Commands/CommandObjectWatchpointCommand.cpp
+++ b/lldb/source/Commands/CommandObjectWatchpointCommand.cpp
@@ -162,19 +162,7 @@ initialized:"
"Final Note: A warning that no watchpoint command was generated when there \
are no syntax errors may indicate that a function was declared but never called.");
- CommandArgumentEntry arg;
- CommandArgumentData wp_id_arg;
-
- // Define the first (and only) variant of this arg.
- wp_id_arg.arg_type = eArgTypeWatchpointID;
- wp_id_arg.arg_repetition = eArgRepeatPlain;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(wp_id_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeWatchpointID);
}
~CommandObjectWatchpointCommandAdd() override = default;
@@ -455,19 +443,7 @@ public:
: CommandObjectParsed(interpreter, "delete",
"Delete the set of commands from a watchpoint.",
nullptr, eCommandRequiresTarget) {
- CommandArgumentEntry arg;
- CommandArgumentData wp_id_arg;
-
- // Define the first (and only) variant of this arg.
- wp_id_arg.arg_type = eArgTypeWatchpointID;
- wp_id_arg.arg_repetition = eArgRepeatPlain;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(wp_id_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeWatchpointID);
}
~CommandObjectWatchpointCommandDelete() override = default;
@@ -522,19 +498,7 @@ public:
"List the script or set of commands to be executed "
"when the watchpoint is hit.",
nullptr, eCommandRequiresTarget) {
- CommandArgumentEntry arg;
- CommandArgumentData wp_id_arg;
-
- // Define the first (and only) variant of this arg.
- wp_id_arg.arg_type = eArgTypeWatchpointID;
- wp_id_arg.arg_repetition = eArgRepeatPlain;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(wp_id_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeWatchpointID);
}
~CommandObjectWatchpointCommandList() override = default;
diff --git a/lldb/source/Host/common/Editline.cpp b/lldb/source/Host/common/Editline.cpp
index ce707e530d00..e66271e8a6ee 100644
--- a/lldb/source/Host/common/Editline.cpp
+++ b/lldb/source/Host/common/Editline.cpp
@@ -1029,8 +1029,11 @@ unsigned char Editline::TabCommand(int ch) {
case CompletionMode::Normal: {
std::string to_add = completion.GetCompletion();
// Terminate the current argument with a quote if it started with a quote.
- if (!request.GetParsedLine().empty() && request.GetParsedArg().IsQuoted())
+ Args &parsedLine = request.GetParsedLine();
+ if (!parsedLine.empty() && request.GetCursorIndex() < parsedLine.size() &&
+ request.GetParsedArg().IsQuoted()) {
to_add.push_back(request.GetParsedArg().GetQuoteChar());
+ }
to_add.push_back(' ');
el_deletestr(m_editline, request.GetCursorArgumentPrefix().size());
el_insertstr(m_editline, to_add.c_str());
diff --git a/lldb/source/Interpreter/CommandObject.cpp b/lldb/source/Interpreter/CommandObject.cpp
index 93c53e89f7d1..4634b75c6a33 100644
--- a/lldb/source/Interpreter/CommandObject.cpp
+++ b/lldb/source/Interpreter/CommandObject.cpp
@@ -392,6 +392,24 @@ bool CommandObject::ParseOptionsAndNotify(Args &args,
return true;
}
+void CommandObject::AddSimpleArgumentList(
+ CommandArgumentType arg_type, ArgumentRepetitionType repetition_type) {
+
+ CommandArgumentEntry arg_entry;
+ CommandArgumentData simple_arg;
+
+ // Define the first (and only) variant of this arg.
+ simple_arg.arg_type = arg_type;
+ simple_arg.arg_repetition = repetition_type;
+
+ // There is only one variant this argument could be; put it into the argument
+ // entry.
+ arg_entry.push_back(simple_arg);
+
+ // Push the data for the first argument into the m_arguments vector.
+ m_arguments.push_back(arg_entry);
+}
+
int CommandObject::GetNumArgumentEntries() { return m_arguments.size(); }
CommandObject::CommandArgumentEntry *
@@ -694,20 +712,24 @@ void CommandObject::GenerateHelpText(Stream &output_strm) {
}
}
-void CommandObject::AddIDsArgumentData(CommandArgumentEntry &arg,
- CommandArgumentType ID,
- CommandArgumentType IDRange) {
+void CommandObject::AddIDsArgumentData(CommandObject::IDType type) {
+ CommandArgumentEntry arg;
CommandArgumentData id_arg;
CommandArgumentData id_range_arg;
// Create the first variant for the first (and only) argument for this
// command.
- id_arg.arg_type = ID;
+ switch (type) {
+ case eBreakpointArgs:
+ id_arg.arg_type = eArgTypeBreakpointID;
+ id_range_arg.arg_type = eArgTypeBreakpointIDRange;
+ break;
+ case eWatchpointArgs:
+ id_arg.arg_type = eArgTypeWatchpointID;
+ id_range_arg.arg_type = eArgTypeWatchpointIDRange;
+ break;
+ }
id_arg.arg_repetition = eArgRepeatOptional;
-
- // Create the second variant for the first (and only) argument for this
- // command.
- id_range_arg.arg_type = IDRange;
id_range_arg.arg_repetition = eArgRepeatOptional;
// The first (and only) argument for this command could be either an id or an
@@ -715,6 +737,7 @@ void CommandObject::AddIDsArgumentData(CommandArgumentEntry &arg,
// this command.
arg.push_back(id_arg);
arg.push_back(id_range_arg);
+ m_arguments.push_back(arg);
}
const char *CommandObject::GetArgumentTypeAsCString(
diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp
index 47b1db16f1e9..7af768aad0bc 100644
--- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp
+++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp
@@ -419,19 +419,7 @@ public:
: CommandObjectParsed(
interpreter, "demangle", "Demangle a C++ mangled name.",
"language cplusplus demangle [<mangled-name> ...]") {
- CommandArgumentEntry arg;
- CommandArgumentData index_arg;
-
- // Define the first (and only) variant of this arg.
- index_arg.arg_type = eArgTypeSymbol;
- index_arg.arg_repetition = eArgRepeatPlus;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(index_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeSymbol, eArgRepeatPlus);
}
~CommandObjectMultiwordItaniumABI_Demangle() override = default;
diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp
index f380d6e7e672..3e5ee6f66373 100644
--- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp
+++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp
@@ -897,19 +897,7 @@ public:
eCommandProcessMustBeLaunched |
eCommandProcessMustBePaused),
m_options() {
- CommandArgumentEntry arg;
- CommandArgumentData index_arg;
-
- // Define the first (and only) variant of this arg.
- index_arg.arg_type = eArgTypeRegularExpression;
- index_arg.arg_repetition = eArgRepeatOptional;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(index_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeRegularExpression, eArgRepeatOptional);
}
~CommandObjectObjC_ClassTable_Dump() override = default;
@@ -1015,19 +1003,7 @@ public:
"language objc tagged-pointer info",
eCommandRequiresProcess | eCommandProcessMustBeLaunched |
eCommandProcessMustBePaused) {
- CommandArgumentEntry arg;
- CommandArgumentData index_arg;
-
- // Define the first (and only) variant of this arg.
- index_arg.arg_type = eArgTypeAddress;
- index_arg.arg_repetition = eArgRepeatPlus;
-
- // There is only one variant this argument could be; put it into the
- // argument entry.
- arg.push_back(index_arg);
-
- // Push the data for the first argument into the m_arguments vector.
- m_arguments.push_back(arg);
+ AddSimpleArgumentList(eArgTypeAddress, eArgRepeatPlus);
}
~CommandObjectMultiwordObjC_TaggedPointer_Info() override = default;
diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp
index 6f8aa2622899..1f6116967a4f 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp
@@ -17,6 +17,7 @@
#include "lldb/Core/ModuleSpec.h"
#include "lldb/Host/HostInfo.h"
+#include "lldb/Host/SafeMachO.h"
#include "lldb/Host/XML.h"
#include "lldb/Symbol/Symbol.h"
#include "lldb/Target/MemoryRegionInfo.h"
@@ -2147,8 +2148,20 @@ bool GDBRemoteCommunicationClient::GetCurrentProcessInfo(bool allow_lazy) {
if (!value.getAsInteger(16, cpu))
++num_keys_decoded;
} else if (name.equals("cpusubtype")) {
- if (!value.getAsInteger(16, sub))
+ if (!value.getAsInteger(16, sub)) {
++num_keys_decoded;
+ // Workaround for pre-2024 Apple debugserver, which always
+ // returns arm64e on arm64e-capable hardware regardless of
+ // what the process is. This can be deleted at some point
+ // in the future.
+ if (cpu == llvm::MachO::CPU_TYPE_ARM64 &&
+ sub == llvm::MachO::CPU_SUBTYPE_ARM64E) {
+ if (GetGDBServerVersion())
+ if (m_gdb_server_version >= 1000 &&
+ m_gdb_server_version <= 1504)
+ sub = 0;
+ }
+ }
} else if (name.equals("triple")) {
StringExtractor extractor(value);
extractor.GetHexByteString(triple);
diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
index 3dc40ee6bb9f..51ceb12f1a57 100644
--- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
@@ -5354,8 +5354,7 @@ public:
interpreter, "process plugin packet xfer-size",
"Maximum size that lldb will try to read/write one one chunk.",
nullptr) {
- CommandArgumentData max_arg{eArgTypeUnsignedInteger, eArgRepeatPlain};
- m_arguments.push_back({max_arg});
+ AddSimpleArgumentList(eArgTypeUnsignedInteger);
}
~CommandObjectProcessGDBRemotePacketXferSize() override = default;
@@ -5397,8 +5396,7 @@ public:
"be added to the packet prior to sending and "
"stripped from the result.",
nullptr) {
- CommandArgumentData packet_arg{eArgTypeNone, eArgRepeatStar};
- m_arguments.push_back({packet_arg});
+ AddSimpleArgumentList(eArgTypeNone, eArgRepeatStar);
}
~CommandObjectProcessGDBRemotePacketSend() override = default;
diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp
index e17bfcb5d5e2..e982a30a3ae4 100644
--- a/lldb/source/Target/Target.cpp
+++ b/lldb/source/Target/Target.cpp
@@ -1568,14 +1568,8 @@ bool Target::SetArchitecture(const ArchSpec &arch_spec, bool set_platform,
if (m_arch.GetSpec().IsCompatibleMatch(other)) {
compatible_local_arch = true;
- bool arch_changed, vendor_changed, os_changed, os_ver_changed,
- env_changed;
- m_arch.GetSpec().PiecewiseTripleCompare(other, arch_changed,
- vendor_changed, os_changed,
- os_ver_changed, env_changed);
-
- if (!arch_changed && !vendor_changed && !os_changed && !env_changed)
+ if (m_arch.GetSpec().GetTriple() == other.GetTriple())
replace_local_arch = false;
}
}
diff --git a/lldb/source/Utility/ArchSpec.cpp b/lldb/source/Utility/ArchSpec.cpp
index fb0e985a0d56..07ef435ef451 100644
--- a/lldb/source/Utility/ArchSpec.cpp
+++ b/lldb/source/Utility/ArchSpec.cpp
@@ -1421,23 +1421,6 @@ bool ArchSpec::IsFullySpecifiedTriple() const {
return true;
}
-void ArchSpec::PiecewiseTripleCompare(
- const ArchSpec &other, bool &arch_different, bool &vendor_different,
- bool &os_different, bool &os_version_different, bool &env_different) const {
- const llvm::Triple &me(GetTriple());
- const llvm::Triple &them(other.GetTriple());
-
- arch_different = (me.getArch() != them.getArch());
-
- vendor_different = (me.getVendor() != them.getVendor());
-
- os_different = (me.getOS() != them.getOS());
-
- os_version_different = (me.getOSMajorVersion() != them.getOSMajorVersion());
-
- env_different = (me.getEnvironment() != them.getEnvironment());
-}
-
bool ArchSpec::IsAlwaysThumbInstructions() const {
std::string Status;
if (GetTriple().getArch() == llvm::Triple::arm ||
diff --git a/lldb/test/API/macosx/arm64e-attach/Makefile b/lldb/test/API/macosx/arm64e-attach/Makefile
new file mode 100644
index 000000000000..b24dccd7c02a
--- /dev/null
+++ b/lldb/test/API/macosx/arm64e-attach/Makefile
@@ -0,0 +1,7 @@
+C_SOURCES := main.c
+
+# Uncomment this for local debugging.
+#all:
+# xcrun clang -g $(SRCDIR)/main.c -o a.out -target arm64e-apple-macosx
+
+include Makefile.rules
diff --git a/lldb/test/API/macosx/arm64e-attach/TestArm64eAttach.py b/lldb/test/API/macosx/arm64e-attach/TestArm64eAttach.py
new file mode 100644
index 000000000000..3ec977b507b8
--- /dev/null
+++ b/lldb/test/API/macosx/arm64e-attach/TestArm64eAttach.py
@@ -0,0 +1,35 @@
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+
+
+class TestArm64eAttach(TestBase):
+ NO_DEBUG_INFO_TESTCASE = True
+
+ # On Darwin systems, arch arm64e means ARMv8.3 with ptrauth ABI used.
+ @skipIf(archs=no_match(["arm64e"]))
+ def test(self):
+ # Skip this test if not running on AArch64 target that supports PAC
+ if not self.isAArch64PAuth():
+ self.skipTest("Target must support pointer authentication.")
+
+ self.build()
+ popen = self.spawnSubprocess(self.getBuildArtifact(), [])
+
+ # This simulates how Xcode attaches to a process by pid/name.
+ error = lldb.SBError()
+ target = self.dbg.CreateTarget("", "arm64", "", True, error)
+ listener = lldb.SBListener("my.attach.listener")
+ process = target.AttachToProcessWithID(listener, popen.pid, error)
+ self.assertSuccess(error)
+ self.assertTrue(process, PROCESS_IS_VALID)
+ self.assertEqual(target.GetTriple().split('-')[0], "arm64e",
+ "target triple is updated correctly")
+
+ self.expect('process plugin packet send qProcessInfo',
+ "debugserver returns correct triple",
+ substrs=['cputype:100000c', 'cpusubtype:2', 'ptrsize:8'])
+
+ error = process.Kill()
+ self.assertSuccess(error)
diff --git a/lldb/test/API/macosx/arm64e-attach/main.c b/lldb/test/API/macosx/arm64e-attach/main.c
new file mode 100644
index 000000000000..7baf2ffd8f28
--- /dev/null
+++ b/lldb/test/API/macosx/arm64e-attach/main.c
@@ -0,0 +1,2 @@
+int getchar();
+int main() { return getchar(); }
diff --git a/lldb/test/API/repl/clang/TestClangREPL.py b/lldb/test/API/repl/clang/TestClangREPL.py
index 0b67955a7833..c37557fb9473 100644
--- a/lldb/test/API/repl/clang/TestClangREPL.py
+++ b/lldb/test/API/repl/clang/TestClangREPL.py
@@ -1,7 +1,6 @@
-import lldb
from lldbsuite.test.decorators import *
-from lldbsuite.test.lldbtest import *
from lldbsuite.test.lldbpexpect import PExpectTest
+from lldbsuite.test.lldbtest import *
class TestCase(PExpectTest):
@@ -17,13 +16,7 @@ class TestCase(PExpectTest):
self.current_repl_line_number += 1
self.child.expect_exact(str(self.current_repl_line_number) + ">")
- # PExpect uses many timeouts internally and doesn't play well
- # under ASAN on a loaded machine..
- @skipIfAsan
- @skipIf(oslist=["linux"], archs=["arm", "aarch64"]) # Randomly fails on buildbot
- @skipIfEditlineSupportMissing
- def test_basic_completion(self):
- """Test that we can complete a simple multiline expression"""
+ def start_repl(self):
self.build()
self.current_repl_line_number = 1
@@ -41,6 +34,14 @@ class TestCase(PExpectTest):
self.child.send("expression --repl -l c --\n")
self.child.expect_exact("1>")
+ # PExpect uses many timeouts internally and doesn't play well
+ # under ASAN on a loaded machine..
+ @skipIfAsan
+ @skipIf(oslist=["linux"], archs=["arm", "aarch64"]) # Randomly fails on buildbot
+ @skipIfEditlineSupportMissing
+ def test_basic_completion(self):
+ """Test that we can complete a simple multiline expression"""
+ self.start_repl()
# Try evaluating a simple expression.
self.expect_repl("3 + 3", substrs=["(int) $0 = 6"])
@@ -54,3 +55,16 @@ class TestCase(PExpectTest):
self.expect_repl("$persistent + 10", substrs=["(long) $2 = 17"])
self.quit()
+
+ # PExpect uses many timeouts internally and doesn't play well
+ # under ASAN on a loaded machine..
+ @skipIfAsan
+ @skipIf(oslist=["linux"], archs=["arm", "aarch64"]) # Randomly fails on buildbot
+ @skipIfEditlineSupportMissing
+ def test_completion_with_space_only_line(self):
+ """Test that we don't crash when completing lines with spaces only"""
+ self.start_repl()
+
+ self.child.send(" ")
+ self.child.send("\t")
+ self.expect_repl("3 + 3", substrs=["(int) $0 = 6"])
diff --git a/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py b/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py
index 8dcda75d0a45..226b9385fe71 100644
--- a/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py
+++ b/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py
@@ -1,5 +1,4 @@
import os
-import unittest
import dap_server
import lldbdap_testcase
@@ -7,7 +6,6 @@ from lldbsuite.test import lldbtest, lldbutil
from lldbsuite.test.decorators import *
-@unittest.skip("https://llvm.org/PR81686")
class TestDAP_commands(lldbdap_testcase.DAPTestCaseBase):
def test_command_directive_quiet_on_success(self):
program = self.getBuildArtifact("a.out")
diff --git a/lldb/test/API/tools/lldb-server/TestNonStop.py b/lldb/test/API/tools/lldb-server/TestNonStop.py
index b2d136147722..62bda48ee049 100644
--- a/lldb/test/API/tools/lldb-server/TestNonStop.py
+++ b/lldb/test/API/tools/lldb-server/TestNonStop.py
@@ -220,15 +220,15 @@ class LldbGdbServerTestCase(gdbremote_testcase.GdbRemoteTestCaseBase):
self.expect_gdbremote_sequence()
@add_test_categories(["llgs"])
- def test_multiple_C(self):
+ def test_multiple_C_continue_with_signal(self):
self.multiple_resume_test("C05")
@add_test_categories(["llgs"])
- def test_multiple_c(self):
+ def test_multiple_c_continue_with_addr(self):
self.multiple_resume_test("c")
@add_test_categories(["llgs"])
- def test_multiple_s(self):
+ def test_multiple_s_single_step_with_addr(self):
self.multiple_resume_test("s")
@skipIfWindows
diff --git a/lldb/tools/debugserver/source/CMakeLists.txt b/lldb/tools/debugserver/source/CMakeLists.txt
index f0b9756becab..1a433898f6aa 100644
--- a/lldb/tools/debugserver/source/CMakeLists.txt
+++ b/lldb/tools/debugserver/source/CMakeLists.txt
@@ -117,10 +117,6 @@ get_debugserver_codesign_identity(debugserver_codesign_identity)
# Override locally, so the identity is used for targets created in this scope.
set(LLVM_CODESIGNING_IDENTITY ${debugserver_codesign_identity})
-# Use the same identity later in the test suite.
-set_property(GLOBAL PROPERTY
- LLDB_DEBUGSERVER_CODESIGN_IDENTITY ${debugserver_codesign_identity})
-
if(APPLE)
set(LIBCOMPRESSION compression)
if(APPLE_EMBEDDED)
diff --git a/lldb/tools/debugserver/source/DNB.cpp b/lldb/tools/debugserver/source/DNB.cpp
index 0ec50df42d1f..1ac9a8040c61 100644
--- a/lldb/tools/debugserver/source/DNB.cpp
+++ b/lldb/tools/debugserver/source/DNB.cpp
@@ -1062,6 +1062,14 @@ DNBGetTSDAddressForThread(nub_process_t pid, nub_thread_t tid,
return INVALID_NUB_ADDRESS;
}
+std::optional<std::pair<cpu_type_t, cpu_subtype_t>>
+DNBGetMainBinaryCPUTypes(nub_process_t pid) {
+ MachProcessSP procSP;
+ if (GetProcessSP(pid, procSP))
+ return procSP->GetMainBinaryCPUTypes(pid);
+ return {};
+}
+
JSONGenerator::ObjectSP
DNBGetAllLoadedLibrariesInfos(nub_process_t pid, bool report_load_commands) {
MachProcessSP procSP;
diff --git a/lldb/tools/debugserver/source/DNB.h b/lldb/tools/debugserver/source/DNB.h
index 97de83ef9ff8..10d1f6879435 100644
--- a/lldb/tools/debugserver/source/DNB.h
+++ b/lldb/tools/debugserver/source/DNB.h
@@ -210,6 +210,8 @@ DNBGetTSDAddressForThread(nub_process_t pid, nub_thread_t tid,
uint64_t plo_pthread_tsd_base_address_offset,
uint64_t plo_pthread_tsd_base_offset,
uint64_t plo_pthread_tsd_entry_size);
+std::optional<std::pair<cpu_type_t, cpu_subtype_t>>
+DNBGetMainBinaryCPUTypes(nub_process_t pid);
JSONGenerator::ObjectSP
DNBGetAllLoadedLibrariesInfos(nub_process_t pid, bool report_load_commands);
JSONGenerator::ObjectSP
diff --git a/lldb/tools/debugserver/source/MacOSX/MachProcess.h b/lldb/tools/debugserver/source/MacOSX/MachProcess.h
index 8432bdb36c0c..db673693a1b2 100644
--- a/lldb/tools/debugserver/source/MacOSX/MachProcess.h
+++ b/lldb/tools/debugserver/source/MacOSX/MachProcess.h
@@ -103,6 +103,8 @@ public:
const char *stdin_path, const char *stdout_path, const char *stderr_path,
bool no_stdio, MachProcess *process, int disable_aslr, DNBError &err);
nub_addr_t GetDYLDAllImageInfosAddress();
+ std::optional<std::pair<cpu_type_t, cpu_subtype_t>>
+ GetMainBinaryCPUTypes(nub_process_t pid);
static const void *PrepareForAttach(const char *path,
nub_launch_flavor_t launch_flavor,
bool waitfor, DNBError &err_str);
diff --git a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm
index 3a4dfb9ea6ea..87bdbf835bfd 100644
--- a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm
+++ b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm
@@ -1111,6 +1111,23 @@ MachProcess::GetAllLoadedLibrariesInfos(nub_process_t pid,
return FormatDynamicLibrariesIntoJSON(image_infos, report_load_commands);
}
+std::optional<std::pair<cpu_type_t, cpu_subtype_t>>
+MachProcess::GetMainBinaryCPUTypes(nub_process_t pid) {
+ int pointer_size = GetInferiorAddrSize(pid);
+ std::vector<struct binary_image_information> image_infos;
+ GetAllLoadedBinariesViaDYLDSPI(image_infos);
+ uint32_t platform = GetPlatform();
+ for (auto &image_info : image_infos)
+ if (GetMachOInformationFromMemory(platform, image_info.load_address,
+ pointer_size, image_info.macho_info))
+ if (image_info.macho_info.mach_header.filetype == MH_EXECUTE)
+ return {
+ {static_cast<cpu_type_t>(image_info.macho_info.mach_header.cputype),
+ static_cast<cpu_subtype_t>(
+ image_info.macho_info.mach_header.cpusubtype)}};
+ return {};
+}
+
// Fetch information about the shared libraries at the given load addresses
// using the
// dyld SPIs that exist in macOS 10.12, iOS 10, tvOS 10, watchOS 3 and newer.
diff --git a/lldb/tools/debugserver/source/RNBRemote.cpp b/lldb/tools/debugserver/source/RNBRemote.cpp
index feea4c914ec5..03d427d3fc59 100644
--- a/lldb/tools/debugserver/source/RNBRemote.cpp
+++ b/lldb/tools/debugserver/source/RNBRemote.cpp
@@ -6195,59 +6195,14 @@ rnb_err_t RNBRemote::HandlePacket_qSymbol(const char *command) {
}
}
-// Note that all numeric values returned by qProcessInfo are hex encoded,
-// including the pid and the cpu type.
-
-rnb_err_t RNBRemote::HandlePacket_qProcessInfo(const char *p) {
- nub_process_t pid;
- std::ostringstream rep;
-
- // If we haven't run the process yet, return an error.
- if (!m_ctx.HasValidProcessID())
- return SendPacket("E68");
-
- pid = m_ctx.ProcessID();
-
- rep << "pid:" << std::hex << pid << ';';
-
- int procpid_mib[4];
- procpid_mib[0] = CTL_KERN;
- procpid_mib[1] = KERN_PROC;
- procpid_mib[2] = KERN_PROC_PID;
- procpid_mib[3] = pid;
- struct kinfo_proc proc_kinfo;
- size_t proc_kinfo_size = sizeof(struct kinfo_proc);
-
- if (::sysctl(procpid_mib, 4, &proc_kinfo, &proc_kinfo_size, NULL, 0) == 0) {
- if (proc_kinfo_size > 0) {
- rep << "parent-pid:" << std::hex << proc_kinfo.kp_eproc.e_ppid << ';';
- rep << "real-uid:" << std::hex << proc_kinfo.kp_eproc.e_pcred.p_ruid
- << ';';
- rep << "real-gid:" << std::hex << proc_kinfo.kp_eproc.e_pcred.p_rgid
- << ';';
- rep << "effective-uid:" << std::hex << proc_kinfo.kp_eproc.e_ucred.cr_uid
- << ';';
- if (proc_kinfo.kp_eproc.e_ucred.cr_ngroups > 0)
- rep << "effective-gid:" << std::hex
- << proc_kinfo.kp_eproc.e_ucred.cr_groups[0] << ';';
- }
- }
-
+static std::pair<cpu_type_t, cpu_subtype_t>
+GetCPUTypesFromHost(nub_process_t pid) {
cpu_type_t cputype = DNBProcessGetCPUType(pid);
if (cputype == 0) {
DNBLog("Unable to get the process cpu_type, making a best guess.");
cputype = best_guess_cpu_type();
}
- uint32_t addr_size = 0;
- if (cputype != 0) {
- rep << "cputype:" << std::hex << cputype << ";";
- if (cputype & CPU_ARCH_ABI64)
- addr_size = 8;
- else
- addr_size = 4;
- }
-
bool host_cpu_is_64bit = false;
uint32_t is64bit_capable;
size_t is64bit_capable_len = sizeof(is64bit_capable);
@@ -6288,14 +6243,69 @@ rnb_err_t RNBRemote::HandlePacket_qProcessInfo(const char *p) {
if (cputype == CPU_TYPE_ARM64_32 && cpusubtype == 2)
cpusubtype = CPU_SUBTYPE_ARM64_32_V8;
#endif
+ }
+
+ return {cputype, cpusubtype};
+}
+
+// Note that all numeric values returned by qProcessInfo are hex encoded,
+// including the pid and the cpu type.
+rnb_err_t RNBRemote::HandlePacket_qProcessInfo(const char *p) {
+ nub_process_t pid;
+ std::ostringstream rep;
+
+ // If we haven't run the process yet, return an error.
+ if (!m_ctx.HasValidProcessID())
+ return SendPacket("E68");
+
+ pid = m_ctx.ProcessID();
+
+ rep << "pid:" << std::hex << pid << ';';
+
+ int procpid_mib[4];
+ procpid_mib[0] = CTL_KERN;
+ procpid_mib[1] = KERN_PROC;
+ procpid_mib[2] = KERN_PROC_PID;
+ procpid_mib[3] = pid;
+ struct kinfo_proc proc_kinfo;
+ size_t proc_kinfo_size = sizeof(struct kinfo_proc);
+
+ if (::sysctl(procpid_mib, 4, &proc_kinfo, &proc_kinfo_size, NULL, 0) == 0) {
+ if (proc_kinfo_size > 0) {
+ rep << "parent-pid:" << std::hex << proc_kinfo.kp_eproc.e_ppid << ';';
+ rep << "real-uid:" << std::hex << proc_kinfo.kp_eproc.e_pcred.p_ruid
+ << ';';
+ rep << "real-gid:" << std::hex << proc_kinfo.kp_eproc.e_pcred.p_rgid
+ << ';';
+ rep << "effective-uid:" << std::hex << proc_kinfo.kp_eproc.e_ucred.cr_uid
+ << ';';
+ if (proc_kinfo.kp_eproc.e_ucred.cr_ngroups > 0)
+ rep << "effective-gid:" << std::hex
+ << proc_kinfo.kp_eproc.e_ucred.cr_groups[0] << ';';
+ }
+ }
+
+ cpu_type_t cputype;
+ cpu_subtype_t cpusubtype;
+ if (auto cputypes = DNBGetMainBinaryCPUTypes(pid))
+ std::tie(cputype, cpusubtype) = *cputypes;
+ else
+ std::tie(cputype, cpusubtype) = GetCPUTypesFromHost(pid);
+
+ uint32_t addr_size = 0;
+ if (cputype != 0) {
+ rep << "cputype:" << std::hex << cputype << ";";
rep << "cpusubtype:" << std::hex << cpusubtype << ';';
+ if (cputype & CPU_ARCH_ABI64)
+ addr_size = 8;
+ else
+ addr_size = 4;
}
bool os_handled = false;
if (addr_size > 0) {
rep << "ptrsize:" << std::dec << addr_size << ';';
-
#if defined(TARGET_OS_OSX) && TARGET_OS_OSX == 1
// Try and get the OS type by looking at the load commands in the main
// executable and looking for a LC_VERSION_MIN load command. This is the
diff --git a/lldb/tools/lldb-dap/LLDBUtils.cpp b/lldb/tools/lldb-dap/LLDBUtils.cpp
index 35b7a986a896..a91cc6718f4d 100644
--- a/lldb/tools/lldb-dap/LLDBUtils.cpp
+++ b/lldb/tools/lldb-dap/LLDBUtils.cpp
@@ -9,6 +9,8 @@
#include "LLDBUtils.h"
#include "DAP.h"
+#include <mutex>
+
namespace lldb_dap {
bool RunLLDBCommands(llvm::StringRef prefix,
@@ -37,7 +39,15 @@ bool RunLLDBCommands(llvm::StringRef prefix,
}
}
- interp.HandleCommand(command.str().c_str(), result);
+ {
+ // Prevent simultaneous calls to HandleCommand, e.g. EventThreadFunction
+ // may asynchronously call RunExitCommands when we are already calling
+ // RunTerminateCommands.
+ static std::mutex handle_command_mutex;
+ std::lock_guard<std::mutex> locker(handle_command_mutex);
+ interp.HandleCommand(command.str().c_str(), result);
+ }
+
const bool got_error = !result.Succeeded();
// The if statement below is assuming we always print out `!` prefixed
// lines. The only time we don't print is when we have `quiet_on_success ==
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index f5f7d3f3253f..651f17879fad 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -718,6 +718,8 @@ if(LLVM_INDIVIDUAL_TEST_COVERAGE)
endif()
set(LLVM_LIT_ARGS "${LIT_ARGS_DEFAULT}" CACHE STRING "Default options for lit")
+option(LLVM_PARALLEL_LIT "Enable multiple lit suites to run in parallel" OFF)
+
# On Win32 hosts, provide an option to specify the path to the GnuWin32 tools.
if( WIN32 AND NOT CYGWIN )
set(LLVM_LIT_TOOLS_DIR "" CACHE PATH "Path to GnuWin32 tools")
diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index 3bc78b0dc935..0f1734a64ee6 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -1947,11 +1947,18 @@ function(add_lit_target target comment)
list(APPEND LIT_COMMAND --param ${param})
endforeach()
if (ARG_UNPARSED_ARGUMENTS)
- add_custom_target(${target}
- COMMAND ${LIT_COMMAND} ${ARG_UNPARSED_ARGUMENTS}
- COMMENT "${comment}"
- USES_TERMINAL
- )
+ if (LLVM_PARALLEL_LIT)
+ add_custom_target(${target}
+ COMMAND ${LIT_COMMAND} ${ARG_UNPARSED_ARGUMENTS}
+ COMMENT "${comment}"
+ )
+ else()
+ add_custom_target(${target}
+ COMMAND ${LIT_COMMAND} ${ARG_UNPARSED_ARGUMENTS}
+ COMMENT "${comment}"
+ USES_TERMINAL
+ )
+ endif()
else()
add_custom_target(${target}
COMMAND ${CMAKE_COMMAND} -E echo "${target} does nothing, no tools built.")
diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst
index abef4f810314..35c47989a7ee 100644
--- a/llvm/docs/CMake.rst
+++ b/llvm/docs/CMake.rst
@@ -762,6 +762,12 @@ enabled sub-projects. Nearly all of these variable names begin with
**LLVM_PARALLEL_LINK_JOBS**:STRING
Define the maximum number of concurrent link jobs.
+**LLVM_PARALLEL_LIT**:BOOL
+ Defaults to ``OFF``. If set to ``OFF``, lit testsuites will be configured
+ with CMake's ``USES_TERMINAL`` flag to give direct access to the terminal. If
+ set to ``ON``, that flag will be removed allowing Ninja to schedule multiple
+ lit testsuites in parallel.
+
**LLVM_RAM_PER_COMPILE_JOB**:STRING
Calculates the amount of Ninja compile jobs according to available resources.
Value has to be in MB, overwrites LLVM_PARALLEL_COMPILE_JOBS. Compile jobs
diff --git a/llvm/docs/CommandGuide/llvm-ar.rst b/llvm/docs/CommandGuide/llvm-ar.rst
index f643b214bcc8..03d5b9e41ada 100644
--- a/llvm/docs/CommandGuide/llvm-ar.rst
+++ b/llvm/docs/CommandGuide/llvm-ar.rst
@@ -262,8 +262,9 @@ Other
.. option:: --format=<type>
This option allows for default, gnu, darwin or bsd ``<type>`` to be selected.
- When creating an ``archive``, ``<type>`` will default to that of the host
- machine.
+ When creating an ``archive`` with the default ``<type>``, :program:``llvm-ar``
+ will attempt to infer it from the input files and fallback to the default
+ toolchain target if unable to do so.
.. option:: -h, --help
diff --git a/llvm/docs/CommandGuide/llvm-exegesis.rst b/llvm/docs/CommandGuide/llvm-exegesis.rst
index 9e3c19078f1c..fdf17c7fe412 100644
--- a/llvm/docs/CommandGuide/llvm-exegesis.rst
+++ b/llvm/docs/CommandGuide/llvm-exegesis.rst
@@ -89,6 +89,14 @@ properly.
annotation requires the subprocess execution mode. This is useful in
cases where the memory accessed by the snippet depends on the location
of the snippet, like RIP-relative addressing.
+* `LLVM-EXEGESIS-LOOP-REGISTER <register name>` - This annotation specifies
+ the loop register to use for keeping track of the current iteration when
+ using the loop repetition mode. :program:`llvm-exegesis` needs to keep track
+ of the current loop iteration within the loop repetition mode in a performant
+ manner (i.e., no memory accesses), and uses a register to do this. This register
+ has an architecture specific default (e.g., `R8` on X86), but this might conflict
+ with some snippets. This annotation allows changing the register to prevent
+ interference between the loop index register and the snippet.
EXAMPLE 1: benchmarking instructions
------------------------------------
diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst
index 0fb3c4bed643..9d0cb7ad1195 100644
--- a/llvm/docs/CommandGuide/llvm-objcopy.rst
+++ b/llvm/docs/CommandGuide/llvm-objcopy.rst
@@ -450,11 +450,20 @@ them.
Set the type of section ``<section>`` to the integer ``<type>``. Can be
specified multiple times to update multiple sections.
-.. option:: --set-start-addr <addr>
+.. option:: --set-start <addr>
Set the start address of the output to ``<addr>``. Overrides any previously
specified :option:`--change-start` or :option:`--adjust-start` options.
+.. option:: --set-symbol-visibility <symbol>=<visibility>
+
+ Change the visibility of a symbol to the specified value.
+
+.. option:: --set-symbols-visibility <filename>=<visibility>
+
+ Read a list of symbols from <filename> and change their visibility to the
+ specified value. Visibility values: default, internal, hidden, protected.
+
.. option:: --split-dwo <dwo-file>
Equivalent to running :program:`llvm-objcopy` with :option:`--extract-dwo` and
@@ -484,7 +493,7 @@ them.
.. option:: --weaken-symbol <symbol>, -W
- Mark any global symbol named ``<symbol>`` as a weak symbol in the output. Can
+ Mark global symbols named ``<symbol>`` as weak symbols in the output. Can
be specified multiple times to mark multiple symbols as weak.
.. option:: --weaken-symbols <filename>
diff --git a/llvm/docs/CommandGuide/llvm-objdump.rst b/llvm/docs/CommandGuide/llvm-objdump.rst
index 959452a74b23..7f8def756c69 100644
--- a/llvm/docs/CommandGuide/llvm-objdump.rst
+++ b/llvm/docs/CommandGuide/llvm-objdump.rst
@@ -271,7 +271,12 @@ OPTIONS
When printing a PC-relative global symbol reference, print it as an offset from the leading symbol.
- When a bb-address-map section is present (i.e., the object file is built with ``-fbasic-block-sections=labels``), labels are retrieved from that section instead.
+ When a bb-address-map section is present (i.e., the object file is built with
+ ``-fbasic-block-sections=labels``), labels are retrieved from that section
+ instead. If a pgo-analysis-map is present alongside the bb-address-map, any
+ available analyses are printed after the relevant block label. By default,
+ any analysis with a special representation (i.e. BlockFrequency,
+ BranchProbability, etc) are printed as raw hex values.
Only works with PowerPC objects or X86 linked images.
@@ -291,6 +296,15 @@ OPTIONS
cmp eax, dword ptr <g>
jge <L0>
+.. option:: --pretty-pgo-analysis-map
+
+ When using :option:`--symbolize-operands` with bb-address-map and
+ pgo-analysis-map, print analyses using the same format as their analysis
+ passes would. An example of pretty format would be printing block frequencies
+ relative to the entry block, the same as BFI.
+
+ Only works when :option:`--symbolize-operands` is enabled.
+
.. option:: --triple=<string>
Target triple to disassemble for, see ``--version`` for available targets.
diff --git a/llvm/docs/CommandGuide/llvm-readobj.rst b/llvm/docs/CommandGuide/llvm-readobj.rst
index 6d78a0387234..09dabb28cfa7 100644
--- a/llvm/docs/CommandGuide/llvm-readobj.rst
+++ b/llvm/docs/CommandGuide/llvm-readobj.rst
@@ -164,6 +164,17 @@ The following options are implemented only for the ELF file format.
Display the contents of the basic block address map section(s), which contain the
address of each function, along with the relative offset of each basic block.
+ When pgo analysis maps are present, all analyses are printed as their raw
+ value.
+
+.. option:: --pretty-pgo-analysis-map
+
+ When pgo analysis maps are present in the basic block address map section(s),
+ analyses with special formats (i.e. BlockFrequency, BranchProbability, etc)
+ are printed using the same format as their respective analysis pass.
+
+ Requires :option:`--bb-addr-map` to have an effect.
+
.. option:: --demangle, -C
Display demangled symbol names in the output.
diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst
index 811b324ebad9..f89483904ab7 100644
--- a/llvm/docs/GettingInvolved.rst
+++ b/llvm/docs/GettingInvolved.rst
@@ -69,7 +69,6 @@ Information about LLVM's development process.
Projects
HowToReleaseLLVM
- Packaging
ReleaseProcess
HowToAddABuilder
ReleaseNotes
@@ -89,9 +88,6 @@ Information about LLVM's development process.
:doc:`HowToAddABuilder`
Instructions for adding new builder to LLVM buildbot master.
-:doc:`Packaging`
- Advice on packaging LLVM into a distribution.
-
:doc:`Release notes for the current release <ReleaseNotes>`
This describes new features, known bugs, and other limitations.
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 570a058bde8d..60e682ae328a 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -4364,7 +4364,7 @@ constants and smaller complex constants.
When creating a vector whose elements have the same constant value, the
preferred syntax is ``splat (<Ty> Val)``. For example: "``splat (i32 11)``".
- These vector constants must have ::ref:`vector type <t_vector>` with an
+ These vector constants must have :ref:`vector type <t_vector>` with an
element type that matches the ``splat`` operand.
**Zero initialization**
The string '``zeroinitializer``' can be used to zero initialize a
diff --git a/llvm/docs/Packaging.rst b/llvm/docs/Packaging.rst
deleted file mode 100644
index 176e5b391229..000000000000
--- a/llvm/docs/Packaging.rst
+++ /dev/null
@@ -1,73 +0,0 @@
-========================
-Advice on Packaging LLVM
-========================
-
-.. contents::
- :local:
-
-Overview
-========
-
-LLVM sets certain default configure options to make sure our developers don't
-break things for constrained platforms. These settings are not optimal for most
-desktop systems, and we hope that packagers (e.g., Redhat, Debian, MacPorts,
-etc.) will tweak them. This document lists settings we suggest you tweak.
-
-LLVM's API changes with each release, so users are likely to want, for example,
-both LLVM-2.6 and LLVM-2.7 installed at the same time to support apps developed
-against each.
-
-Compile Flags
-=============
-
-LLVM runs much more quickly when it's optimized and assertions are removed.
-However, such a build is currently incompatible with users who build without
-defining ``NDEBUG``, and the lack of assertions makes it hard to debug problems
-in user code. We recommend allowing users to install both optimized and debug
-versions of LLVM in parallel. The following configure flags are relevant:
-
-``--disable-assertions``
- Builds LLVM with ``NDEBUG`` defined. Changes the LLVM ABI. Also available
- by setting ``DISABLE_ASSERTIONS=0|1`` in ``make``'s environment. This
- defaults to enabled regardless of the optimization setting, but it slows
- things down.
-
-``--enable-debug-symbols``
- Builds LLVM with ``-g``. Also available by setting ``DEBUG_SYMBOLS=0|1`` in
- ``make``'s environment. This defaults to disabled when optimizing, so you
- should turn it back on to let users debug their programs.
-
-``--enable-optimized``
- (For git checkouts) Builds LLVM with ``-O2`` and, by default, turns off
- debug symbols. Also available by setting ``ENABLE_OPTIMIZED=0|1`` in
- ``make``'s environment. This defaults to enabled when not in a
- checkout.
-
-C++ Features
-============
-
-RTTI
- LLVM disables RTTI by default. Add ``REQUIRES_RTTI=1`` to your environment
- while running ``make`` to re-enable it. This will allow users to build with
- RTTI enabled and still inherit from LLVM classes.
-
-Shared Library
-==============
-
-Configure with ``--enable-shared`` to build
-``libLLVM-<major>.<minor>.(so|dylib)`` and link the tools against it. This
-saves lots of binary size at the cost of some startup time.
-
-Dependencies
-============
-
-``--enable-libffi``
- Depend on `libffi <http://sources.redhat.com/libffi/>`_ to allow the LLVM
- interpreter to call external functions.
-
-``--with-oprofile``
-
- Depend on `libopagent
- <http://oprofile.sourceforge.net/doc/devel/index.html>`_ (>=version 0.9.4)
- to let the LLVM JIT tell oprofile about function addresses and line
- numbers.
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 5be00d9d5a58..51b6527f65bb 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -143,6 +143,14 @@ Changes to the LLVM tools
files using reference types and GC are also supported (but also only for
functions, globals, and data, and only for listing symbols and names).
+* llvm-ar now utilizes LLVM_DEFAULT_TARGET_TRIPLE to determine the archive format
+ if it's not specified with the ``--format`` argument and cannot be inferred from
+ input files.
+
+* llvm-objcopy now supports ``--set-symbol-visibility`` and
+ ``--set-symbols-visibility`` options for ELF input to change the
+ visibility of symbols.
+
Changes to LLDB
---------------------------------
diff --git a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
index 8acb75e87254..4aa922635c37 100644
--- a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
+++ b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
@@ -539,9 +539,6 @@ public:
}
};
-void printBlockFreqImpl(raw_ostream &OS, BlockFrequency EntryFreq,
- BlockFrequency Freq);
-
namespace bfi_detail {
template <class BlockT> struct TypeMap {};
diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h
index c3dcd568216b..a28e19edb4c6 100644
--- a/llvm/include/llvm/BinaryFormat/DXContainer.h
+++ b/llvm/include/llvm/BinaryFormat/DXContainer.h
@@ -141,7 +141,7 @@ enum class PartType {
#include "DXContainerConstants.def"
};
-#define SHADER_FLAG(Num, Val, Str) Val = 1ull << Num,
+#define SHADER_FEATURE_FLAG(Num, Val, Str) Val = 1ull << Num,
enum class FeatureFlags : uint64_t {
#include "DXContainerConstants.def"
};
diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
index 87dd0a5cb6ba..80ed86bc3a49 100644
--- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
+++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
@@ -11,43 +11,43 @@ CONTAINER_PART(PSG1)
#undef CONTAINER_PART
#endif
-#ifdef SHADER_FLAG
-
-SHADER_FLAG(0, Doubles, "Double-precision floating point")
-SHADER_FLAG(1, ComputeShadersPlusRawAndStructuredBuffers, "Raw and Structured buffers")
-SHADER_FLAG(2, UAVsAtEveryStage, "UAVs at every shader stage")
-SHADER_FLAG(3, Max64UAVs, "64 UAV slots")
-SHADER_FLAG(4, MinimumPrecision, "Minimum-precision data types")
-SHADER_FLAG(5, DX11_1_DoubleExtensions, "Double-precision extensions for 11.1")
-SHADER_FLAG(6, DX11_1_ShaderExtensions, "Shader extensions for 11.1")
-SHADER_FLAG(7, LEVEL9ComparisonFiltering, "Comparison filtering for feature level 9")
-SHADER_FLAG(8, TiledResources, "Tiled resources")
-SHADER_FLAG(9, StencilRef, "PS Output Stencil Ref")
-SHADER_FLAG(10, InnerCoverage, "PS Inner Coverage")
-SHADER_FLAG(11, TypedUAVLoadAdditionalFormats, "Typed UAV Load Additional Formats")
-SHADER_FLAG(12, ROVs, "Raster Ordered UAVs")
-SHADER_FLAG(13, ViewportAndRTArrayIndexFromAnyShaderFeedingRasterizer, "SV_RenderTargetArrayIndex or SV_ViewportArrayIndex from any shader feeding rasterizer")
-SHADER_FLAG(14, WaveOps, "Wave level operations")
-SHADER_FLAG(15, Int64Ops, "64-Bit integer")
-SHADER_FLAG(16, ViewID, "View Instancing")
-SHADER_FLAG(17, Barycentrics, "Barycentrics")
-SHADER_FLAG(18, NativeLowPrecision, "Use native low precision")
-SHADER_FLAG(19, ShadingRate, "Shading Rate")
-SHADER_FLAG(20, Raytracing_Tier_1_1, "Raytracing tier 1.1 features")
-SHADER_FLAG(21, SamplerFeedback, "Sampler feedback")
-SHADER_FLAG(22, AtomicInt64OnTypedResource, "64-bit Atomics on Typed Resources")
-SHADER_FLAG(23, AtomicInt64OnGroupShared, "64-bit Atomics on Group Shared")
-SHADER_FLAG(24, DerivativesInMeshAndAmpShaders, "Derivatives in mesh and amplification shaders")
-SHADER_FLAG(25, ResourceDescriptorHeapIndexing, "Resource descriptor heap indexing")
-SHADER_FLAG(26, SamplerDescriptorHeapIndexing, "Sampler descriptor heap indexing")
-SHADER_FLAG(27, RESERVED, "<RESERVED>")
-SHADER_FLAG(28, AtomicInt64OnHeapResource, "64-bit Atomics on Heap Resources")
-SHADER_FLAG(29, AdvancedTextureOps, "Advanced Texture Ops")
-SHADER_FLAG(30, WriteableMSAATextures, "Writeable MSAA Textures")
-
-SHADER_FLAG(31, NextUnusedBit, "Next reserved shader flag bit (not a flag)")
-
-#undef SHADER_FLAG
+#ifdef SHADER_FEATURE_FLAG
+
+SHADER_FEATURE_FLAG(0, Doubles, "Double-precision floating point")
+SHADER_FEATURE_FLAG(1, ComputeShadersPlusRawAndStructuredBuffers, "Raw and Structured buffers")
+SHADER_FEATURE_FLAG(2, UAVsAtEveryStage, "UAVs at every shader stage")
+SHADER_FEATURE_FLAG(3, Max64UAVs, "64 UAV slots")
+SHADER_FEATURE_FLAG(4, MinimumPrecision, "Minimum-precision data types")
+SHADER_FEATURE_FLAG(5, DX11_1_DoubleExtensions, "Double-precision extensions for 11.1")
+SHADER_FEATURE_FLAG(6, DX11_1_ShaderExtensions, "Shader extensions for 11.1")
+SHADER_FEATURE_FLAG(7, LEVEL9ComparisonFiltering, "Comparison filtering for feature level 9")
+SHADER_FEATURE_FLAG(8, TiledResources, "Tiled resources")
+SHADER_FEATURE_FLAG(9, StencilRef, "PS Output Stencil Ref")
+SHADER_FEATURE_FLAG(10, InnerCoverage, "PS Inner Coverage")
+SHADER_FEATURE_FLAG(11, TypedUAVLoadAdditionalFormats, "Typed UAV Load Additional Formats")
+SHADER_FEATURE_FLAG(12, ROVs, "Raster Ordered UAVs")
+SHADER_FEATURE_FLAG(13, ViewportAndRTArrayIndexFromAnyShaderFeedingRasterizer, "SV_RenderTargetArrayIndex or SV_ViewportArrayIndex from any shader feeding rasterizer")
+SHADER_FEATURE_FLAG(14, WaveOps, "Wave level operations")
+SHADER_FEATURE_FLAG(15, Int64Ops, "64-Bit integer")
+SHADER_FEATURE_FLAG(16, ViewID, "View Instancing")
+SHADER_FEATURE_FLAG(17, Barycentrics, "Barycentrics")
+SHADER_FEATURE_FLAG(18, NativeLowPrecision, "Use native low precision")
+SHADER_FEATURE_FLAG(19, ShadingRate, "Shading Rate")
+SHADER_FEATURE_FLAG(20, Raytracing_Tier_1_1, "Raytracing tier 1.1 features")
+SHADER_FEATURE_FLAG(21, SamplerFeedback, "Sampler feedback")
+SHADER_FEATURE_FLAG(22, AtomicInt64OnTypedResource, "64-bit Atomics on Typed Resources")
+SHADER_FEATURE_FLAG(23, AtomicInt64OnGroupShared, "64-bit Atomics on Group Shared")
+SHADER_FEATURE_FLAG(24, DerivativesInMeshAndAmpShaders, "Derivatives in mesh and amplification shaders")
+SHADER_FEATURE_FLAG(25, ResourceDescriptorHeapIndexing, "Resource descriptor heap indexing")
+SHADER_FEATURE_FLAG(26, SamplerDescriptorHeapIndexing, "Sampler descriptor heap indexing")
+SHADER_FEATURE_FLAG(27, RESERVED, "<RESERVED>")
+SHADER_FEATURE_FLAG(28, AtomicInt64OnHeapResource, "64-bit Atomics on Heap Resources")
+SHADER_FEATURE_FLAG(29, AdvancedTextureOps, "Advanced Texture Ops")
+SHADER_FEATURE_FLAG(30, WriteableMSAATextures, "Writeable MSAA Textures")
+
+SHADER_FEATURE_FLAG(31, NextUnusedBit, "Next reserved shader flag bit (not a flag)")
+
+#undef SHADER_FEATURE_FLAG
#endif
#ifdef SEMANTIC_KIND
diff --git a/llvm/include/llvm/BinaryFormat/Dwarf.h b/llvm/include/llvm/BinaryFormat/Dwarf.h
index 44c0030251b3..a53e79bf6e39 100644
--- a/llvm/include/llvm/BinaryFormat/Dwarf.h
+++ b/llvm/include/llvm/BinaryFormat/Dwarf.h
@@ -615,21 +615,21 @@ enum AcceleratorTable {
// Uniquify the string hashes and calculate the bucket count for the
// DWARF v5 Accelerator Table. NOTE: This function effectively consumes the
-// 'hashes' input parameter.
-inline uint32_t getDebugNamesBucketCount(MutableArrayRef<uint32_t> hashes,
- uint32_t &uniqueHashCount) {
+// 'Hashes' input parameter.
+inline std::pair<uint32_t, uint32_t>
+getDebugNamesBucketAndHashCount(MutableArrayRef<uint32_t> Hashes) {
uint32_t BucketCount = 0;
- sort(hashes);
- uniqueHashCount = llvm::unique(hashes) - hashes.begin();
- if (uniqueHashCount > 1024)
- BucketCount = uniqueHashCount / 4;
- else if (uniqueHashCount > 16)
- BucketCount = uniqueHashCount / 2;
+ sort(Hashes);
+ uint32_t UniqueHashCount = llvm::unique(Hashes) - Hashes.begin();
+ if (UniqueHashCount > 1024)
+ BucketCount = UniqueHashCount / 4;
+ else if (UniqueHashCount > 16)
+ BucketCount = UniqueHashCount / 2;
else
- BucketCount = std::max<uint32_t>(uniqueHashCount, 1);
+ BucketCount = std::max<uint32_t>(UniqueHashCount, 1);
- return BucketCount;
+ return {BucketCount, UniqueHashCount};
}
// Constants for the GNU pubnames/pubtypes extensions supporting gdb index.
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index 5454df02914a..bfac54a65c5b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -612,6 +612,7 @@ private:
AAResults *AA = nullptr;
AssumptionCache *AC = nullptr;
const TargetLibraryInfo *LibInfo = nullptr;
+ const TargetLowering *TLI = nullptr;
FunctionLoweringInfo FuncInfo;
// True when either the Target Machine specifies no optimizations or the
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index 2beb9919418f..5bb3692f0a46 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -281,6 +281,10 @@ private:
MachineInstr &MI,
LostDebugLocObserver &LocObserver);
+ MachineInstrBuilder
+ getNeutralElementForVecReduce(unsigned Opcode, MachineIRBuilder &MIRBuilder,
+ LLT Ty);
+
public:
/// Return the alignment to use for a stack temporary object with the given
/// type.
diff --git a/llvm/include/llvm/CodeGen/MachineLoopInfo.h b/llvm/include/llvm/CodeGen/MachineLoopInfo.h
index ae075bee1daf..445c9b1c3bc0 100644
--- a/llvm/include/llvm/CodeGen/MachineLoopInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineLoopInfo.h
@@ -79,7 +79,10 @@ public:
/// I.e., all virtual register operands are defined outside of the loop,
/// physical registers aren't accessed explicitly, and there are no side
/// effects that aren't captured by the operands or other flags.
- bool isLoopInvariant(MachineInstr &I) const;
+ /// ExcludeReg can be used to exclude the given register from the check
+ /// i.e. when we're considering hoisting its definition but have not
+ /// hoisted it yet.
+ bool isLoopInvariant(MachineInstr &I, const Register ExcludeReg = 0) const;
void dump() const;
diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index 47127a6c2960..25703dd6b61f 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1293,19 +1293,19 @@ class PostGenericScheduler : public GenericSchedulerBase {
protected:
ScheduleDAGMI *DAG = nullptr;
SchedBoundary Top;
- SmallVector<SUnit*, 8> BotRoots;
+ SchedBoundary Bot;
+ MachineSchedPolicy RegionPolicy;
public:
- PostGenericScheduler(const MachineSchedContext *C):
- GenericSchedulerBase(C), Top(SchedBoundary::TopQID, "TopQ") {}
+ PostGenericScheduler(const MachineSchedContext *C)
+ : GenericSchedulerBase(C), Top(SchedBoundary::TopQID, "TopQ"),
+ Bot(SchedBoundary::BotQID, "BotQ") {}
~PostGenericScheduler() override = default;
void initPolicy(MachineBasicBlock::iterator Begin,
MachineBasicBlock::iterator End,
- unsigned NumRegionInstrs) override {
- /* no configurable policy */
- }
+ unsigned NumRegionInstrs) override;
/// PostRA scheduling does not track pressure.
bool shouldTrackPressure() const override { return false; }
@@ -1328,15 +1328,16 @@ public:
Top.releaseNode(SU, SU->TopReadyCycle, false);
}
- // Only called for roots.
void releaseBottomNode(SUnit *SU) override {
- BotRoots.push_back(SU);
+ if (SU->isScheduled)
+ return;
+ Bot.releaseNode(SU, SU->BotReadyCycle, false);
}
protected:
virtual bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand);
- void pickNodeFromQueue(SchedCandidate &Cand);
+ void pickNodeFromQueue(SchedBoundary &Zone, SchedCandidate &Cand);
};
/// Create the standard converging machine scheduler. This will be used as the
diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
index ef7662a8e7a2..85de18f5169e 100644
--- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -191,7 +191,19 @@ namespace llvm {
/// applicable).
using SUList = std::list<SUnit *>;
+ /// The direction that should be used to dump the scheduled Sequence.
+ enum DumpDirection {
+ TopDown,
+ BottomUp,
+ Bidirectional,
+ NotSet,
+ };
+
+ void setDumpDirection(DumpDirection D) { DumpDir = D; }
+
protected:
+ DumpDirection DumpDir = NotSet;
+
/// A map from ValueType to SUList, used during DAG construction, as
/// a means of remembering which SUs depend on which memory locations.
class Value2SUsMap;
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h b/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
index 3d0f836b0c75..29de6bd8685e 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
@@ -9,6 +9,7 @@
#ifndef LLVM_CODEGEN_SELECTIONDAGADDRESSANALYSIS_H
#define LLVM_CODEGEN_SELECTIONDAGADDRESSANALYSIS_H
+#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include <cstdint>
@@ -81,10 +82,8 @@ public:
// Returns true `Op0` and `Op1` can be proven to alias/not alias, in
// which case `IsAlias` is set to true/false.
- static bool computeAliasing(const SDNode *Op0,
- const std::optional<int64_t> NumBytes0,
- const SDNode *Op1,
- const std::optional<int64_t> NumBytes1,
+ static bool computeAliasing(const SDNode *Op0, const LocationSize NumBytes0,
+ const SDNode *Op1, const LocationSize NumBytes1,
const SelectionDAG &DAG, bool &IsAlias);
/// Parses tree in N for base, index, offset addresses.
diff --git a/llvm/include/llvm/DebugInfo/DIContext.h b/llvm/include/llvm/DebugInfo/DIContext.h
index 288ddf77bdfd..b75dc8db5433 100644
--- a/llvm/include/llvm/DebugInfo/DIContext.h
+++ b/llvm/include/llvm/DebugInfo/DIContext.h
@@ -206,6 +206,7 @@ struct DIDumpOptions {
bool IsEH = false;
bool DumpNonSkeleton = false;
bool ShowAggregateErrors = false;
+ std::string JsonErrSummaryFile;
std::function<llvm::StringRef(uint64_t DwarfRegNum, bool IsEH)>
GetNameForDWARFReg;
diff --git a/llvm/include/llvm/Demangle/ItaniumDemangle.h b/llvm/include/llvm/Demangle/ItaniumDemangle.h
index 04bc58d8f63e..d33af157543f 100644
--- a/llvm/include/llvm/Demangle/ItaniumDemangle.h
+++ b/llvm/include/llvm/Demangle/ItaniumDemangle.h
@@ -5540,7 +5540,7 @@ Node *AbstractManglingParser<Alloc, Derived>::parseFloatingLiteral() {
return nullptr;
std::string_view Data(First, N);
for (char C : Data)
- if (!std::isxdigit(C))
+ if (!(C >= '0' && C <= '9') && !(C >= 'a' && C <= 'f'))
return nullptr;
First += N;
if (!consumeIf('E'))
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 589a9066ac57..5bbaa8c208b8 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1834,13 +1834,15 @@ public:
/// \param BodyGenCB Callback that will generate the region code.
/// \param FiniCB Callback to finalize variable copies.
/// \param IsNowait If false, a barrier is emitted.
- /// \param DidIt Local variable used as a flag to indicate 'single' thread
+ /// \param CPVars copyprivate variables.
+ /// \param CPFuncs copy functions to use for each copyprivate variable.
///
/// \returns The insertion position *after* the single call.
InsertPointTy createSingle(const LocationDescription &Loc,
BodyGenCallbackTy BodyGenCB,
FinalizeCallbackTy FiniCB, bool IsNowait,
- llvm::Value *DidIt);
+ ArrayRef<llvm::Value *> CPVars = {},
+ ArrayRef<llvm::Function *> CPFuncs = {});
/// Generator for '#omp master'
///
diff --git a/llvm/include/llvm/IR/BasicBlock.h b/llvm/include/llvm/IR/BasicBlock.h
index ed25cb96ae96..179305e9260f 100644
--- a/llvm/include/llvm/IR/BasicBlock.h
+++ b/llvm/include/llvm/IR/BasicBlock.h
@@ -93,15 +93,6 @@ public:
/// if necessary.
void setIsNewDbgInfoFormat(bool NewFlag);
- /// Validate any DPMarkers / DPValues attached to instructions in this block,
- /// and block-level stored data too (TrailingDPValues).
- /// \p Assert Should this method fire an assertion if a problem is found?
- /// \p Msg Should this method print a message to errs() if a problem is found?
- /// \p OS Output stream to write errors to.
- /// \returns True if a problem is found.
- bool validateDbgValues(bool Assert = true, bool Msg = false,
- raw_ostream *OS = nullptr);
-
/// Record that the collection of DPValues in \p M "trails" after the last
/// instruction of this block. These are equivalent to dbg.value intrinsics
/// that exist at the end of a basic block with no terminator (a transient
diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h
index 97089098ee53..2dd546ce709c 100644
--- a/llvm/include/llvm/IR/DebugProgramInstruction.h
+++ b/llvm/include/llvm/IR/DebugProgramInstruction.h
@@ -224,7 +224,7 @@ public:
// DebugValueUser superclass instead. The referred to Value can either be a
// ValueAsMetadata or a DIArgList.
- DILocalVariable *Variable;
+ TrackingMDNodeRef Variable;
DIExpression *Expression;
DIExpression *AddressExpression;
@@ -331,7 +331,7 @@ public:
void addVariableLocationOps(ArrayRef<Value *> NewValues,
DIExpression *NewExpr);
- void setVariable(DILocalVariable *NewVar) { Variable = NewVar; }
+ void setVariable(DILocalVariable *NewVar);
void setExpression(DIExpression *NewExpr) { Expression = NewExpr; }
@@ -349,7 +349,8 @@ public:
void setKillLocation();
bool isKillLocation() const;
- DILocalVariable *getVariable() const { return Variable; }
+ DILocalVariable *getVariable() const;
+ MDNode *getRawVariable() const { return Variable; }
DIExpression *getExpression() const { return Expression; }
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index 057dc64e88c2..d6eabc5d2407 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -33,6 +33,7 @@ let TargetPrefix = "spv" in {
def int_spv_cmpxchg : Intrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_vararg_ty]>;
def int_spv_unreachable : Intrinsic<[], []>;
def int_spv_alloca : Intrinsic<[llvm_any_ty], []>;
+ def int_spv_alloca_array : Intrinsic<[llvm_any_ty], [llvm_anyint_ty]>;
def int_spv_undef : Intrinsic<[llvm_i32_ty], []>;
// Expect, Assume Intrinsics
diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h
index 68a89dc45c28..e41a5940540b 100644
--- a/llvm/include/llvm/IR/Module.h
+++ b/llvm/include/llvm/IR/Module.h
@@ -218,6 +218,11 @@ public:
/// \ref BasicBlock.
bool IsNewDbgInfoFormat;
+ /// Used when printing this module in the new debug info format; removes all
+ /// declarations of debug intrinsics that are replaced by non-intrinsic
+ /// records in the new format.
+ void removeDebugIntrinsicDeclarations();
+
/// \see BasicBlock::convertToNewDbgValues.
void convertToNewDbgValues() {
for (auto &F : *this) {
@@ -234,6 +239,13 @@ public:
IsNewDbgInfoFormat = false;
}
+ void setIsNewDbgInfoFormat(bool UseNewFormat) {
+ if (UseNewFormat && !IsNewDbgInfoFormat)
+ convertToNewDbgValues();
+ else if (!UseNewFormat && IsNewDbgInfoFormat)
+ convertFromNewDbgValues();
+ }
+
/// The Module constructor. Note that there is no default constructor. You
/// must provide a name for the module upon construction.
explicit Module(StringRef ModuleID, LLVMContext& C);
diff --git a/llvm/include/llvm/IR/PrintPasses.h b/llvm/include/llvm/IR/PrintPasses.h
index 95b97e76c867..3803bd05cbe5 100644
--- a/llvm/include/llvm/IR/PrintPasses.h
+++ b/llvm/include/llvm/IR/PrintPasses.h
@@ -78,6 +78,25 @@ std::string doSystemDiff(StringRef Before, StringRef After,
StringRef OldLineFormat, StringRef NewLineFormat,
StringRef UnchangedLineFormat);
+/// Used to temporarily set the debug info format of a function, module, or
+/// basic block for the duration of this object's lifetime, after which the
+/// prior state will be restored.
+template <typename T> class ScopedDbgInfoFormatSetter {
+ T &Obj;
+ bool OldState;
+
+public:
+ ScopedDbgInfoFormatSetter(T &Obj, bool NewState)
+ : Obj(Obj), OldState(Obj.IsNewDbgInfoFormat) {
+ Obj.setIsNewDbgInfoFormat(NewState);
+ }
+ ~ScopedDbgInfoFormatSetter() { Obj.setIsNewDbgInfoFormat(OldState); }
+};
+
+template <typename T>
+ScopedDbgInfoFormatSetter(T &Obj, bool NewState)
+ -> ScopedDbgInfoFormatSetter<T>;
+
} // namespace llvm
#endif // LLVM_IR_PRINTPASSES_H
diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
index 1050f24161fb..94996ae89e35 100644
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -75,6 +75,8 @@ void computeLTOCacheKey(
namespace lto {
+StringLiteral getThinLTODefaultCPU(const Triple &TheTriple);
+
/// Given the original \p Path to an output file, replace any path
/// prefix matching \p OldPrefix with \p NewPrefix. Also, create the
/// resulting directory if it does not yet exist.
diff --git a/llvm/include/llvm/ObjCopy/ELF/ELFConfig.h b/llvm/include/llvm/ObjCopy/ELF/ELFConfig.h
index d77cb69b159d..eafed92516c7 100644
--- a/llvm/include/llvm/ObjCopy/ELF/ELFConfig.h
+++ b/llvm/include/llvm/ObjCopy/ELF/ELFConfig.h
@@ -9,6 +9,7 @@
#ifndef LLVM_OBJCOPY_ELF_ELFCONFIG_H
#define LLVM_OBJCOPY_ELF_ELFCONFIG_H
+#include "llvm/ObjCopy/CommonConfig.h"
#include "llvm/Object/ELFTypes.h"
namespace llvm {
@@ -18,6 +19,8 @@ namespace objcopy {
struct ELFConfig {
uint8_t NewSymbolVisibility = (uint8_t)ELF::STV_DEFAULT;
+ std::vector<std::pair<NameMatcher, uint8_t>> SymbolsToSetVisibility;
+
// ELF entry point address expression. The input parameter is an entry point
// address in the input ELF file. The entry address in the output file is
// calculated with EntryExpr(input_address), when either --set-start or
diff --git a/llvm/include/llvm/Object/Archive.h b/llvm/include/llvm/Object/Archive.h
index 3dd99a46507a..f71630054dc6 100644
--- a/llvm/include/llvm/Object/Archive.h
+++ b/llvm/include/llvm/Object/Archive.h
@@ -338,7 +338,7 @@ public:
Kind kind() const { return (Kind)Format; }
bool isThin() const { return IsThin; }
- static object::Archive::Kind getDefaultKindForHost();
+ static object::Archive::Kind getDefaultKind();
child_iterator child_begin(Error &Err, bool SkipInternal = true) const;
child_iterator child_end() const;
diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h
index a7f18c799698..b6e3d321da24 100644
--- a/llvm/include/llvm/Object/DXContainer.h
+++ b/llvm/include/llvm/Object/DXContainer.h
@@ -276,7 +276,7 @@ private:
dxbc::Header Header;
SmallVector<uint32_t, 4> PartOffsets;
std::optional<DXILData> DXIL;
- std::optional<uint64_t> ShaderFlags;
+ std::optional<uint64_t> ShaderFeatureFlags;
std::optional<dxbc::ShaderHash> Hash;
std::optional<DirectX::PSVRuntimeInfo> PSVInfo;
DirectX::Signature InputSignature;
@@ -286,7 +286,7 @@ private:
Error parseHeader();
Error parsePartOffsets();
Error parseDXILHeader(StringRef Part);
- Error parseShaderFlags(StringRef Part);
+ Error parseShaderFeatureFlags(StringRef Part);
Error parseHash(StringRef Part);
Error parsePSVInfo(StringRef Part);
Error parseSignature(StringRef Part, DirectX::Signature &Array);
@@ -368,7 +368,9 @@ public:
const std::optional<DXILData> &getDXIL() const { return DXIL; }
- std::optional<uint64_t> getShaderFlags() const { return ShaderFlags; }
+ std::optional<uint64_t> getShaderFeatureFlags() const {
+ return ShaderFeatureFlags;
+ }
std::optional<dxbc::ShaderHash> getShaderHash() const { return Hash; }
diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
index 66a6ac70bbea..497f82bbd0f3 100644
--- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
+++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
@@ -56,10 +56,10 @@ struct DXILProgram {
std::optional<std::vector<llvm::yaml::Hex8>> DXIL;
};
-#define SHADER_FLAG(Num, Val, Str) bool Val = false;
-struct ShaderFlags {
- ShaderFlags() = default;
- ShaderFlags(uint64_t FlagData);
+#define SHADER_FEATURE_FLAG(Num, Val, Str) bool Val = false;
+struct ShaderFeatureFlags {
+ ShaderFeatureFlags() = default;
+ ShaderFeatureFlags(uint64_t FlagData);
uint64_t getEncodedFlags();
#include "llvm/BinaryFormat/DXContainerConstants.def"
};
@@ -151,7 +151,7 @@ struct Part {
std::string Name;
uint32_t Size;
std::optional<DXILProgram> Program;
- std::optional<ShaderFlags> Flags;
+ std::optional<ShaderFeatureFlags> Flags;
std::optional<ShaderHash> Hash;
std::optional<PSVInfo> Info;
std::optional<DXContainerYAML::Signature> Signature;
@@ -195,8 +195,8 @@ template <> struct MappingTraits<DXContainerYAML::DXILProgram> {
static void mapping(IO &IO, DXContainerYAML::DXILProgram &Program);
};
-template <> struct MappingTraits<DXContainerYAML::ShaderFlags> {
- static void mapping(IO &IO, DXContainerYAML::ShaderFlags &Flags);
+template <> struct MappingTraits<DXContainerYAML::ShaderFeatureFlags> {
+ static void mapping(IO &IO, DXContainerYAML::ShaderFeatureFlags &Flags);
};
template <> struct MappingTraits<DXContainerYAML::ShaderHash> {
diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index 82a17e882b3c..00eb9b096a93 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -111,7 +111,8 @@ namespace llvm {
/// construction.
template <typename DerivedT> class CodeGenPassBuilder {
public:
- explicit CodeGenPassBuilder(LLVMTargetMachine &TM, CGPassBuilderOption Opts,
+ explicit CodeGenPassBuilder(LLVMTargetMachine &TM,
+ const CGPassBuilderOption &Opts,
PassInstrumentationCallbacks *PIC)
: TM(TM), Opt(Opts), PIC(PIC) {
// Target could set CGPassBuilderOption::MISchedPostRA to true to achieve
diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
index b39f262c3d97..7a8b6639f297 100644
--- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
+++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
@@ -359,15 +359,19 @@ struct CountedRegion : public CounterMappingRegion {
uint64_t ExecutionCount;
uint64_t FalseExecutionCount;
bool Folded;
+ bool HasSingleByteCoverage;
- CountedRegion(const CounterMappingRegion &R, uint64_t ExecutionCount)
+ CountedRegion(const CounterMappingRegion &R, uint64_t ExecutionCount,
+ bool HasSingleByteCoverage)
: CounterMappingRegion(R), ExecutionCount(ExecutionCount),
- FalseExecutionCount(0), Folded(false) {}
+ FalseExecutionCount(0), Folded(false),
+ HasSingleByteCoverage(HasSingleByteCoverage) {}
CountedRegion(const CounterMappingRegion &R, uint64_t ExecutionCount,
- uint64_t FalseExecutionCount)
+ uint64_t FalseExecutionCount, bool HasSingleByteCoverage)
: CounterMappingRegion(R), ExecutionCount(ExecutionCount),
- FalseExecutionCount(FalseExecutionCount), Folded(false) {}
+ FalseExecutionCount(FalseExecutionCount), Folded(false),
+ HasSingleByteCoverage(HasSingleByteCoverage) {}
};
/// MCDC Record grouping all information together.
@@ -380,8 +384,59 @@ struct MCDCRecord {
/// are effectively ignored.
enum CondState { MCDC_DontCare = -1, MCDC_False = 0, MCDC_True = 1 };
- using TestVector = llvm::SmallVector<CondState>;
- using TestVectors = llvm::SmallVector<TestVector>;
+ /// Emulate SmallVector<CondState> with a pair of BitVector.
+ ///
+ /// True False DontCare (Impossible)
+ /// Values: True False False True
+ /// Visited: True True False False
+ class TestVector {
+ BitVector Values; /// True/False (False when DontCare)
+ BitVector Visited; /// ~DontCare
+
+ public:
+ /// Default values are filled with DontCare.
+ TestVector(unsigned N) : Values(N), Visited(N) {}
+
+ /// Emulate RHS SmallVector::operator[]
+ CondState operator[](int I) const {
+ return (Visited[I] ? (Values[I] ? MCDC_True : MCDC_False)
+ : MCDC_DontCare);
+ }
+
+ /// Equivalent to buildTestVector's Index.
+ auto getIndex() const { return Values.getData()[0]; }
+
+ /// Set the condition \p Val at position \p I.
+ /// This emulates LHS SmallVector::operator[].
+ void set(int I, CondState Val) {
+ Visited[I] = (Val != MCDC_DontCare);
+ Values[I] = (Val == MCDC_True);
+ }
+
+ /// Emulate SmallVector::push_back.
+ void push_back(CondState Val) {
+ Visited.push_back(Val != MCDC_DontCare);
+ Values.push_back(Val == MCDC_True);
+ assert(Values.size() == Visited.size());
+ }
+
+ /// For each element:
+ /// - False if either is DontCare
+ /// - False if both have the same value
+ /// - True if both have the opposite value
+ /// ((A.Values ^ B.Values) & A.Visited & B.Visited)
+ /// Dedicated to findIndependencePairs().
+ auto getDifferences(const TestVector &B) const {
+ const auto &A = *this;
+ BitVector AB = A.Values;
+ AB ^= B.Values;
+ AB &= A.Visited;
+ AB &= B.Visited;
+ return AB;
+ }
+ };
+
+ using TestVectors = llvm::SmallVector<std::pair<TestVector, CondState>>;
using BoolVector = llvm::SmallVector<bool>;
using TVRowPair = std::pair<unsigned, unsigned>;
using TVPairMap = llvm::DenseMap<unsigned, TVRowPair>;
@@ -422,13 +477,13 @@ public:
/// accessing conditions in the TestVectors requires a translation from a
/// ordinal position to actual condition ID. This is done via PosToID[].
CondState getTVCondition(unsigned TestVectorIndex, unsigned Condition) {
- return TV[TestVectorIndex][PosToID[Condition]];
+ return TV[TestVectorIndex].first[PosToID[Condition]];
}
/// Return the Result evaluation for an executed test vector.
/// See MCDCRecordProcessor::RecordTestVector().
CondState getTVResult(unsigned TestVectorIndex) {
- return TV[TestVectorIndex][getNumConditions()];
+ return TV[TestVectorIndex].second;
}
/// Determine whether a given condition (indicated by Condition) is covered
@@ -661,10 +716,11 @@ struct FunctionRecord {
}
void pushRegion(CounterMappingRegion Region, uint64_t Count,
- uint64_t FalseCount) {
+ uint64_t FalseCount, bool HasSingleByteCoverage) {
if (Region.Kind == CounterMappingRegion::BranchRegion ||
Region.Kind == CounterMappingRegion::MCDCBranchRegion) {
- CountedBranchRegions.emplace_back(Region, Count, FalseCount);
+ CountedBranchRegions.emplace_back(Region, Count, FalseCount,
+ HasSingleByteCoverage);
// If both counters are hard-coded to zero, then this region represents a
// constant-folded branch.
if (Region.Count.isZero() && Region.FalseCount.isZero())
@@ -673,7 +729,8 @@ struct FunctionRecord {
}
if (CountedRegions.empty())
ExecutionCount = Count;
- CountedRegions.emplace_back(Region, Count, FalseCount);
+ CountedRegions.emplace_back(Region, Count, FalseCount,
+ HasSingleByteCoverage);
}
};
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index a928ba6961f3..25ec06a73920 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -831,6 +831,7 @@ private:
struct ValueProfData {
std::vector<InstrProfValueSiteRecord> IndirectCallSites;
std::vector<InstrProfValueSiteRecord> MemOPSizes;
+ std::vector<InstrProfValueSiteRecord> VTableTargets;
};
std::unique_ptr<ValueProfData> ValueData;
@@ -853,6 +854,8 @@ private:
return ValueData->IndirectCallSites;
case IPVK_MemOPSize:
return ValueData->MemOPSizes;
+ case IPVK_VTableTarget:
+ return ValueData->VTableTargets;
default:
llvm_unreachable("Unknown value kind!");
}
@@ -1036,7 +1039,9 @@ enum ProfVersion {
Version10 = 10,
// An additional field is used for bitmap bytes.
Version11 = 11,
- // The current version is 11.
+ // VTable profiling,
+ Version12 = 12,
+ // The current version is 12.
CurrentVersion = INSTR_PROF_INDEX_VERSION
};
const uint64_t Version = ProfVersion::CurrentVersion;
@@ -1057,6 +1062,7 @@ struct Header {
uint64_t MemProfOffset;
uint64_t BinaryIdOffset;
uint64_t TemporalProfTracesOffset;
+ uint64_t VTableNamesOffset;
// New fields should only be added at the end to ensure that the size
// computation is correct. The methods below need to be updated to ensure that
// the new field is read correctly.
@@ -1193,8 +1199,13 @@ template <> inline uint64_t getMagic<uint32_t>() {
// It should also match the synthesized type in
// Transforms/Instrumentation/InstrProfiling.cpp:getOrCreateRegionCounters.
template <class IntPtrT> struct alignas(8) ProfileData {
- #define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Type Name;
- #include "llvm/ProfileData/InstrProfData.inc"
+#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Type Name;
+#include "llvm/ProfileData/InstrProfData.inc"
+};
+
+template <class IntPtrT> struct alignas(8) VTableProfileData {
+#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Init) Type Name;
+#include "llvm/ProfileData/InstrProfData.inc"
};
// File header structure of the LLVM profile data in raw format.
diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc
index fce407f547f3..e9866d94b762 100644
--- a/llvm/include/llvm/ProfileData/InstrProfData.inc
+++ b/llvm/include/llvm/ProfileData/InstrProfData.inc
@@ -96,6 +96,25 @@ INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes, \
#undef INSTR_PROF_DATA
/* INSTR_PROF_DATA end. */
+/* For a virtual table object, record the name hash to associate profiled
+ * addresses with global variables, and record {starting address, size in bytes}
+ * to map the profiled virtual table (which usually have an offset from the
+ * starting address) back to a virtual table object. */
+#ifndef INSTR_PROF_VTABLE_DATA
+#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Initializer)
+#else
+#define INSTR_PROF_VTABLE_DATA_DEFINED
+#endif
+INSTR_PROF_VTABLE_DATA(const uint64_t, llvm::Type::getInt64Ty(Ctx), \
+ VTableNameHash, ConstantInt::get(llvm::Type::getInt64Ty(Ctx), \
+ IndexedInstrProf::ComputeHash(PGOVTableName)))
+INSTR_PROF_VTABLE_DATA(const IntPtrT, llvm::PointerType::getUnqual(Ctx), \
+ VTablePointer, VTableAddr)
+INSTR_PROF_VTABLE_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), VTableSize, \
+ ConstantInt::get(llvm::Type::getInt32Ty(Ctx), \
+ VTableSizeVal))
+#undef INSTR_PROF_VTABLE_DATA
+/* INSTR_PROF_VTABLE_DATA end. */
/* This is an internal data structure used by value profiler. It
* is defined here to allow serialization code sharing by LLVM
@@ -147,6 +166,8 @@ INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta,
INSTR_PROF_RAW_HEADER(uint64_t, BitmapDelta,
(uintptr_t)BitmapBegin - (uintptr_t)DataBegin)
INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin)
+INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables)
+INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize)
INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last)
#undef INSTR_PROF_RAW_HEADER
/* INSTR_PROF_RAW_HEADER end */
@@ -188,13 +209,26 @@ VALUE_PROF_FUNC_PARAM(uint32_t, CounterIndex, Type::getInt32Ty(Ctx))
VALUE_PROF_KIND(IPVK_IndirectCallTarget, 0, "indirect call target")
/* For memory intrinsic functions size profiling. */
VALUE_PROF_KIND(IPVK_MemOPSize, 1, "memory intrinsic functions size")
+/* For virtual table address profiling, the address point of the virtual table
+ * (i.e., the address contained in objects pointing to a virtual table) are
+ * profiled. Note this may not be the address of the per C++ class virtual table
+ * object (e.g., there might be an offset).
+ *
+ * The profiled addresses are stored in raw profile, together with the following
+ * two types of information.
+ * 1. The (starting and ending) addresses of per C++ class virtual table objects.
+ * 2. The (compressed) virtual table object names.
+ * RawInstrProfReader converts profiled virtual table addresses to virtual table
+ * objects' MD5 hash.
+ */
+VALUE_PROF_KIND(IPVK_VTableTarget, 2, "The profiled address point of the vtable")
/* These two kinds must be the last to be
* declared. This is to make sure the string
* array created with the template can be
* indexed with the kind value.
*/
VALUE_PROF_KIND(IPVK_First, IPVK_IndirectCallTarget, "first")
-VALUE_PROF_KIND(IPVK_Last, IPVK_MemOPSize, "last")
+VALUE_PROF_KIND(IPVK_Last, IPVK_VTableTarget, "last")
#undef VALUE_PROF_KIND
/* VALUE_PROF_KIND end */
@@ -284,12 +318,18 @@ INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \
INSTR_PROF_SECT_ENTRY(IPSK_name, \
INSTR_PROF_QUOTE(INSTR_PROF_NAME_COMMON), \
INSTR_PROF_NAME_COFF, "__DATA,")
+INSTR_PROF_SECT_ENTRY(IPSK_vname, \
+ INSTR_PROF_QUOTE(INSTR_PROF_VNAME_COMMON), \
+ INSTR_PROF_VNAME_COFF, "__DATA,")
INSTR_PROF_SECT_ENTRY(IPSK_vals, \
INSTR_PROF_QUOTE(INSTR_PROF_VALS_COMMON), \
INSTR_PROF_VALS_COFF, "__DATA,")
INSTR_PROF_SECT_ENTRY(IPSK_vnodes, \
INSTR_PROF_QUOTE(INSTR_PROF_VNODES_COMMON), \
INSTR_PROF_VNODES_COFF, "__DATA,")
+INSTR_PROF_SECT_ENTRY(IPSK_vtab, \
+ INSTR_PROF_QUOTE(INSTR_PROF_VTAB_COMMON), \
+ INSTR_PROF_VTAB_COFF, "__DATA,")
INSTR_PROF_SECT_ENTRY(IPSK_covmap, \
INSTR_PROF_QUOTE(INSTR_PROF_COVMAP_COMMON), \
INSTR_PROF_COVMAP_COFF, "__LLVM_COV,")
@@ -668,9 +708,9 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
(uint64_t)'f' << 16 | (uint64_t)'R' << 8 | (uint64_t)129
/* Raw profile format version (start from 1). */
-#define INSTR_PROF_RAW_VERSION 9
+#define INSTR_PROF_RAW_VERSION 10
/* Indexed profile format version (start from 1). */
-#define INSTR_PROF_INDEX_VERSION 11
+#define INSTR_PROF_INDEX_VERSION 12
/* Coverage mapping format version (start from 0). */
#define INSTR_PROF_COVMAP_VERSION 6
@@ -708,10 +748,12 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
than WIN32 */
#define INSTR_PROF_DATA_COMMON __llvm_prf_data
#define INSTR_PROF_NAME_COMMON __llvm_prf_names
+#define INSTR_PROF_VNAME_COMMON __llvm_prf_vns
#define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts
#define INSTR_PROF_BITS_COMMON __llvm_prf_bits
#define INSTR_PROF_VALS_COMMON __llvm_prf_vals
#define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds
+#define INSTR_PROF_VTAB_COMMON __llvm_prf_vtab
#define INSTR_PROF_COVMAP_COMMON __llvm_covmap
#define INSTR_PROF_COVFUN_COMMON __llvm_covfun
#define INSTR_PROF_COVDATA_COMMON __llvm_covdata
@@ -722,10 +764,12 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
*/
#define INSTR_PROF_DATA_COFF ".lprfd$M"
#define INSTR_PROF_NAME_COFF ".lprfn$M"
+#define INSTR_PROF_VNAME_COFF ".lprfvn$M"
#define INSTR_PROF_CNTS_COFF ".lprfc$M"
#define INSTR_PROF_BITS_COFF ".lprfb$M"
#define INSTR_PROF_VALS_COFF ".lprfv$M"
#define INSTR_PROF_VNODES_COFF ".lprfnd$M"
+#define INSTR_PROF_VTAB_COFF ".lprfvt$M"
#define INSTR_PROF_COVMAP_COFF ".lcovmap$M"
#define INSTR_PROF_COVFUN_COFF ".lcovfun$M"
/* Since cov data and cov names sections are not allocated, we don't need to
@@ -741,6 +785,8 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
#define INSTR_PROF_NAME_SECT_NAME INSTR_PROF_NAME_COFF
#define INSTR_PROF_CNTS_SECT_NAME INSTR_PROF_CNTS_COFF
#define INSTR_PROF_BITS_SECT_NAME INSTR_PROF_BITS_COFF
+#define INSTR_PROF_VTAB_SECT_NAME INSTR_PROF_VTAB_COFF
+#define INSTR_PROF_VNAME_SECT_NAME INSTR_PROF_VNAME_COFF
/* Array of pointers. Each pointer points to a list
* of value nodes associated with one value site.
*/
@@ -758,6 +804,8 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
#define INSTR_PROF_NAME_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_NAME_COMMON)
#define INSTR_PROF_CNTS_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_CNTS_COMMON)
#define INSTR_PROF_BITS_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_BITS_COMMON)
+#define INSTR_PROF_VTAB_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_VTAB_COMMON)
+#define INSTR_PROF_VNAME_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_VNAME_COMMON)
/* Array of pointers. Each pointer points to a list
* of value nodes associated with one value site.
*/
diff --git a/llvm/include/llvm/ProfileData/InstrProfReader.h b/llvm/include/llvm/ProfileData/InstrProfReader.h
index 87f15639a2c3..cfde5d3fc77d 100644
--- a/llvm/include/llvm/ProfileData/InstrProfReader.h
+++ b/llvm/include/llvm/ProfileData/InstrProfReader.h
@@ -326,12 +326,16 @@ private:
uint64_t NamesDelta;
const RawInstrProf::ProfileData<IntPtrT> *Data;
const RawInstrProf::ProfileData<IntPtrT> *DataEnd;
+ const RawInstrProf::VTableProfileData<IntPtrT> *VTableBegin = nullptr;
+ const RawInstrProf::VTableProfileData<IntPtrT> *VTableEnd = nullptr;
const char *CountersStart;
const char *CountersEnd;
const char *BitmapStart;
const char *BitmapEnd;
const char *NamesStart;
const char *NamesEnd;
+ const char *VNamesStart = nullptr;
+ const char *VNamesEnd = nullptr;
// After value profile is all read, this pointer points to
// the header of next profile data (if exists)
const uint8_t *ValueDataStart;
@@ -656,6 +660,15 @@ private:
std::unique_ptr<MemProfRecordHashTable> MemProfRecordTable;
/// MemProf frame profile data on-disk indexed via frame id.
std::unique_ptr<MemProfFrameHashTable> MemProfFrameTable;
+ /// VTableNamePtr points to the beginning of compressed vtable names.
+ /// When a symtab is constructed from profiles by llvm-profdata, the list of
+ /// names could be decompressed based on `VTableNamePtr` and
+ /// `CompressedVTableNamesLen`.
+ /// A compiler that reads indexed profiles could construct symtab from module
+ /// IR so it doesn't need the decompressed names.
+ const char *VTableNamePtr = nullptr;
+ /// The length of compressed vtable names.
+ uint64_t CompressedVTableNamesLen = 0;
/// Total size of binary ids.
uint64_t BinaryIdsSize{0};
/// Start address of binary id length and data pairs.
diff --git a/llvm/include/llvm/ProfileData/InstrProfWriter.h b/llvm/include/llvm/ProfileData/InstrProfWriter.h
index 047b14f223bd..7a806fd7fcf3 100644
--- a/llvm/include/llvm/ProfileData/InstrProfWriter.h
+++ b/llvm/include/llvm/ProfileData/InstrProfWriter.h
@@ -168,6 +168,10 @@ public:
InstrProfKind getProfileKind() const { return ProfileKind; }
+ bool hasSingleByteCoverage() const {
+ return static_cast<bool>(ProfileKind & InstrProfKind::SingleByteCoverage);
+ }
+
// Internal interface for testing purpose only.
void setValueProfDataEndianness(llvm::endianness Endianness);
void setOutputSparse(bool Sparse);
diff --git a/llvm/include/llvm/Support/BlockFrequency.h b/llvm/include/llvm/Support/BlockFrequency.h
index 8b172ee486aa..aeab99615a95 100644
--- a/llvm/include/llvm/Support/BlockFrequency.h
+++ b/llvm/include/llvm/Support/BlockFrequency.h
@@ -19,6 +19,7 @@
namespace llvm {
+class raw_ostream;
class BranchProbability;
// This class represents Block Frequency as a 64-bit value.
@@ -119,6 +120,9 @@ public:
}
};
+void printRelativeBlockFreq(raw_ostream &OS, BlockFrequency EntryFreq,
+ BlockFrequency Freq);
+
} // namespace llvm
#endif
diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h
index 1b793b0eccf3..68dbe1ea3062 100644
--- a/llvm/include/llvm/Support/TypeSize.h
+++ b/llvm/include/llvm/Support/TypeSize.h
@@ -321,8 +321,6 @@ class TypeSize : public details::FixedOrScalableQuantity<TypeSize, uint64_t> {
: FixedOrScalableQuantity(V) {}
public:
- constexpr TypeSize() : FixedOrScalableQuantity(0, false) {}
-
constexpr TypeSize(ScalarTy Quantity, bool Scalable)
: FixedOrScalableQuantity(Quantity, Scalable) {}
diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index 7376ac98a2b0..93e9ed46642d 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -116,7 +116,7 @@ enum ArchExtKind : unsigned {
AEK_PROFILE = 7, // FEAT_SPE
AEK_RAS = 8, // FEAT_RAS, FEAT_RASv1p1
AEK_LSE = 9, // FEAT_LSE
- AEK_SVE = 10, // FEAT_SVE
+ AEK_SVE = 10, // FEAT_SVE
AEK_DOTPROD = 11, // FEAT_DotProd
AEK_RCPC = 12, // FEAT_LRCPC
AEK_RDM = 13, // FEAT_RDM
diff --git a/llvm/lib/Analysis/BlockFrequencyInfo.cpp b/llvm/lib/Analysis/BlockFrequencyInfo.cpp
index 96c9bfa0e372..ebad8388cbe4 100644
--- a/llvm/lib/Analysis/BlockFrequencyInfo.cpp
+++ b/llvm/lib/Analysis/BlockFrequencyInfo.cpp
@@ -284,7 +284,7 @@ void BlockFrequencyInfo::verifyMatch(BlockFrequencyInfo &Other) const {
Printable llvm::printBlockFreq(const BlockFrequencyInfo &BFI,
BlockFrequency Freq) {
return Printable([&BFI, Freq](raw_ostream &OS) {
- printBlockFreqImpl(OS, BFI.getEntryFreq(), Freq);
+ printRelativeBlockFreq(OS, BFI.getEntryFreq(), Freq);
});
}
diff --git a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
index ae08d56ef098..9f6e53ba15b6 100644
--- a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
+++ b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
@@ -634,21 +634,6 @@ BlockFrequencyInfoImplBase::getLoopName(const LoopData &Loop) const {
return getBlockName(Loop.getHeader()) + (Loop.isIrreducible() ? "**" : "*");
}
-void llvm::printBlockFreqImpl(raw_ostream &OS, BlockFrequency EntryFreq,
- BlockFrequency Freq) {
- if (Freq == BlockFrequency(0)) {
- OS << "0";
- return;
- }
- if (EntryFreq == BlockFrequency(0)) {
- OS << "<invalid BFI>";
- return;
- }
- Scaled64 Block(Freq.getFrequency(), 0);
- Scaled64 Entry(EntryFreq.getFrequency(), 0);
- OS << Block / Entry;
-}
-
void IrreducibleGraph::addNodesInLoop(const BFIBase::LoopData &OuterLoop) {
Start = OuterLoop.getHeader();
Nodes.reserve(OuterLoop.Nodes.size());
diff --git a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
index 23fc9b2e0410..9e1727a0b8d1 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
@@ -38,7 +38,9 @@ void AccelTableBase::computeBucketCount() {
for (const auto &E : Entries)
Uniques.push_back(E.second.HashValue);
- BucketCount = llvm::dwarf::getDebugNamesBucketCount(Uniques, UniqueHashCount);
+ auto Counts = llvm::dwarf::getDebugNamesBucketAndHashCount(Uniques);
+ BucketCount = Counts.first;
+ UniqueHashCount = Counts.second;
}
void AccelTableBase::finalize(AsmPrinter *Asm, StringRef Prefix) {
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 38bb808dd5bd..7c986dbbc2c7 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -596,8 +596,6 @@ bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) {
const Value *CondVal = BrInst.getCondition();
MachineBasicBlock *Succ1MBB = &getMBB(*BrInst.getSuccessor(1));
- const auto &TLI = *MF->getSubtarget().getTargetLowering();
-
// If this is a series of conditions that are or'd or and'd together, emit
// this as a sequence of branches instead of setcc's with and/or operations.
// As long as jumps are not expensive (exceptions for multi-use logic ops,
@@ -617,7 +615,7 @@ bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) {
// jle foo
using namespace PatternMatch;
const Instruction *CondI = dyn_cast<Instruction>(CondVal);
- if (!TLI.isJumpExpensive() && CondI && CondI->hasOneUse() &&
+ if (!TLI->isJumpExpensive() && CondI && CondI->hasOneUse() &&
!BrInst.hasMetadata(LLVMContext::MD_unpredictable)) {
Instruction::BinaryOps Opcode = (Instruction::BinaryOps)0;
Value *Vec;
@@ -1385,9 +1383,8 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
return true;
}
- auto &TLI = *MF->getSubtarget().getTargetLowering();
MachineMemOperand::Flags Flags =
- TLI.getLoadMemOperandFlags(LI, *DL, AC, LibInfo);
+ TLI->getLoadMemOperandFlags(LI, *DL, AC, LibInfo);
if (AA && !(Flags & MachineMemOperand::MOInvariant)) {
if (AA->pointsToConstantMemory(
MemoryLocation(Ptr, LocationSize::precise(StoreSize), AAInfo))) {
@@ -1434,8 +1431,7 @@ bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) {
return true;
}
- auto &TLI = *MF->getSubtarget().getTargetLowering();
- MachineMemOperand::Flags Flags = TLI.getStoreMemOperandFlags(SI, *DL);
+ MachineMemOperand::Flags Flags = TLI->getStoreMemOperandFlags(SI, *DL);
for (unsigned i = 0; i < Vals.size(); ++i) {
Register Addr;
@@ -1779,8 +1775,7 @@ void IRTranslator::getStackGuard(Register DstReg,
auto MIB =
MIRBuilder.buildInstr(TargetOpcode::LOAD_STACK_GUARD, {DstReg}, {});
- auto &TLI = *MF->getSubtarget().getTargetLowering();
- Value *Global = TLI.getSDagStackGuard(*MF->getFunction().getParent());
+ Value *Global = TLI->getSDagStackGuard(*MF->getFunction().getParent());
if (!Global)
return;
@@ -2111,9 +2106,8 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
// does. Simplest intrinsic ever!
return true;
case Intrinsic::vastart: {
- auto &TLI = *MF->getSubtarget().getTargetLowering();
Value *Ptr = CI.getArgOperand(0);
- unsigned ListSize = TLI.getVaListSizeInBits(*DL) / 8;
+ unsigned ListSize = TLI->getVaListSizeInBits(*DL) / 8;
Align Alignment = getKnownAlignment(Ptr, *DL);
MIRBuilder.buildInstr(TargetOpcode::G_VASTART, {}, {getOrCreateVReg(*Ptr)})
@@ -2189,14 +2183,13 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
return translateFixedPointIntrinsic(TargetOpcode::G_UDIVFIXSAT, CI, MIRBuilder);
case Intrinsic::fmuladd: {
const TargetMachine &TM = MF->getTarget();
- const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
Register Dst = getOrCreateVReg(CI);
Register Op0 = getOrCreateVReg(*CI.getArgOperand(0));
Register Op1 = getOrCreateVReg(*CI.getArgOperand(1));
Register Op2 = getOrCreateVReg(*CI.getArgOperand(2));
if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict &&
- TLI.isFMAFasterThanFMulAndFAdd(*MF,
- TLI.getValueType(*DL, CI.getType()))) {
+ TLI->isFMAFasterThanFMulAndFAdd(*MF,
+ TLI->getValueType(*DL, CI.getType()))) {
// TODO: Revisit this to see if we should move this part of the
// lowering to the combiner.
MIRBuilder.buildFMA(Dst, Op0, Op1, Op2,
@@ -2254,10 +2247,9 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
getStackGuard(getOrCreateVReg(CI), MIRBuilder);
return true;
case Intrinsic::stackprotector: {
- const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL);
Register GuardVal;
- if (TLI.useLoadStackGuardNode()) {
+ if (TLI->useLoadStackGuardNode()) {
GuardVal = MRI->createGenericVirtualRegister(PtrTy);
getStackGuard(GuardVal, MIRBuilder);
} else
@@ -2635,10 +2627,9 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
}
// Add a MachineMemOperand if it is a target mem intrinsic.
- const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
TargetLowering::IntrinsicInfo Info;
// TODO: Add a GlobalISel version of getTgtMemIntrinsic.
- if (TLI.getTgtMemIntrinsic(Info, CI, *MF, ID)) {
+ if (TLI->getTgtMemIntrinsic(Info, CI, *MF, ID)) {
Align Alignment = Info.align.value_or(
DL->getABITypeAlign(Info.memVT.getTypeForEVT(F->getContext())));
LLT MemTy = Info.memVT.isSimple()
@@ -2818,10 +2809,9 @@ bool IRTranslator::translateLandingPad(const User &U,
// If there aren't registers to copy the values into (e.g., during SjLj
// exceptions), then don't bother.
- auto &TLI = *MF->getSubtarget().getTargetLowering();
const Constant *PersonalityFn = MF->getFunction().getPersonalityFn();
- if (TLI.getExceptionPointerRegister(PersonalityFn) == 0 &&
- TLI.getExceptionSelectorRegister(PersonalityFn) == 0)
+ if (TLI->getExceptionPointerRegister(PersonalityFn) == 0 &&
+ TLI->getExceptionSelectorRegister(PersonalityFn) == 0)
return true;
// If landingpad's return type is token type, we don't create DAG nodes
@@ -2852,7 +2842,7 @@ bool IRTranslator::translateLandingPad(const User &U,
assert(Tys.size() == 2 && "Only two-valued landingpads are supported");
// Mark exception register as live in.
- Register ExceptionReg = TLI.getExceptionPointerRegister(PersonalityFn);
+ Register ExceptionReg = TLI->getExceptionPointerRegister(PersonalityFn);
if (!ExceptionReg)
return false;
@@ -2860,7 +2850,7 @@ bool IRTranslator::translateLandingPad(const User &U,
ArrayRef<Register> ResRegs = getOrCreateVRegs(LP);
MIRBuilder.buildCopy(ResRegs[0], ExceptionReg);
- Register SelectorReg = TLI.getExceptionSelectorRegister(PersonalityFn);
+ Register SelectorReg = TLI->getExceptionSelectorRegister(PersonalityFn);
if (!SelectorReg)
return false;
@@ -2986,8 +2976,7 @@ bool IRTranslator::translateExtractElement(const User &U,
Register Res = getOrCreateVReg(U);
Register Val = getOrCreateVReg(*U.getOperand(0));
- const auto &TLI = *MF->getSubtarget().getTargetLowering();
- unsigned PreferredVecIdxWidth = TLI.getVectorIdxTy(*DL).getSizeInBits();
+ unsigned PreferredVecIdxWidth = TLI->getVectorIdxTy(*DL).getSizeInBits();
Register Idx;
if (auto *CI = dyn_cast<ConstantInt>(U.getOperand(1))) {
if (CI->getBitWidth() != PreferredVecIdxWidth) {
@@ -3039,8 +3028,7 @@ bool IRTranslator::translateAtomicCmpXchg(const User &U,
MachineIRBuilder &MIRBuilder) {
const AtomicCmpXchgInst &I = cast<AtomicCmpXchgInst>(U);
- auto &TLI = *MF->getSubtarget().getTargetLowering();
- auto Flags = TLI.getAtomicMemOperandFlags(I, *DL);
+ auto Flags = TLI->getAtomicMemOperandFlags(I, *DL);
auto Res = getOrCreateVRegs(I);
Register OldValRes = Res[0];
@@ -3061,8 +3049,7 @@ bool IRTranslator::translateAtomicCmpXchg(const User &U,
bool IRTranslator::translateAtomicRMW(const User &U,
MachineIRBuilder &MIRBuilder) {
const AtomicRMWInst &I = cast<AtomicRMWInst>(U);
- auto &TLI = *MF->getSubtarget().getTargetLowering();
- auto Flags = TLI.getAtomicMemOperandFlags(I, *DL);
+ auto Flags = TLI->getAtomicMemOperandFlags(I, *DL);
Register Res = getOrCreateVReg(I);
Register Addr = getOrCreateVReg(*I.getPointerOperand());
@@ -3302,8 +3289,7 @@ bool IRTranslator::translate(const Instruction &Inst) {
CurBuilder->setDebugLoc(Inst.getDebugLoc());
CurBuilder->setPCSections(Inst.getMetadata(LLVMContext::MD_pcsections));
- auto &TLI = *MF->getSubtarget().getTargetLowering();
- if (TLI.fallBackToDAGISel(Inst))
+ if (TLI->fallBackToDAGISel(Inst))
return false;
switch (Inst.getOpcode()) {
@@ -3454,9 +3440,8 @@ bool IRTranslator::finalizeBasicBlock(const BasicBlock &BB,
// Check if we need to generate stack-protector guard checks.
StackProtector &SP = getAnalysis<StackProtector>();
if (SP.shouldEmitSDCheck(BB)) {
- const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
bool FunctionBasedInstrumentation =
- TLI.getSSPStackGuardCheck(*MF->getFunction().getParent());
+ TLI->getSSPStackGuardCheck(*MF->getFunction().getParent());
SPDescriptor.initialize(&BB, &MBB, FunctionBasedInstrumentation);
}
// Handle stack protector.
@@ -3501,10 +3486,9 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD,
MachineBasicBlock *ParentBB) {
CurBuilder->setInsertPt(*ParentBB, ParentBB->end());
// First create the loads to the guard/stack slot for the comparison.
- const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
Type *PtrIRTy = PointerType::getUnqual(MF->getFunction().getContext());
const LLT PtrTy = getLLTForType(*PtrIRTy, *DL);
- LLT PtrMemTy = getLLTForMVT(TLI.getPointerMemTy(*DL));
+ LLT PtrMemTy = getLLTForMVT(TLI->getPointerMemTy(*DL));
MachineFrameInfo &MFI = ParentBB->getParent()->getFrameInfo();
int FI = MFI.getStackProtectorIndex();
@@ -3522,13 +3506,13 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD,
MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile)
.getReg(0);
- if (TLI.useStackGuardXorFP()) {
+ if (TLI->useStackGuardXorFP()) {
LLVM_DEBUG(dbgs() << "Stack protector xor'ing with FP not yet implemented");
return false;
}
// Retrieve guard check function, nullptr if instrumentation is inlined.
- if (const Function *GuardCheckFn = TLI.getSSPStackGuardCheck(M)) {
+ if (const Function *GuardCheckFn = TLI->getSSPStackGuardCheck(M)) {
// This path is currently untestable on GlobalISel, since the only platform
// that needs this seems to be Windows, and we fall back on that currently.
// The code still lives here in case that changes.
@@ -3563,13 +3547,13 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD,
// If useLoadStackGuardNode returns true, generate LOAD_STACK_GUARD.
// Otherwise, emit a volatile load to retrieve the stack guard value.
- if (TLI.useLoadStackGuardNode()) {
+ if (TLI->useLoadStackGuardNode()) {
Guard =
MRI->createGenericVirtualRegister(LLT::scalar(PtrTy.getSizeInBits()));
getStackGuard(Guard, *CurBuilder);
} else {
// TODO: test using android subtarget when we support @llvm.thread.pointer.
- const Value *IRGuard = TLI.getSDagStackGuard(M);
+ const Value *IRGuard = TLI->getSDagStackGuard(M);
Register GuardPtr = getOrCreateVReg(*IRGuard);
Guard = CurBuilder
@@ -3593,13 +3577,12 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD,
bool IRTranslator::emitSPDescriptorFailure(StackProtectorDescriptor &SPD,
MachineBasicBlock *FailureBB) {
CurBuilder->setInsertPt(*FailureBB, FailureBB->end());
- const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
const RTLIB::Libcall Libcall = RTLIB::STACKPROTECTOR_CHECK_FAIL;
- const char *Name = TLI.getLibcallName(Libcall);
+ const char *Name = TLI->getLibcallName(Libcall);
CallLowering::CallLoweringInfo Info;
- Info.CallConv = TLI.getLibcallCallingConv(Libcall);
+ Info.CallConv = TLI->getLibcallCallingConv(Libcall);
Info.Callee = MachineOperand::CreateES(Name);
Info.OrigRet = {Register(), Type::getVoidTy(MF->getFunction().getContext()),
0};
@@ -3662,6 +3645,7 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
bool EnableCSE = EnableCSEInIRTranslator.getNumOccurrences()
? EnableCSEInIRTranslator
: TPC->isGISelCSEEnabled();
+ TLI = MF->getSubtarget().getTargetLowering();
if (EnableCSE) {
EntryBuilder = std::make_unique<CSEMIRBuilder>(CurMF);
@@ -3696,12 +3680,8 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
FuncInfo.CanLowerReturn = CLI->checkReturnTypeForCallConv(*MF);
- const auto &TLI = *MF->getSubtarget().getTargetLowering();
-
SL = std::make_unique<GISelSwitchLowering>(this, FuncInfo);
- SL->init(TLI, TM, *DL);
-
-
+ SL->init(*TLI, TM, *DL);
assert(PendingPHIs.empty() && "stale PHIs");
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 30f12bf5cca5..8079f853aef8 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -5216,6 +5216,43 @@ LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
return Legalized;
}
+MachineInstrBuilder LegalizerHelper::getNeutralElementForVecReduce(
+ unsigned Opcode, MachineIRBuilder &MIRBuilder, LLT Ty) {
+ assert(Ty.isScalar() && "Expected scalar type to make neutral element for");
+
+ switch (Opcode) {
+ default:
+ llvm_unreachable(
+ "getNeutralElementForVecReduce called with invalid opcode!");
+ case TargetOpcode::G_VECREDUCE_ADD:
+ case TargetOpcode::G_VECREDUCE_OR:
+ case TargetOpcode::G_VECREDUCE_XOR:
+ case TargetOpcode::G_VECREDUCE_UMAX:
+ return MIRBuilder.buildConstant(Ty, 0);
+ case TargetOpcode::G_VECREDUCE_MUL:
+ return MIRBuilder.buildConstant(Ty, 1);
+ case TargetOpcode::G_VECREDUCE_AND:
+ case TargetOpcode::G_VECREDUCE_UMIN:
+ return MIRBuilder.buildConstant(
+ Ty, APInt::getAllOnes(Ty.getScalarSizeInBits()));
+ case TargetOpcode::G_VECREDUCE_SMAX:
+ return MIRBuilder.buildConstant(
+ Ty, APInt::getSignedMinValue(Ty.getSizeInBits()));
+ case TargetOpcode::G_VECREDUCE_SMIN:
+ return MIRBuilder.buildConstant(
+ Ty, APInt::getSignedMaxValue(Ty.getSizeInBits()));
+ case TargetOpcode::G_VECREDUCE_FADD:
+ return MIRBuilder.buildFConstant(Ty, -0.0);
+ case TargetOpcode::G_VECREDUCE_FMUL:
+ return MIRBuilder.buildFConstant(Ty, 1.0);
+ case TargetOpcode::G_VECREDUCE_FMINIMUM:
+ case TargetOpcode::G_VECREDUCE_FMAXIMUM:
+ assert(false && "getNeutralElementForVecReduce unimplemented for "
+ "G_VECREDUCE_FMINIMUM and G_VECREDUCE_FMAXIMUM!");
+ }
+ llvm_unreachable("switch expected to return!");
+}
+
LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
LLT MoreTy) {
@@ -5420,6 +5457,37 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
Observer.changedInstr(MI);
return Legalized;
}
+ case TargetOpcode::G_VECREDUCE_FADD:
+ case TargetOpcode::G_VECREDUCE_FMUL:
+ case TargetOpcode::G_VECREDUCE_ADD:
+ case TargetOpcode::G_VECREDUCE_MUL:
+ case TargetOpcode::G_VECREDUCE_AND:
+ case TargetOpcode::G_VECREDUCE_OR:
+ case TargetOpcode::G_VECREDUCE_XOR:
+ case TargetOpcode::G_VECREDUCE_SMAX:
+ case TargetOpcode::G_VECREDUCE_SMIN:
+ case TargetOpcode::G_VECREDUCE_UMAX:
+ case TargetOpcode::G_VECREDUCE_UMIN: {
+ LLT OrigTy = MRI.getType(MI.getOperand(1).getReg());
+ MachineOperand &MO = MI.getOperand(1);
+ auto NewVec = MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO);
+ auto NeutralElement = getNeutralElementForVecReduce(
+ MI.getOpcode(), MIRBuilder, MoreTy.getElementType());
+
+ LLT IdxTy(TLI.getVectorIdxTy(MIRBuilder.getDataLayout()));
+ for (size_t i = OrigTy.getNumElements(), e = MoreTy.getNumElements();
+ i != e; i++) {
+ auto Idx = MIRBuilder.buildConstant(IdxTy, i);
+ NewVec = MIRBuilder.buildInsertVectorElement(MoreTy, NewVec,
+ NeutralElement, Idx);
+ }
+
+ Observer.changingInstr(MI);
+ MO.setReg(NewVec.getReg(0));
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+
default:
return UnableToLegalize;
}
diff --git a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
index 7ee72e214426..cbebdd87398e 100644
--- a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
+++ b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -280,7 +280,7 @@ BlockFrequency MachineBlockFrequencyInfo::getEntryFreq() const {
Printable llvm::printBlockFreq(const MachineBlockFrequencyInfo &MBFI,
BlockFrequency Freq) {
return Printable([&MBFI, Freq](raw_ostream &OS) {
- printBlockFreqImpl(OS, MBFI.getEntryFreq(), Freq);
+ printRelativeBlockFreq(OS, MBFI.getEntryFreq(), Freq);
});
}
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index efc19f8fdbf8..997f6eb08512 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -1264,13 +1264,24 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI,
// If we have a COPY with other uses in the loop, hoist to allow the users to
// also be hoisted.
+ Register DefReg;
if (MI.isCopy() && MI.getOperand(0).isReg() &&
- MI.getOperand(0).getReg().isVirtual() && MI.getOperand(1).isReg() &&
- MI.getOperand(1).getReg().isVirtual() &&
+ (DefReg = MI.getOperand(0).getReg()).isVirtual() &&
+ MI.getOperand(1).isReg() && MI.getOperand(1).getReg().isVirtual() &&
IsLoopInvariantInst(MI, CurLoop) &&
any_of(MRI->use_nodbg_instructions(MI.getOperand(0).getReg()),
- [&CurLoop](MachineInstr &UseMI) {
- return CurLoop->contains(&UseMI);
+ [&CurLoop, this, DefReg, Cost](MachineInstr &UseMI) {
+ if (!CurLoop->contains(&UseMI))
+ return false;
+
+ // COPY is a cheap instruction, but if moving it won't cause high
+ // RP we're fine to hoist it even if the user can't be hoisted
+ // later. Otherwise we want to check whether the user is hoistable.
+ if (CanCauseHighRegPressure(Cost, false) &&
+ !CurLoop->isLoopInvariant(UseMI, DefReg))
+ return false;
+
+ return true;
}))
return true;
diff --git a/llvm/lib/CodeGen/MachineLoopInfo.cpp b/llvm/lib/CodeGen/MachineLoopInfo.cpp
index bdbc57099aa8..1492c8c366fb 100644
--- a/llvm/lib/CodeGen/MachineLoopInfo.cpp
+++ b/llvm/lib/CodeGen/MachineLoopInfo.cpp
@@ -198,7 +198,8 @@ MDNode *MachineLoop::getLoopID() const {
return LoopID;
}
-bool MachineLoop::isLoopInvariant(MachineInstr &I) const {
+bool MachineLoop::isLoopInvariant(MachineInstr &I,
+ const Register ExcludeReg) const {
MachineFunction *MF = I.getParent()->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
const TargetSubtargetInfo &ST = MF->getSubtarget();
@@ -213,6 +214,9 @@ bool MachineLoop::isLoopInvariant(MachineInstr &I) const {
Register Reg = MO.getReg();
if (Reg == 0) continue;
+ if (ExcludeReg == Reg)
+ continue;
+
// An instruction that uses or defines a physical register can't e.g. be
// hoisted, so mark this as not invariant.
if (Reg.isPhysical()) {
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 750f739a5599..3bbd126bdaf1 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -81,6 +81,22 @@ cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden,
cl::desc("Force top-down list scheduling"));
cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden,
cl::desc("Force bottom-up list scheduling"));
+namespace MISchedPostRASched {
+enum Direction {
+ TopDown,
+ BottomUp,
+};
+} // end namespace MISchedPostRASched
+cl::opt<MISchedPostRASched::Direction> PostRADirection(
+ "misched-postra-direction", cl::Hidden,
+ cl::desc("Post reg-alloc list scheduling direction"),
+ // Default to top-down because it was implemented first and existing targets
+ // expect that behavior by default.
+ cl::init(MISchedPostRASched::TopDown),
+ cl::values(clEnumValN(MISchedPostRASched::TopDown, "topdown",
+ "Force top-down post reg-alloc list scheduling"),
+ clEnumValN(MISchedPostRASched::BottomUp, "bottomup",
+ "Force bottom-up post reg-alloc list scheduling")));
cl::opt<bool>
DumpCriticalPathLength("misched-dcpl", cl::Hidden,
cl::desc("Print critical path length to stdout"));
@@ -440,6 +456,14 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
// Instantiate the selected scheduler for this target, function, and
// optimization level.
std::unique_ptr<ScheduleDAGInstrs> Scheduler(createMachineScheduler());
+ ScheduleDAGMI::DumpDirection D;
+ if (ForceTopDown)
+ D = ScheduleDAGMI::DumpDirection::TopDown;
+ else if (ForceBottomUp)
+ D = ScheduleDAGMI::DumpDirection::BottomUp;
+ else
+ D = ScheduleDAGMI::DumpDirection::Bidirectional;
+ Scheduler->setDumpDirection(D);
scheduleRegions(*Scheduler, false);
LLVM_DEBUG(LIS->dump());
@@ -473,6 +497,12 @@ bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) {
// Instantiate the selected scheduler for this target, function, and
// optimization level.
std::unique_ptr<ScheduleDAGInstrs> Scheduler(createPostMachineScheduler());
+ ScheduleDAGMI::DumpDirection D;
+ if (PostRADirection == MISchedPostRASched::TopDown)
+ D = ScheduleDAGMI::DumpDirection::TopDown;
+ else
+ D = ScheduleDAGMI::DumpDirection::BottomUp;
+ Scheduler->setDumpDirection(D);
scheduleRegions(*Scheduler, true);
if (VerifyScheduling)
@@ -1125,12 +1155,14 @@ LLVM_DUMP_METHOD void ScheduleDAGMI::dumpScheduleTraceBottomUp() const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void ScheduleDAGMI::dumpSchedule() const {
if (MISchedDumpScheduleTrace) {
- if (ForceTopDown)
+ if (DumpDir == DumpDirection::TopDown)
dumpScheduleTraceTopDown();
- else if (ForceBottomUp)
+ else if (DumpDir == DumpDirection::BottomUp)
dumpScheduleTraceBottomUp();
- else {
+ else if (DumpDir == DumpDirection::Bidirectional) {
dbgs() << "* Schedule table (Bidirectional): not implemented\n";
+ } else {
+ dbgs() << "* Schedule table: DumpDirection not set.\n";
}
}
@@ -3832,7 +3864,7 @@ void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) {
Rem.init(DAG, SchedModel);
Top.init(DAG, SchedModel, &Rem);
- BotRoots.clear();
+ Bot.init(DAG, SchedModel, &Rem);
// Initialize the HazardRecognizers. If itineraries don't exist, are empty,
// or are disabled, then these HazardRecs will be disabled.
@@ -3842,13 +3874,30 @@ void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) {
DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
Itin, DAG);
}
+ if (!Bot.HazardRec) {
+ Bot.HazardRec =
+ DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
+ Itin, DAG);
+ }
+}
+
+void PostGenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) {
+ if (PostRADirection == MISchedPostRASched::TopDown) {
+ RegionPolicy.OnlyTopDown = true;
+ RegionPolicy.OnlyBottomUp = false;
+ } else if (PostRADirection == MISchedPostRASched::BottomUp) {
+ RegionPolicy.OnlyTopDown = false;
+ RegionPolicy.OnlyBottomUp = true;
+ }
}
void PostGenericScheduler::registerRoots() {
Rem.CriticalPath = DAG->ExitSU.getDepth();
// Some roots may not feed into ExitSU. Check all of them in case.
- for (const SUnit *SU : BotRoots) {
+ for (const SUnit *SU : Bot.Available) {
if (SU->getDepth() > Rem.CriticalPath)
Rem.CriticalPath = SU->getDepth();
}
@@ -3905,12 +3954,13 @@ bool PostGenericScheduler::tryCandidate(SchedCandidate &Cand,
return false;
}
-void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) {
- ReadyQueue &Q = Top.Available;
+void PostGenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
+ SchedCandidate &Cand) {
+ ReadyQueue &Q = Zone.Available;
for (SUnit *SU : Q) {
SchedCandidate TryCand(Cand.Policy);
TryCand.SU = SU;
- TryCand.AtTop = true;
+ TryCand.AtTop = Zone.isTop();
TryCand.initResourceDelta(DAG, SchedModel);
if (tryCandidate(Cand, TryCand)) {
Cand.setBest(TryCand);
@@ -3922,29 +3972,54 @@ void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) {
/// Pick the next node to schedule.
SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) {
if (DAG->top() == DAG->bottom()) {
- assert(Top.Available.empty() && Top.Pending.empty() && "ReadyQ garbage");
+ assert(Top.Available.empty() && Top.Pending.empty() &&
+ Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
return nullptr;
}
SUnit *SU;
do {
- SU = Top.pickOnlyChoice();
- if (SU) {
- tracePick(Only1, true);
+ if (RegionPolicy.OnlyBottomUp) {
+ assert(!RegionPolicy.OnlyTopDown);
+ SU = Bot.pickOnlyChoice();
+ if (SU) {
+ tracePick(Only1, true);
+ } else {
+ CandPolicy NoPolicy;
+ SchedCandidate BotCand(NoPolicy);
+ // Set the bottom-up policy based on the state of the current bottom
+ // zone and the instructions outside the zone, including the top zone.
+ setPolicy(BotCand.Policy, /*IsPostRA=*/true, Bot, nullptr);
+ pickNodeFromQueue(Bot, BotCand);
+ assert(BotCand.Reason != NoCand && "failed to find a candidate");
+ tracePick(BotCand);
+ SU = BotCand.SU;
+ }
+ IsTopNode = false;
} else {
- CandPolicy NoPolicy;
- SchedCandidate TopCand(NoPolicy);
- // Set the top-down policy based on the state of the current top zone and
- // the instructions outside the zone, including the bottom zone.
- setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, nullptr);
- pickNodeFromQueue(TopCand);
- assert(TopCand.Reason != NoCand && "failed to find a candidate");
- tracePick(TopCand);
- SU = TopCand.SU;
+
+ assert(RegionPolicy.OnlyTopDown);
+ SU = Top.pickOnlyChoice();
+ if (SU) {
+ tracePick(Only1, true);
+ } else {
+ CandPolicy NoPolicy;
+ SchedCandidate TopCand(NoPolicy);
+ // Set the top-down policy based on the state of the current top zone
+ // and the instructions outside the zone, including the bottom zone.
+ setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, nullptr);
+ pickNodeFromQueue(Top, TopCand);
+ assert(TopCand.Reason != NoCand && "failed to find a candidate");
+ tracePick(TopCand);
+ SU = TopCand.SU;
+ }
+ IsTopNode = true;
}
} while (SU->isScheduled);
- IsTopNode = true;
- Top.removeReady(SU);
+ if (SU->isTopReady())
+ Top.removeReady(SU);
+ if (SU->isBottomReady())
+ Bot.removeReady(SU);
LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
<< *SU->getInstr());
@@ -3954,8 +4029,13 @@ SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) {
/// Called after ScheduleDAGMI has scheduled an instruction and updated
/// scheduled/remaining flags in the DAG nodes.
void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
- SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
- Top.bumpNode(SU);
+ if (IsTopNode) {
+ SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
+ Top.bumpNode(SU);
+ } else {
+ SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle());
+ Bot.bumpNode(SU);
+ }
}
ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 6a28bc8da223..33ada3655dc7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -27835,7 +27835,7 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
bool IsAtomic;
SDValue BasePtr;
int64_t Offset;
- std::optional<int64_t> NumBytes;
+ LocationSize NumBytes;
MachineMemOperand *MMO;
};
@@ -27843,18 +27843,18 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
if (const auto *LSN = dyn_cast<LSBaseSDNode>(N)) {
int64_t Offset = 0;
if (auto *C = dyn_cast<ConstantSDNode>(LSN->getOffset()))
- Offset = (LSN->getAddressingMode() == ISD::PRE_INC)
- ? C->getSExtValue()
- : (LSN->getAddressingMode() == ISD::PRE_DEC)
- ? -1 * C->getSExtValue()
- : 0;
+ Offset = (LSN->getAddressingMode() == ISD::PRE_INC) ? C->getSExtValue()
+ : (LSN->getAddressingMode() == ISD::PRE_DEC)
+ ? -1 * C->getSExtValue()
+ : 0;
uint64_t Size =
MemoryLocation::getSizeOrUnknown(LSN->getMemoryVT().getStoreSize());
return {LSN->isVolatile(),
LSN->isAtomic(),
LSN->getBasePtr(),
Offset /*base offset*/,
- std::optional<int64_t>(Size),
+ Size != ~UINT64_C(0) ? LocationSize::precise(Size)
+ : LocationSize::beforeOrAfterPointer(),
LSN->getMemOperand()};
}
if (const auto *LN = cast<LifetimeSDNode>(N))
@@ -27862,13 +27862,15 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
/*isAtomic*/ false,
LN->getOperand(1),
(LN->hasOffset()) ? LN->getOffset() : 0,
- (LN->hasOffset()) ? std::optional<int64_t>(LN->getSize())
- : std::optional<int64_t>(),
+ (LN->hasOffset()) ? LocationSize::precise(LN->getSize())
+ : LocationSize::beforeOrAfterPointer(),
(MachineMemOperand *)nullptr};
// Default.
return {false /*isvolatile*/,
- /*isAtomic*/ false, SDValue(),
- (int64_t)0 /*offset*/, std::optional<int64_t>() /*size*/,
+ /*isAtomic*/ false,
+ SDValue(),
+ (int64_t)0 /*offset*/,
+ LocationSize::beforeOrAfterPointer() /*size*/,
(MachineMemOperand *)nullptr};
};
@@ -27923,18 +27925,20 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
int64_t SrcValOffset1 = MUC1.MMO->getOffset();
Align OrigAlignment0 = MUC0.MMO->getBaseAlign();
Align OrigAlignment1 = MUC1.MMO->getBaseAlign();
- auto &Size0 = MUC0.NumBytes;
- auto &Size1 = MUC1.NumBytes;
+ LocationSize Size0 = MUC0.NumBytes;
+ LocationSize Size1 = MUC1.NumBytes;
if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 &&
- Size0.has_value() && Size1.has_value() && *Size0 == *Size1 &&
- OrigAlignment0 > *Size0 && SrcValOffset0 % *Size0 == 0 &&
- SrcValOffset1 % *Size1 == 0) {
+ Size0.hasValue() && Size1.hasValue() && Size0 == Size1 &&
+ OrigAlignment0 > Size0.getValue() &&
+ SrcValOffset0 % Size0.getValue() == 0 &&
+ SrcValOffset1 % Size1.getValue() == 0) {
int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0.value();
int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1.value();
// There is no overlap between these relatively aligned accesses of
// similar size. Return no alias.
- if ((OffAlign0 + *Size0) <= OffAlign1 || (OffAlign1 + *Size1) <= OffAlign0)
+ if ((OffAlign0 + (int64_t)Size0.getValue()) <= OffAlign1 ||
+ (OffAlign1 + (int64_t)Size1.getValue()) <= OffAlign0)
return false;
}
@@ -27947,12 +27951,12 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
UseAA = false;
#endif
- if (UseAA && AA && MUC0.MMO->getValue() && MUC1.MMO->getValue() && Size0 &&
- Size1) {
+ if (UseAA && AA && MUC0.MMO->getValue() && MUC1.MMO->getValue() &&
+ Size0.hasValue() && Size1.hasValue()) {
// Use alias analysis information.
int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1);
- int64_t Overlap0 = *Size0 + SrcValOffset0 - MinOffset;
- int64_t Overlap1 = *Size1 + SrcValOffset1 - MinOffset;
+ int64_t Overlap0 = Size0.getValue() + SrcValOffset0 - MinOffset;
+ int64_t Overlap1 = Size1.getValue() + SrcValOffset1 - MinOffset;
if (AA->isNoAlias(
MemoryLocation(MUC0.MMO->getValue(), Overlap0,
UseTBAA ? MUC0.MMO->getAAInfo() : AAMDNodes()),
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index df17d6530b0d..6e55acd22bb3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -1354,10 +1354,13 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ZExtIntBinOp(SDNode *N) {
}
SDValue DAGTypeLegalizer::PromoteIntRes_UMINUMAX(SDNode *N) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
// It doesn't matter if we sign extend or zero extend in the inputs. So do
- // whatever is best for the target.
- SDValue LHS = SExtOrZExtPromotedInteger(N->getOperand(0));
- SDValue RHS = SExtOrZExtPromotedInteger(N->getOperand(1));
+ // whatever is best for the target and the promoted operands.
+ SExtOrZExtPromotedOperands(LHS, RHS);
+
return DAG.getNode(N->getOpcode(), SDLoc(N),
LHS.getValueType(), LHS, RHS);
}
@@ -1922,25 +1925,10 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
return false;
}
-/// PromoteSetCCOperands - Promote the operands of a comparison. This code is
-/// shared among BR_CC, SELECT_CC, and SETCC handlers.
-void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &LHS, SDValue &RHS,
- ISD::CondCode CCCode) {
- // We have to insert explicit sign or zero extends. Note that we could
- // insert sign extends for ALL conditions. For those operations where either
- // zero or sign extension would be valid, we ask the target which extension
- // it would prefer.
-
- // Signed comparisons always require sign extension.
- if (ISD::isSignedIntSetCC(CCCode)) {
- LHS = SExtPromotedInteger(LHS);
- RHS = SExtPromotedInteger(RHS);
- return;
- }
-
- assert((ISD::isUnsignedIntSetCC(CCCode) || ISD::isIntEqualitySetCC(CCCode)) &&
- "Unknown integer comparison!");
-
+// These operands can be either sign extended or zero extended as long as we
+// treat them the same. If an extension is free, choose that. Otherwise, follow
+// target preference.
+void DAGTypeLegalizer::SExtOrZExtPromotedOperands(SDValue &LHS, SDValue &RHS) {
SDValue OpL = GetPromotedInteger(LHS);
SDValue OpR = GetPromotedInteger(RHS);
@@ -1984,6 +1972,28 @@ void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &LHS, SDValue &RHS,
RHS = ZExtPromotedInteger(RHS);
}
+/// PromoteSetCCOperands - Promote the operands of a comparison. This code is
+/// shared among BR_CC, SELECT_CC, and SETCC handlers.
+void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &LHS, SDValue &RHS,
+ ISD::CondCode CCCode) {
+ // We have to insert explicit sign or zero extends. Note that we could
+ // insert sign extends for ALL conditions. For those operations where either
+ // zero or sign extension would be valid, we ask the target which extension
+ // it would prefer.
+
+ // Signed comparisons always require sign extension.
+ if (ISD::isSignedIntSetCC(CCCode)) {
+ LHS = SExtPromotedInteger(LHS);
+ RHS = SExtPromotedInteger(RHS);
+ return;
+ }
+
+ assert((ISD::isUnsignedIntSetCC(CCCode) || ISD::isIntEqualitySetCC(CCCode)) &&
+ "Unknown integer comparison!");
+
+ SExtOrZExtPromotedOperands(LHS, RHS);
+}
+
SDValue DAGTypeLegalizer::PromoteIntOp_ANY_EXTEND(SDNode *N) {
SDValue Op = GetPromotedInteger(N->getOperand(0));
return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), N->getValueType(0), Op);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 3c84f67653ec..e08acd36b41d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -275,20 +275,6 @@ private:
return DAG.getZeroExtendInReg(Op, dl, OldVT);
}
- // Get a promoted operand and sign or zero extend it to the final size
- // (depending on TargetLoweringInfo::isSExtCheaperThanZExt). For a given
- // subtarget and type, the choice of sign or zero-extension will be
- // consistent.
- SDValue SExtOrZExtPromotedInteger(SDValue Op) {
- EVT OldVT = Op.getValueType();
- SDLoc DL(Op);
- Op = GetPromotedInteger(Op);
- if (TLI.isSExtCheaperThanZExt(OldVT, Op.getValueType()))
- return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, Op.getValueType(), Op,
- DAG.getValueType(OldVT));
- return DAG.getZeroExtendInReg(Op, DL, OldVT);
- }
-
// Promote the given operand V (vector or scalar) according to N's specific
// reduction kind. N must be an integer VECREDUCE_* or VP_REDUCE_*. Returns
// the nominal extension opcode (ISD::(ANY|ZERO|SIGN)_EXTEND) and the
@@ -415,6 +401,7 @@ private:
SDValue PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_VP_SPLICE(SDNode *N, unsigned OpNo);
+ void SExtOrZExtPromotedOperands(SDValue &LHS, SDValue &RHS);
void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
//===--------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 0ceda27d4066..e150f27240d7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -4004,6 +4004,18 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
break;
}
+ case ISD::UINT_TO_FP: {
+ Known.makeNonNegative();
+ break;
+ }
+ case ISD::SINT_TO_FP: {
+ Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ if (Known2.isNonNegative())
+ Known.makeNonNegative();
+ else if (Known2.isNegative())
+ Known.makeNegative();
+ break;
+ }
case ISD::FP_TO_UINT_SAT: {
// FP_TO_UINT_SAT produces an unsigned value that fits in the saturating VT.
EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
index 66825d845c19..9670c3ac8430 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@@ -91,11 +91,10 @@ bool BaseIndexOffset::equalBaseIndex(const BaseIndexOffset &Other,
}
bool BaseIndexOffset::computeAliasing(const SDNode *Op0,
- const std::optional<int64_t> NumBytes0,
+ const LocationSize NumBytes0,
const SDNode *Op1,
- const std::optional<int64_t> NumBytes1,
+ const LocationSize NumBytes1,
const SelectionDAG &DAG, bool &IsAlias) {
-
BaseIndexOffset BasePtr0 = match(Op0, DAG);
if (!BasePtr0.getBase().getNode())
return false;
@@ -105,27 +104,26 @@ bool BaseIndexOffset::computeAliasing(const SDNode *Op0,
return false;
int64_t PtrDiff;
- if (NumBytes0 && NumBytes1 &&
- BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff)) {
+ if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff)) {
// If the size of memory access is unknown, do not use it to analysis.
// One example of unknown size memory access is to load/store scalable
// vector objects on the stack.
// BasePtr1 is PtrDiff away from BasePtr0. They alias if none of the
// following situations arise:
- if (PtrDiff >= 0 &&
- *NumBytes0 != static_cast<int64_t>(MemoryLocation::UnknownSize)) {
+ if (PtrDiff >= 0 && NumBytes0.hasValue() && !NumBytes0.isScalable()) {
// [----BasePtr0----]
// [---BasePtr1--]
// ========PtrDiff========>
- IsAlias = !(*NumBytes0 <= PtrDiff);
+ IsAlias = !(static_cast<int64_t>(NumBytes0.getValue().getFixedValue()) <=
+ PtrDiff);
return true;
}
- if (PtrDiff < 0 &&
- *NumBytes1 != static_cast<int64_t>(MemoryLocation::UnknownSize)) {
+ if (PtrDiff < 0 && NumBytes1.hasValue() && !NumBytes1.isScalable()) {
// [----BasePtr0----]
// [---BasePtr1--]
// =====(-PtrDiff)====>
- IsAlias = !((PtrDiff + *NumBytes1) <= 0);
+ IsAlias = !((PtrDiff + static_cast<int64_t>(
+ NumBytes1.getValue().getFixedValue())) <= 0);
return true;
}
return false;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index ee600d389c2c..ab2f42d2024c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8030,8 +8030,9 @@ void SelectionDAGBuilder::visitVPStridedLoad(
MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo);
bool AddToChain = !AA || !AA->pointsToConstantMemory(ML);
SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();
+ unsigned AS = PtrOperand->getType()->getPointerAddressSpace();
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
- MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad,
+ MachinePointerInfo(AS), MachineMemOperand::MOLoad,
MemoryLocation::UnknownSize, *Alignment, AAInfo, Ranges);
SDValue LD = DAG.getStridedLoadVP(VT, DL, InChain, OpValues[0], OpValues[1],
@@ -8052,8 +8053,9 @@ void SelectionDAGBuilder::visitVPStridedStore(
if (!Alignment)
Alignment = DAG.getEVTAlign(VT.getScalarType());
AAMDNodes AAInfo = VPIntrin.getAAMetadata();
+ unsigned AS = PtrOperand->getType()->getPointerAddressSpace();
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
- MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore,
+ MachinePointerInfo(AS), MachineMemOperand::MOStore,
MemoryLocation::UnknownSize, *Alignment, AAInfo);
SDValue ST = DAG.getStridedStoreVP(
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index 20ef59e7b442..520debe513d9 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -29,7 +29,9 @@
#include "llvm/Support/DJB.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/JSON.h"
#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
#include <map>
@@ -2026,12 +2028,37 @@ void OutputCategoryAggregator::EnumerateResults(
}
void DWARFVerifier::summarize() {
- if (ErrorCategory.GetNumCategories() && DumpOpts.ShowAggregateErrors) {
+ if (DumpOpts.ShowAggregateErrors && ErrorCategory.GetNumCategories()) {
error() << "Aggregated error counts:\n";
ErrorCategory.EnumerateResults([&](StringRef s, unsigned count) {
error() << s << " occurred " << count << " time(s).\n";
});
}
+ if (!DumpOpts.JsonErrSummaryFile.empty()) {
+ std::error_code EC;
+ raw_fd_ostream JsonStream(DumpOpts.JsonErrSummaryFile, EC,
+ sys::fs::OF_Text);
+ if (EC) {
+ error() << "unable to open json summary file '"
+ << DumpOpts.JsonErrSummaryFile
+ << "' for writing: " << EC.message() << '\n';
+ return;
+ }
+
+ llvm::json::Object Categories;
+ uint64_t ErrorCount = 0;
+ ErrorCategory.EnumerateResults([&](StringRef Category, unsigned Count) {
+ llvm::json::Object Val;
+ Val.try_emplace("count", Count);
+ Categories.try_emplace(Category, std::move(Val));
+ ErrorCount += Count;
+ });
+ llvm::json::Object RootNode;
+ RootNode.try_emplace("error-categories", std::move(Categories));
+ RootNode.try_emplace("error-count", ErrorCount);
+
+ JsonStream << llvm::json::Value(std::move(RootNode));
+ }
}
raw_ostream &DWARFVerifier::error() const { return WithColor::error(OS); }
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 09f59c81123e..d65ed8c11d86 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4047,13 +4047,17 @@ OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSingle(
const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
- FinalizeCallbackTy FiniCB, bool IsNowait, llvm::Value *DidIt) {
+ FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
+ ArrayRef<llvm::Function *> CPFuncs) {
if (!updateToLocation(Loc))
return Loc.IP;
- // If needed (i.e. not null), initialize `DidIt` with 0
- if (DidIt) {
+ // If needed allocate and initialize `DidIt` with 0.
+ // DidIt: flag variable: 1=single thread; 0=not single thread.
+ llvm::Value *DidIt = nullptr;
+ if (!CPVars.empty()) {
+ DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
Builder.CreateStore(Builder.getInt32(0), DidIt);
}
@@ -4070,17 +4074,36 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSingle(
Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
+ auto FiniCBWrapper = [&](InsertPointTy IP) {
+ FiniCB(IP);
+
+ // The thread that executes the single region must set `DidIt` to 1.
+ // This is used by __kmpc_copyprivate, to know if the caller is the
+ // single thread or not.
+ if (DidIt)
+ Builder.CreateStore(Builder.getInt32(1), DidIt);
+ };
+
// generates the following:
// if (__kmpc_single()) {
// .... single region ...
// __kmpc_end_single
// }
+ // __kmpc_copyprivate
// __kmpc_barrier
- EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
+ EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
/*Conditional*/ true,
/*hasFinalize*/ true);
- if (!IsNowait)
+
+ if (DidIt) {
+ for (size_t I = 0, E = CPVars.size(); I < E; ++I)
+ // NOTE BufSize is currently unused, so just pass 0.
+ createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
+ /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
+ CPFuncs[I], DidIt);
+ // NOTE __kmpc_copyprivate already inserts a barrier
+ } else if (!IsNowait)
createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
/* CheckCancelFlag */ false);
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index fba404c9b027..4e1e48b4ad4a 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -861,7 +861,7 @@ private:
/// Add all of the metadata from an instruction.
void processInstructionMetadata(const Instruction &I);
- /// Add all of the metadata from an instruction.
+ /// Add all of the metadata from a DbgRecord.
void processDbgRecordMetadata(const DbgRecord &DPV);
};
@@ -1140,15 +1140,18 @@ void SlotTracker::processFunctionMetadata(const Function &F) {
void SlotTracker::processDbgRecordMetadata(const DbgRecord &DR) {
if (const DPValue *DPV = dyn_cast<const DPValue>(&DR)) {
- CreateMetadataSlot(DPV->getVariable());
+ // Process metadata used by DbgRecords; we only specifically care about the
+ // DILocalVariable, DILocation, and DIAssignID fields, as the Value and
+ // Expression fields should only be printed inline and so do not use a slot.
+ CreateMetadataSlot(DPV->getRawVariable());
if (DPV->isDbgAssign())
- CreateMetadataSlot(DPV->getAssignID());
+ CreateMetadataSlot(cast<MDNode>(DPV->getRawAssignID()));
} else if (const DPLabel *DPL = dyn_cast<const DPLabel>(&DR)) {
CreateMetadataSlot(DPL->getLabel());
} else {
llvm_unreachable("unsupported DbgRecord kind");
}
- CreateMetadataSlot(DR.getDebugLoc());
+ CreateMetadataSlot(DR.getDebugLoc().getAsMDNode());
}
void SlotTracker::processInstructionMetadata(const Instruction &I) {
@@ -2703,6 +2706,7 @@ public:
void printDPValue(const DPValue &DPI);
void printDPLabel(const DPLabel &DPL);
void printDbgRecord(const DbgRecord &DPI);
+ void printDbgRecordLine(const DbgRecord &DPI);
void printUseListOrder(const Value *V, const std::vector<unsigned> &Shuffle);
void printUseLists(const Function *F);
@@ -3885,9 +3889,6 @@ void AssemblyWriter::printTypeIdentities() {
/// printFunction - Print all aspects of a function.
void AssemblyWriter::printFunction(const Function *F) {
- bool ConvertBack = F->IsNewDbgInfoFormat;
- if (ConvertBack)
- const_cast<Function *>(F)->convertFromNewDbgValues();
if (AnnotationWriter) AnnotationWriter->emitFunctionAnnot(F, Out);
if (F->isMaterializable())
@@ -4030,8 +4031,6 @@ void AssemblyWriter::printFunction(const Function *F) {
Out << "}\n";
}
- if (ConvertBack)
- const_cast<Function *>(F)->convertToNewDbgValues();
Machine.purgeFunction();
}
@@ -4098,6 +4097,8 @@ void AssemblyWriter::printBasicBlock(const BasicBlock *BB) {
// Output all of the instructions in the basic block...
for (const Instruction &I : *BB) {
+ for (const DbgRecord &DR : I.getDbgValueRange())
+ printDbgRecordLine(DR);
printInstructionLine(I);
}
@@ -4611,12 +4612,10 @@ void AssemblyWriter::printDbgRecord(const DbgRecord &DR) {
llvm_unreachable("Unexpected DbgRecord kind");
}
-void AssemblyWriter::printDPValue(const DPValue &Value) {
- // There's no formal representation of a DPValue -- print purely as a
- // debugging aid.
- Out << " DPValue ";
-
- switch (Value.getType()) {
+void AssemblyWriter::printDPValue(const DPValue &DPV) {
+ auto WriterCtx = getContext();
+ Out << "#dbg_";
+ switch (DPV.getType()) {
case DPValue::LocationType::Value:
Out << "value";
break;
@@ -4629,35 +4628,39 @@ void AssemblyWriter::printDPValue(const DPValue &Value) {
default:
llvm_unreachable("Tried to print a DPValue with an invalid LocationType!");
}
- Out << " { ";
- auto WriterCtx = getContext();
- WriteAsOperandInternal(Out, Value.getRawLocation(), WriterCtx, true);
+ Out << "(";
+ WriteAsOperandInternal(Out, DPV.getRawLocation(), WriterCtx, true);
Out << ", ";
- WriteAsOperandInternal(Out, Value.getVariable(), WriterCtx, true);
+ WriteAsOperandInternal(Out, DPV.getVariable(), WriterCtx, true);
Out << ", ";
- WriteAsOperandInternal(Out, Value.getExpression(), WriterCtx, true);
+ WriteAsOperandInternal(Out, DPV.getExpression(), WriterCtx, true);
Out << ", ";
- if (Value.isDbgAssign()) {
- WriteAsOperandInternal(Out, Value.getAssignID(), WriterCtx, true);
+ if (DPV.isDbgAssign()) {
+ WriteAsOperandInternal(Out, DPV.getAssignID(), WriterCtx, true);
Out << ", ";
- WriteAsOperandInternal(Out, Value.getRawAddress(), WriterCtx, true);
+ WriteAsOperandInternal(Out, DPV.getRawAddress(), WriterCtx, true);
Out << ", ";
- WriteAsOperandInternal(Out, Value.getAddressExpression(), WriterCtx, true);
+ WriteAsOperandInternal(Out, DPV.getAddressExpression(), WriterCtx, true);
Out << ", ";
}
- WriteAsOperandInternal(Out, Value.getDebugLoc().get(), WriterCtx, true);
- Out << " marker @" << Value.getMarker();
- Out << " }";
+ WriteAsOperandInternal(Out, DPV.getDebugLoc().getAsMDNode(), WriterCtx, true);
+ Out << ")";
+}
+
+/// printDbgRecordLine - Print a DbgRecord with indentation and a newline
+/// character.
+void AssemblyWriter::printDbgRecordLine(const DbgRecord &DR) {
+ // Print lengthier indentation to bring out-of-line with instructions.
+ Out << " ";
+ printDbgRecord(DR);
+ Out << '\n';
}
void AssemblyWriter::printDPLabel(const DPLabel &Label) {
- // There's no formal representation of a DPLabel -- print purely as
- // a debugging aid.
- Out << " DPLabel { ";
auto WriterCtx = getContext();
+ Out << "#dbg_label(";
WriteAsOperandInternal(Out, Label.getLabel(), WriterCtx, true);
- Out << " marker @" << Label.getMarker();
- Out << " }";
+ Out << ")";
}
void AssemblyWriter::printMetadataAttachments(
@@ -4805,19 +4808,11 @@ void BasicBlock::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW,
void Module::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW,
bool ShouldPreserveUseListOrder, bool IsForDebug) const {
- // RemoveDIs: always print with debug-info in intrinsic format.
- bool ConvertAfter = IsNewDbgInfoFormat;
- if (IsNewDbgInfoFormat)
- const_cast<Module *>(this)->convertFromNewDbgValues();
-
SlotTracker SlotTable(this);
formatted_raw_ostream OS(ROS);
AssemblyWriter W(OS, SlotTable, this, AAW, IsForDebug,
ShouldPreserveUseListOrder);
W.printModule(this);
-
- if (ConvertAfter)
- const_cast<Module *>(this)->convertToNewDbgValues();
}
void NamedMDNode::print(raw_ostream &ROS, bool IsForDebug) const {
@@ -4908,8 +4903,6 @@ void DPValue::print(raw_ostream &ROS, bool IsForDebug) const {
void DPMarker::print(raw_ostream &ROS, ModuleSlotTracker &MST,
bool IsForDebug) const {
- // There's no formal representation of a DPMarker -- print purely as a
- // debugging aid.
formatted_raw_ostream OS(ROS);
SlotTracker EmptySlotTable(static_cast<const Module *>(nullptr));
SlotTracker &SlotTable =
@@ -4931,8 +4924,6 @@ void DPLabel::print(raw_ostream &ROS, bool IsForDebug) const {
void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST,
bool IsForDebug) const {
- // There's no formal representation of a DPValue -- print purely as a
- // debugging aid.
formatted_raw_ostream OS(ROS);
SlotTracker EmptySlotTable(static_cast<const Module *>(nullptr));
SlotTracker &SlotTable =
@@ -4950,8 +4941,6 @@ void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST,
void DPLabel::print(raw_ostream &ROS, ModuleSlotTracker &MST,
bool IsForDebug) const {
- // There's no formal representation of a DbgLabelRecord -- print purely as
- // a debugging aid.
formatted_raw_ostream OS(ROS);
SlotTracker EmptySlotTable(static_cast<const Module *>(nullptr));
SlotTracker &SlotTable =
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index 6ea876fde5ec..25aa32611645 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -61,10 +61,6 @@ DPMarker *BasicBlock::createMarker(InstListType::iterator It) {
}
void BasicBlock::convertToNewDbgValues() {
- // Is the command line option set?
- if (!UseNewDbgInfoFormat)
- return;
-
IsNewDbgInfoFormat = true;
// Iterate over all instructions in the instruction list, collecting dbg.value
@@ -126,67 +122,6 @@ void BasicBlock::convertFromNewDbgValues() {
assert(!getTrailingDPValues());
}
-bool BasicBlock::validateDbgValues(bool Assert, bool Msg, raw_ostream *OS) {
- bool RetVal = false;
- if (!OS)
- OS = &errs();
-
- // Helper lambda for reporting failures: via assertion, printing, and return
- // value.
- auto TestFailure = [Assert, Msg, &RetVal, OS](bool Val, const char *Text) {
- // Did the test fail?
- if (Val)
- return;
-
- // If we're asserting, then fire off an assertion.
- if (Assert)
- llvm_unreachable(Text);
-
- if (Msg)
- *OS << Text << "\n";
- RetVal = true;
- };
-
- // We should have the same debug-format as the parent function.
- TestFailure(getParent()->IsNewDbgInfoFormat == IsNewDbgInfoFormat,
- "Parent function doesn't have the same debug-info format");
-
- // Only validate if we are using the new format.
- if (!IsNewDbgInfoFormat)
- return RetVal;
-
- // Match every DPMarker to every Instruction and vice versa, and
- // verify that there are no invalid DPValues.
- for (auto It = begin(); It != end(); ++It) {
- if (!It->DbgMarker)
- continue;
-
- // Validate DebugProgramMarkers.
- DPMarker *CurrentDebugMarker = It->DbgMarker;
-
- // If this is a marker, it should match the instruction and vice versa.
- TestFailure(CurrentDebugMarker->MarkedInstr == &*It,
- "Debug Marker points to incorrect instruction?");
-
- // Now validate any DPValues in the marker.
- for (DbgRecord &DPR : CurrentDebugMarker->getDbgValueRange()) {
- // Validate DebugProgramValues.
- TestFailure(DPR.getMarker() == CurrentDebugMarker,
- "Not pointing at correct next marker!");
-
- // Verify that no DbgValues appear prior to PHIs.
- TestFailure(
- !isa<PHINode>(It),
- "DebugProgramValues must not appear before PHI nodes in a block!");
- }
- }
-
- // Except transiently when removing + re-inserting the block terminator, there
- // should be no trailing DPValues.
- TestFailure(!getTrailingDPValues(), "Trailing DPValues in block");
- return RetVal;
-}
-
#ifndef NDEBUG
void BasicBlock::dumpDbgValues() const {
for (auto &Inst : *this) {
diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp
index 389bac4de6a1..3a8b94a87bbc 100644
--- a/llvm/lib/IR/DebugProgramInstruction.cpp
+++ b/llvm/lib/IR/DebugProgramInstruction.cpp
@@ -175,6 +175,8 @@ DPValue *DPValue::createLinkedDPVAssign(Instruction *LinkedInstr, Value *Val,
return NewDPVAssign;
}
+void DPValue::setVariable(DILocalVariable *NewVar) { Variable.reset(NewVar); }
+
iterator_range<DPValue::location_op_iterator> DPValue::location_ops() const {
auto *MD = getRawLocation();
// If a Value has been deleted, the "location" for this DPValue will be
@@ -313,6 +315,10 @@ bool DPValue::isKillLocation() const {
any_of(location_ops(), [](Value *V) { return isa<UndefValue>(V); });
}
+DILocalVariable *DPValue::getVariable() const {
+ return cast<DILocalVariable>(Variable.get());
+}
+
std::optional<uint64_t> DPValue::getFragmentSizeInBits() const {
if (auto Fragment = getExpression()->getFragmentInfo())
return Fragment->SizeInBits;
diff --git a/llvm/lib/IR/IRPrintingPasses.cpp b/llvm/lib/IR/IRPrintingPasses.cpp
index b19210e776ed..84fb8e6c66b8 100644
--- a/llvm/lib/IR/IRPrintingPasses.cpp
+++ b/llvm/lib/IR/IRPrintingPasses.cpp
@@ -23,6 +23,11 @@
using namespace llvm;
+cl::opt<bool> WriteNewDbgInfoFormat(
+ "write-experimental-debuginfo",
+ cl::desc("Write debug info in the new non-intrinsic format"),
+ cl::init(false));
+
namespace {
class PrintModulePassWrapper : public ModulePass {
@@ -39,11 +44,14 @@ public:
ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {}
bool runOnModule(Module &M) override {
- // RemoveDIs: there's no textual representation of the DPValue debug-info,
- // convert to dbg.values before writing out.
- bool IsNewDbgInfoFormat = M.IsNewDbgInfoFormat;
- if (IsNewDbgInfoFormat)
- M.convertFromNewDbgValues();
+ // RemoveDIs: Regardless of the format we've processed this module in, use
+ // `WriteNewDbgInfoFormat` to determine which format we use to write it.
+ ScopedDbgInfoFormatSetter FormatSetter(M, WriteNewDbgInfoFormat);
+ // Remove intrinsic declarations when printing in the new format.
+ // TODO: Move this into Module::setIsNewDbgInfoFormat when we're ready to
+ // update test output.
+ if (WriteNewDbgInfoFormat)
+ M.removeDebugIntrinsicDeclarations();
if (llvm::isFunctionInPrintList("*")) {
if (!Banner.empty())
@@ -62,9 +70,6 @@ public:
}
}
- if (IsNewDbgInfoFormat)
- M.convertToNewDbgValues();
-
return false;
}
@@ -87,11 +92,9 @@ public:
// This pass just prints a banner followed by the function as it's processed.
bool runOnFunction(Function &F) override {
- // RemoveDIs: there's no textual representation of the DPValue debug-info,
- // convert to dbg.values before writing out.
- bool IsNewDbgInfoFormat = F.IsNewDbgInfoFormat;
- if (IsNewDbgInfoFormat)
- F.convertFromNewDbgValues();
+ // RemoveDIs: Regardless of the format we've processed this function in, use
+ // `WriteNewDbgInfoFormat` to determine which format we use to write it.
+ ScopedDbgInfoFormatSetter FormatSetter(F, WriteNewDbgInfoFormat);
if (isFunctionInPrintList(F.getName())) {
if (forcePrintModuleIR())
@@ -101,9 +104,6 @@ public:
OS << Banner << '\n' << static_cast<Value &>(F);
}
- if (IsNewDbgInfoFormat)
- F.convertToNewDbgValues();
-
return false;
}
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index 1946db2ee0be..a8696ed9e3ce 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -85,6 +85,28 @@ Module::~Module() {
IFuncList.clear();
}
+void Module::removeDebugIntrinsicDeclarations() {
+ auto *DeclareIntrinsicFn =
+ Intrinsic::getDeclaration(this, Intrinsic::dbg_declare);
+ assert((!isMaterialized() || DeclareIntrinsicFn->hasZeroLiveUses()) &&
+ "Debug declare intrinsic should have had uses removed.");
+ DeclareIntrinsicFn->eraseFromParent();
+ auto *ValueIntrinsicFn =
+ Intrinsic::getDeclaration(this, Intrinsic::dbg_value);
+ assert((!isMaterialized() || ValueIntrinsicFn->hasZeroLiveUses()) &&
+ "Debug value intrinsic should have had uses removed.");
+ ValueIntrinsicFn->eraseFromParent();
+ auto *AssignIntrinsicFn =
+ Intrinsic::getDeclaration(this, Intrinsic::dbg_assign);
+ assert((!isMaterialized() || AssignIntrinsicFn->hasZeroLiveUses()) &&
+ "Debug assign intrinsic should have had uses removed.");
+ AssignIntrinsicFn->eraseFromParent();
+ auto *LabelntrinsicFn = Intrinsic::getDeclaration(this, Intrinsic::dbg_label);
+ assert((!isMaterialized() || LabelntrinsicFn->hasZeroLiveUses()) &&
+ "Debug label intrinsic should have had uses removed.");
+ LabelntrinsicFn->eraseFromParent();
+}
+
std::unique_ptr<RandomNumberGenerator>
Module::createRNG(const StringRef Name) const {
SmallString<32> Salt(Name);
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 4f321bc516cc..3741e5deaa4c 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -173,11 +173,36 @@ private:
}
}
+ void Write(const DbgRecord *DR) {
+ if (DR)
+ DR->print(*OS, MST, false);
+ }
+
void Write(const DPValue *V) {
if (V)
V->print(*OS, MST, false);
}
+ void Write(DPValue::LocationType Type) {
+ switch (Type) {
+ case DPValue::LocationType::Value:
+ *OS << "value";
+ break;
+ case DPValue::LocationType::Declare:
+ *OS << "declare";
+ break;
+ case DPValue::LocationType::Assign:
+ *OS << "assign";
+ break;
+ case DPValue::LocationType::End:
+ *OS << "end";
+ break;
+ case DPValue::LocationType::Any:
+ *OS << "any";
+ break;
+ };
+ }
+
void Write(const Metadata *MD) {
if (!MD)
return;
@@ -522,8 +547,10 @@ private:
void visitTemplateParams(const MDNode &N, const Metadata &RawParams);
+ void visit(DPValue &DPV);
// InstVisitor overrides...
using InstVisitor<Verifier>::visit;
+ void visitDbgRecords(Instruction &I);
void visit(Instruction &I);
void visitTruncInst(TruncInst &I);
@@ -649,7 +676,22 @@ private:
} \
} while (false)
+void Verifier::visitDbgRecords(Instruction &I) {
+ if (!I.DbgMarker)
+ return;
+ CheckDI(I.DbgMarker->MarkedInstr == &I, "Instruction has invalid DbgMarker",
+ &I);
+ CheckDI(!isa<PHINode>(&I) || !I.hasDbgValues(),
+ "PHI Node must not have any attached DbgRecords", &I);
+ for (DPValue &DPV : DPValue::filter(I.getDbgValueRange())) {
+ CheckDI(DPV.getMarker() == I.DbgMarker, "DbgRecord had invalid DbgMarker",
+ &I, &DPV);
+ visit(DPV);
+ }
+}
+
void Verifier::visit(Instruction &I) {
+ visitDbgRecords(I);
for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i)
Check(I.getOperand(i) != nullptr, "Operand is null", &I);
InstVisitor<Verifier>::visit(I);
@@ -2976,12 +3018,9 @@ void Verifier::visitBasicBlock(BasicBlock &BB) {
}
// Confirm that no issues arise from the debug program.
- if (BB.IsNewDbgInfoFormat) {
- // Configure the validate function to not fire assertions, instead print
- // errors and return true if there's a problem.
- bool RetVal = BB.validateDbgValues(false, true, OS);
- Check(!RetVal, "Invalid configuration of new-debug-info data found");
- }
+ if (BB.IsNewDbgInfoFormat)
+ CheckDI(!BB.getTrailingDPValues(), "Basic Block has trailing DbgRecords!",
+ &BB);
}
void Verifier::visitTerminator(Instruction &I) {
@@ -6148,6 +6187,70 @@ static DISubprogram *getSubprogram(Metadata *LocalScope) {
return nullptr;
}
+void Verifier::visit(DPValue &DPV) {
+ CheckDI(DPV.getType() == DPValue::LocationType::Value ||
+ DPV.getType() == DPValue::LocationType::Declare ||
+ DPV.getType() == DPValue::LocationType::Assign,
+ "invalid #dbg record type", &DPV, DPV.getType());
+ // The location for a DPValue must be either a ValueAsMetadata, DIArgList, or
+ // an empty MDNode (which is a legacy representation for an "undef" location).
+ auto *MD = DPV.getRawLocation();
+ CheckDI(isa<ValueAsMetadata>(MD) || isa<DIArgList>(MD) ||
+ (isa<MDNode>(MD) && !cast<MDNode>(MD)->getNumOperands()),
+ "invalid #dbg record address/value", &DPV, MD);
+ CheckDI(isa<DILocalVariable>(DPV.getRawVariable()),
+ "invalid #dbg record variable", &DPV, DPV.getRawVariable());
+ CheckDI(DPV.getExpression(), "missing #dbg record expression", &DPV,
+ DPV.getExpression());
+
+ if (DPV.isDbgAssign()) {
+ CheckDI(isa<DIAssignID>(DPV.getRawAssignID()),
+ "invalid #dbg_assign DIAssignID", &DPV, DPV.getRawAssignID());
+ const auto *RawAddr = DPV.getRawAddress();
+ // Similarly to the location above, the address for an assign DPValue must
+ // be a ValueAsMetadata or an empty MDNode, which represents an undef
+ // address.
+ CheckDI(
+ isa<ValueAsMetadata>(RawAddr) ||
+ (isa<MDNode>(RawAddr) && !cast<MDNode>(RawAddr)->getNumOperands()),
+ "invalid #dbg_assign address", &DPV, DPV.getRawAddress());
+ CheckDI(DPV.getAddressExpression(),
+ "missing #dbg_assign address expression", &DPV,
+ DPV.getAddressExpression());
+ // All of the linked instructions should be in the same function as DPV.
+ for (Instruction *I : at::getAssignmentInsts(&DPV))
+ CheckDI(DPV.getFunction() == I->getFunction(),
+ "inst not in same function as #dbg_assign", I, &DPV);
+ }
+
+ if (MDNode *N = DPV.getDebugLoc().getAsMDNode()) {
+ CheckDI(isa<DILocation>(N), "invalid #dbg record location", &DPV, N);
+ visitDILocation(*cast<DILocation>(N));
+ }
+
+ BasicBlock *BB = DPV.getParent();
+ Function *F = BB ? BB->getParent() : nullptr;
+
+ // The scopes for variables and !dbg attachments must agree.
+ DILocalVariable *Var = DPV.getVariable();
+ DILocation *Loc = DPV.getDebugLoc();
+ CheckDI(Loc, "missing #dbg record DILocation", &DPV, BB, F);
+
+ DISubprogram *VarSP = getSubprogram(Var->getRawScope());
+ DISubprogram *LocSP = getSubprogram(Loc->getRawScope());
+ if (!VarSP || !LocSP)
+ return; // Broken scope chains are checked elsewhere.
+
+ CheckDI(VarSP == LocSP,
+ "mismatched subprogram between #dbg record variable and DILocation",
+ &DPV, BB, F, Var, Var->getScope()->getSubprogram(), Loc,
+ Loc->getScope()->getSubprogram());
+
+ // This check is redundant with one in visitLocalVariable().
+ CheckDI(isType(Var->getRawType()), "invalid type ref", Var,
+ Var->getRawType());
+}
+
void Verifier::visitVPIntrinsic(VPIntrinsic &VPI) {
if (auto *VPCast = dyn_cast<VPCastIntrinsic>(&VPI)) {
auto *RetTy = cast<VectorType>(VPCast->getType());
diff --git a/llvm/lib/IRPrinter/IRPrintingPasses.cpp b/llvm/lib/IRPrinter/IRPrintingPasses.cpp
index 52b242b4dcd5..026fa4d746d8 100644
--- a/llvm/lib/IRPrinter/IRPrintingPasses.cpp
+++ b/llvm/lib/IRPrinter/IRPrintingPasses.cpp
@@ -22,6 +22,8 @@
using namespace llvm;
+extern cl::opt<bool> WriteNewDbgInfoFormat;
+
PrintModulePass::PrintModulePass() : OS(dbgs()) {}
PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner,
bool ShouldPreserveUseListOrder,
@@ -31,11 +33,14 @@ PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner,
EmitSummaryIndex(EmitSummaryIndex) {}
PreservedAnalyses PrintModulePass::run(Module &M, ModuleAnalysisManager &AM) {
- // RemoveDIs: there's no textual representation of the DPValue debug-info,
- // convert to dbg.values before writing out.
- bool ShouldConvert = M.IsNewDbgInfoFormat;
- if (ShouldConvert)
- M.convertFromNewDbgValues();
+ // RemoveDIs: Regardless of the format we've processed this module in, use
+ // `WriteNewDbgInfoFormat` to determine which format we use to write it.
+ ScopedDbgInfoFormatSetter FormatSetter(M, WriteNewDbgInfoFormat);
+ // Remove intrinsic declarations when printing in the new format.
+ // TODO: Move this into Module::setIsNewDbgInfoFormat when we're ready to
+ // update test output.
+ if (WriteNewDbgInfoFormat)
+ M.removeDebugIntrinsicDeclarations();
if (llvm::isFunctionInPrintList("*")) {
if (!Banner.empty())
@@ -63,9 +68,6 @@ PreservedAnalyses PrintModulePass::run(Module &M, ModuleAnalysisManager &AM) {
Index->print(OS);
}
- if (ShouldConvert)
- M.convertToNewDbgValues();
-
return PreservedAnalyses::all();
}
@@ -75,11 +77,9 @@ PrintFunctionPass::PrintFunctionPass(raw_ostream &OS, const std::string &Banner)
PreservedAnalyses PrintFunctionPass::run(Function &F,
FunctionAnalysisManager &) {
- // RemoveDIs: there's no textual representation of the DPValue debug-info,
- // convert to dbg.values before writing out.
- bool ShouldConvert = F.IsNewDbgInfoFormat;
- if (ShouldConvert)
- F.convertFromNewDbgValues();
+ // RemoveDIs: Regardless of the format we've processed this function in, use
+ // `WriteNewDbgInfoFormat` to determine which format we use to write it.
+ ScopedDbgInfoFormatSetter FormatSetter(F, WriteNewDbgInfoFormat);
if (isFunctionInPrintList(F.getName())) {
if (forcePrintModuleIR())
@@ -88,8 +88,5 @@ PreservedAnalyses PrintFunctionPass::run(Function &F,
OS << Banner << '\n' << static_cast<Value &>(F);
}
- if (ShouldConvert)
- F.convertToNewDbgValues();
-
return PreservedAnalyses::all();
}
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index b5062580169d..34a49c8588b2 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1557,6 +1557,21 @@ ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism,
};
}
+StringLiteral lto::getThinLTODefaultCPU(const Triple &TheTriple) {
+ if (!TheTriple.isOSDarwin())
+ return "";
+ if (TheTriple.getArch() == Triple::x86_64)
+ return "core2";
+ if (TheTriple.getArch() == Triple::x86)
+ return "yonah";
+ if (TheTriple.isArm64e())
+ return "apple-a12";
+ if (TheTriple.getArch() == Triple::aarch64 ||
+ TheTriple.getArch() == Triple::aarch64_32)
+ return "cyclone";
+ return "";
+}
+
// Given the original \p Path to an output file, replace any path
// prefix matching \p OldPrefix with \p NewPrefix. Also, create the
// resulting directory if it does not yet exist.
diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp
index 52d8fff14be9..19b6f7e78792 100644
--- a/llvm/lib/LTO/LTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/LTOCodeGenerator.cpp
@@ -409,18 +409,8 @@ bool LTOCodeGenerator::determineTarget() {
SubtargetFeatures Features(join(Config.MAttrs, ""));
Features.getDefaultSubtargetFeatures(Triple);
FeatureStr = Features.getString();
- // Set a default CPU for Darwin triples.
- if (Config.CPU.empty() && Triple.isOSDarwin()) {
- if (Triple.getArch() == llvm::Triple::x86_64)
- Config.CPU = "core2";
- else if (Triple.getArch() == llvm::Triple::x86)
- Config.CPU = "yonah";
- else if (Triple.isArm64e())
- Config.CPU = "apple-a12";
- else if (Triple.getArch() == llvm::Triple::aarch64 ||
- Triple.getArch() == llvm::Triple::aarch64_32)
- Config.CPU = "cyclone";
- }
+ if (Config.CPU.empty())
+ Config.CPU = lto::getThinLTODefaultCPU(Triple);
// If data-sections is not explicitly set or unset, set data-sections by
// default to match the behaviour of lld and gold plugin.
diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
index 535faf5f7804..8fd181846f0c 100644
--- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -539,17 +539,8 @@ static void resolvePrevailingInIndex(
// Initialize the TargetMachine builder for a given Triple
static void initTMBuilder(TargetMachineBuilder &TMBuilder,
const Triple &TheTriple) {
- // Set a default CPU for Darwin triples (copied from LTOCodeGenerator).
- // FIXME this looks pretty terrible...
- if (TMBuilder.MCpu.empty() && TheTriple.isOSDarwin()) {
- if (TheTriple.getArch() == llvm::Triple::x86_64)
- TMBuilder.MCpu = "core2";
- else if (TheTriple.getArch() == llvm::Triple::x86)
- TMBuilder.MCpu = "yonah";
- else if (TheTriple.getArch() == llvm::Triple::aarch64 ||
- TheTriple.getArch() == llvm::Triple::aarch64_32)
- TMBuilder.MCpu = "cyclone";
- }
+ if (TMBuilder.MCpu.empty())
+ TMBuilder.MCpu = lto::getThinLTODefaultCPU(TheTriple);
TMBuilder.TheTriple = std::move(TheTriple);
}
diff --git a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
index 1b3a58298ec0..f52bcb74938d 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
@@ -300,6 +300,10 @@ static Error updateAndRemoveSymbols(const CommonConfig &Config,
Config.SymbolsToLocalize.matches(Sym.Name)))
Sym.Binding = STB_LOCAL;
+ for (auto &[Matcher, Visibility] : ELFConfig.SymbolsToSetVisibility)
+ if (Matcher.matches(Sym.Name))
+ Sym.Visibility = Visibility;
+
// Note: these two globalize flags have very similar names but different
// meanings:
//
diff --git a/llvm/lib/Object/Archive.cpp b/llvm/lib/Object/Archive.cpp
index e447e5b23316..9000e9aa81ff 100644
--- a/llvm/lib/Object/Archive.cpp
+++ b/llvm/lib/Object/Archive.cpp
@@ -969,8 +969,8 @@ Archive::Archive(MemoryBufferRef Source, Error &Err)
Err = Error::success();
}
-object::Archive::Kind Archive::getDefaultKindForHost() {
- Triple HostTriple(sys::getProcessTriple());
+object::Archive::Kind Archive::getDefaultKind() {
+ Triple HostTriple(sys::getDefaultTargetTriple());
return HostTriple.isOSDarwin()
? object::Archive::K_DARWIN
: (HostTriple.isOSAIX() ? object::Archive::K_AIXBIG
diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp
index 155926a8c594..96e4ec1ee0b7 100644
--- a/llvm/lib/Object/ArchiveWriter.cpp
+++ b/llvm/lib/Object/ArchiveWriter.cpp
@@ -90,7 +90,7 @@ object::Archive::Kind NewArchiveMember::detectKindFromObject() const {
}
}
- return object::Archive::getDefaultKindForHost();
+ return object::Archive::getDefaultKind();
}
Expected<NewArchiveMember>
diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp
index 0401c20b98ec..935749afe338 100644
--- a/llvm/lib/Object/DXContainer.cpp
+++ b/llvm/lib/Object/DXContainer.cpp
@@ -72,13 +72,13 @@ Error DXContainer::parseDXILHeader(StringRef Part) {
return Error::success();
}
-Error DXContainer::parseShaderFlags(StringRef Part) {
- if (ShaderFlags)
+Error DXContainer::parseShaderFeatureFlags(StringRef Part) {
+ if (ShaderFeatureFlags)
return parseFailed("More than one SFI0 part is present in the file");
uint64_t FlagValue = 0;
if (Error Err = readInteger(Part, Part.begin(), FlagValue))
return Err;
- ShaderFlags = FlagValue;
+ ShaderFeatureFlags = FlagValue;
return Error::success();
}
@@ -168,7 +168,7 @@ Error DXContainer::parsePartOffsets() {
return Err;
break;
case dxbc::PartType::SFI0:
- if (Error Err = parseShaderFlags(PartData))
+ if (Error Err = parseShaderFeatureFlags(PartData))
return Err;
break;
case dxbc::PartType::HASH:
diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp
index 1f03f2c7d399..7dc9822bdd22 100644
--- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp
+++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp
@@ -23,15 +23,15 @@ namespace llvm {
static_assert((uint64_t)dxbc::FeatureFlags::NextUnusedBit <= 1ull << 63,
"Shader flag bits exceed enum size.");
-DXContainerYAML::ShaderFlags::ShaderFlags(uint64_t FlagData) {
-#define SHADER_FLAG(Num, Val, Str) \
+DXContainerYAML::ShaderFeatureFlags::ShaderFeatureFlags(uint64_t FlagData) {
+#define SHADER_FEATURE_FLAG(Num, Val, Str) \
Val = (FlagData & (uint64_t)dxbc::FeatureFlags::Val) > 0;
#include "llvm/BinaryFormat/DXContainerConstants.def"
}
-uint64_t DXContainerYAML::ShaderFlags::getEncodedFlags() {
+uint64_t DXContainerYAML::ShaderFeatureFlags::getEncodedFlags() {
uint64_t Flag = 0;
-#define SHADER_FLAG(Num, Val, Str) \
+#define SHADER_FEATURE_FLAG(Num, Val, Str) \
if (Val) \
Flag |= (uint64_t)dxbc::FeatureFlags::Val;
#include "llvm/BinaryFormat/DXContainerConstants.def"
@@ -103,9 +103,9 @@ void MappingTraits<DXContainerYAML::DXILProgram>::mapping(
IO.mapOptional("DXIL", Program.DXIL);
}
-void MappingTraits<DXContainerYAML::ShaderFlags>::mapping(
- IO &IO, DXContainerYAML::ShaderFlags &Flags) {
-#define SHADER_FLAG(Num, Val, Str) IO.mapRequired(#Val, Flags.Val);
+void MappingTraits<DXContainerYAML::ShaderFeatureFlags>::mapping(
+ IO &IO, DXContainerYAML::ShaderFeatureFlags &Flags) {
+#define SHADER_FEATURE_FLAG(Num, Val, Str) IO.mapRequired(#Val, Flags.Val);
#include "llvm/BinaryFormat/DXContainerConstants.def"
}
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
index 334f5dce879e..6c77ce017c03 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
@@ -370,9 +370,16 @@ class MCDCRecordProcessor : NextIDsBuilder, mcdc::TVIdxBuilder {
/// Mapping of calculated MC/DC Independence Pairs for each condition.
MCDCRecord::TVPairMap IndependencePairs;
+ /// Storage for ExecVectors
+ /// ExecVectors is the alias of its 0th element.
+ std::array<MCDCRecord::TestVectors, 2> ExecVectorsByCond;
+
/// Actual executed Test Vectors for the boolean expression, based on
/// ExecutedTestVectorBitmap.
- MCDCRecord::TestVectors ExecVectors;
+ MCDCRecord::TestVectors &ExecVectors;
+
+ /// Number of False items in ExecVectors
+ unsigned NumExecVectorsF;
#ifndef NDEBUG
DenseSet<unsigned> TVIdxs;
@@ -385,45 +392,41 @@ public:
: NextIDsBuilder(Branches), TVIdxBuilder(this->NextIDs), Bitmap(Bitmap),
Region(Region), DecisionParams(Region.getDecisionParams()),
Branches(Branches), NumConditions(DecisionParams.NumConditions),
- Folded(NumConditions, false), IndependencePairs(NumConditions) {}
+ Folded(NumConditions, false), IndependencePairs(NumConditions),
+ ExecVectors(ExecVectorsByCond[false]) {}
private:
// Walk the binary decision diagram and try assigning both false and true to
// each node. When a terminal node (ID == 0) is reached, fill in the value in
// the truth table.
void buildTestVector(MCDCRecord::TestVector &TV, mcdc::ConditionID ID,
- int TVIdx, unsigned Index) {
- assert((Index & (1 << ID)) == 0);
-
+ int TVIdx) {
for (auto MCDCCond : {MCDCRecord::MCDC_False, MCDCRecord::MCDC_True}) {
static_assert(MCDCRecord::MCDC_False == 0);
static_assert(MCDCRecord::MCDC_True == 1);
- Index |= MCDCCond << ID;
- TV[ID] = MCDCCond;
+ TV.set(ID, MCDCCond);
auto NextID = NextIDs[ID][MCDCCond];
auto NextTVIdx = TVIdx + Indices[ID][MCDCCond];
assert(NextID == SavedNodes[ID].NextIDs[MCDCCond]);
if (NextID >= 0) {
- buildTestVector(TV, NextID, NextTVIdx, Index);
+ buildTestVector(TV, NextID, NextTVIdx);
continue;
}
assert(TVIdx < SavedNodes[ID].Width);
assert(TVIdxs.insert(NextTVIdx).second && "Duplicate TVIdx");
- if (!Bitmap[DecisionParams.BitmapIdx * CHAR_BIT + Index])
+ if (!Bitmap[DecisionParams.BitmapIdx * CHAR_BIT + TV.getIndex()])
continue;
// Copy the completed test vector to the vector of testvectors.
- ExecVectors.push_back(TV);
-
// The final value (T,F) is equal to the last non-dontcare state on the
// path (in a short-circuiting system).
- ExecVectors.back().push_back(MCDCCond);
+ ExecVectorsByCond[MCDCCond].push_back({TV, MCDCCond});
}
// Reset back to DontCare.
- TV[ID] = MCDCRecord::MCDC_DontCare;
+ TV.set(ID, MCDCRecord::MCDC_DontCare);
}
/// Walk the bits in the bitmap. A bit set to '1' indicates that the test
@@ -433,10 +436,18 @@ private:
// We start at the root node (ID == 0) with all values being DontCare.
// `TVIdx` starts with 0 and is in the traversal.
// `Index` encodes the bitmask of true values and is initially 0.
- MCDCRecord::TestVector TV(NumConditions, MCDCRecord::MCDC_DontCare);
- buildTestVector(TV, 0, 0, 0);
+ MCDCRecord::TestVector TV(NumConditions);
+ buildTestVector(TV, 0, 0);
assert(TVIdxs.size() == unsigned(NumTestVectors) &&
"TVIdxs wasn't fulfilled");
+
+ // Fill ExecVectors order by False items and True items.
+ // ExecVectors is the alias of ExecVectorsByCond[false], so
+ // Append ExecVectorsByCond[true] on it.
+ NumExecVectorsF = ExecVectors.size();
+ auto &ExecVectorsT = ExecVectorsByCond[true];
+ ExecVectors.append(std::make_move_iterator(ExecVectorsT.begin()),
+ std::make_move_iterator(ExecVectorsT.end()));
}
// Find an independence pair for each condition:
@@ -445,27 +456,18 @@ private:
// - All other conditions' values must be equal or marked as "don't care".
void findIndependencePairs() {
unsigned NumTVs = ExecVectors.size();
- for (unsigned I = 1; I < NumTVs; ++I) {
- const MCDCRecord::TestVector &A = ExecVectors[I];
- for (unsigned J = 0; J < I; ++J) {
- const MCDCRecord::TestVector &B = ExecVectors[J];
- // Enumerate two execution vectors whose outcomes are different.
- if (A[NumConditions] == B[NumConditions])
- continue;
- unsigned Flip = NumConditions, Idx;
- for (Idx = 0; Idx < NumConditions; ++Idx) {
- MCDCRecord::CondState ACond = A[Idx], BCond = B[Idx];
- if (ACond == BCond || ACond == MCDCRecord::MCDC_DontCare ||
- BCond == MCDCRecord::MCDC_DontCare)
- continue;
- if (Flip != NumConditions)
- break;
- Flip = Idx;
- }
+ for (unsigned I = NumExecVectorsF; I < NumTVs; ++I) {
+ const auto &[A, ACond] = ExecVectors[I];
+ assert(ACond == MCDCRecord::MCDC_True);
+ for (unsigned J = 0; J < NumExecVectorsF; ++J) {
+ const auto &[B, BCond] = ExecVectors[J];
+ assert(BCond == MCDCRecord::MCDC_False);
// If the two vectors differ in exactly one condition, ignoring DontCare
// conditions, we have found an independence pair.
- if (Idx == NumConditions && Flip != NumConditions)
- IndependencePairs.insert({Flip, std::make_pair(J + 1, I + 1)});
+ auto AB = A.getDifferences(B);
+ if (AB.count() == 1)
+ IndependencePairs.insert(
+ {AB.find_first(), std::make_pair(J + 1, I + 1)});
}
}
}
@@ -860,7 +862,8 @@ Error CoverageMapping::loadFunctionRecord(
consumeError(std::move(E));
return Error::success();
}
- Function.pushRegion(Region, *ExecutionCount, *AltExecutionCount);
+ Function.pushRegion(Region, *ExecutionCount, *AltExecutionCount,
+ ProfileReader.hasSingleByteCoverage());
// Record ExpansionRegion.
if (Region.Kind == CounterMappingRegion::ExpansionRegion) {
@@ -1282,8 +1285,14 @@ class SegmentBuilder {
// value for that area.
// We add counts of the regions of the same kind as the active region
// to handle the both situations.
- if (I->Kind == Active->Kind)
- Active->ExecutionCount += I->ExecutionCount;
+ if (I->Kind == Active->Kind) {
+ assert(I->HasSingleByteCoverage == Active->HasSingleByteCoverage &&
+ "Regions are generated in different coverage modes");
+ if (I->HasSingleByteCoverage)
+ Active->ExecutionCount = Active->ExecutionCount || I->ExecutionCount;
+ else
+ Active->ExecutionCount += I->ExecutionCount;
+ }
}
return Regions.drop_back(std::distance(++Active, End));
}
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 2eeeff987399..b9afee413853 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -1533,9 +1533,12 @@ Expected<Header> Header::readFromBuffer(const unsigned char *Buffer) {
// When a new field is added in the header add a case statement here to
// populate it.
static_assert(
- IndexedInstrProf::ProfVersion::CurrentVersion == Version11,
+ IndexedInstrProf::ProfVersion::CurrentVersion == Version12,
"Please update the reading code below if a new field has been added, "
"if not add a case statement to fall through to the latest version.");
+ case 12ull:
+ H.VTableNamesOffset = read(Buffer, offsetOf(&Header::VTableNamesOffset));
+ [[fallthrough]];
case 11ull:
[[fallthrough]];
case 10ull:
@@ -1561,10 +1564,14 @@ size_t Header::size() const {
// When a new field is added to the header add a case statement here to
// compute the size as offset of the new field + size of the new field. This
// relies on the field being added to the end of the list.
- static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version11,
+ static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version12,
"Please update the size computation below if a new field has "
"been added to the header, if not add a case statement to "
"fall through to the latest version.");
+ case 12ull:
+ return offsetOf(&Header::VTableNamesOffset) +
+ sizeof(Header::VTableNamesOffset);
+ [[fallthrough]];
case 11ull:
[[fallthrough]];
case 10ull:
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index 0d8d43daae96..31b742bca14d 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -366,6 +366,11 @@ TextInstrProfReader::readValueProfileData(InstrProfRecord &Record) {
return E;
Value = IndexedInstrProf::ComputeHash(VD.first);
}
+ } else if (ValueKind == IPVK_VTableTarget) {
+ if (InstrProfSymtab::isExternalSymbol(VD.first))
+ Value = 0;
+ else
+ Value = IndexedInstrProf::ComputeHash(VD.first);
} else {
READ_NUM(VD.first, Value);
}
@@ -582,10 +587,17 @@ Error RawInstrProfReader<IntPtrT>::readHeader(
auto NumBitmapBytes = swap(Header.NumBitmapBytes);
auto PaddingBytesAfterBitmapBytes = swap(Header.PaddingBytesAfterBitmapBytes);
auto NamesSize = swap(Header.NamesSize);
+ auto VTableNameSize = swap(Header.VNamesSize);
+ auto NumVTables = swap(Header.NumVTables);
ValueKindLast = swap(Header.ValueKindLast);
auto DataSize = NumData * sizeof(RawInstrProf::ProfileData<IntPtrT>);
- auto PaddingSize = getNumPaddingBytes(NamesSize);
+ auto PaddingBytesAfterNames = getNumPaddingBytes(NamesSize);
+ auto PaddingBytesAfterVTableNames = getNumPaddingBytes(VTableNameSize);
+
+ auto VTableSectionSize =
+ NumVTables * sizeof(RawInstrProf::VTableProfileData<IntPtrT>);
+ auto PaddingBytesAfterVTableProfData = getNumPaddingBytes(VTableSectionSize);
// Profile data starts after profile header and binary ids if exist.
ptrdiff_t DataOffset = sizeof(RawInstrProf::Header) + BinaryIdSize;
@@ -594,7 +606,12 @@ Error RawInstrProfReader<IntPtrT>::readHeader(
CountersOffset + CountersSize + PaddingBytesAfterCounters;
ptrdiff_t NamesOffset =
BitmapOffset + NumBitmapBytes + PaddingBytesAfterBitmapBytes;
- ptrdiff_t ValueDataOffset = NamesOffset + NamesSize + PaddingSize;
+ ptrdiff_t VTableProfDataOffset =
+ NamesOffset + NamesSize + PaddingBytesAfterNames;
+ ptrdiff_t VTableNameOffset = VTableProfDataOffset + VTableSectionSize +
+ PaddingBytesAfterVTableProfData;
+ ptrdiff_t ValueDataOffset =
+ VTableNameOffset + VTableNameSize + PaddingBytesAfterVTableNames;
auto *Start = reinterpret_cast<const char *>(&Header);
if (Start + ValueDataOffset > DataBuffer->getBufferEnd())
@@ -614,8 +631,14 @@ Error RawInstrProfReader<IntPtrT>::readHeader(
Data = reinterpret_cast<const RawInstrProf::ProfileData<IntPtrT> *>(
Start + DataOffset);
DataEnd = Data + NumData;
+ VTableBegin =
+ reinterpret_cast<const RawInstrProf::VTableProfileData<IntPtrT> *>(
+ Start + VTableProfDataOffset);
+ VTableEnd = VTableBegin + NumVTables;
NamesStart = Start + NamesOffset;
NamesEnd = NamesStart + NamesSize;
+ VNamesStart = Start + VTableNameOffset;
+ VNamesEnd = VNamesStart + VTableNameSize;
}
CountersStart = Start + CountersOffset;
@@ -1260,6 +1283,23 @@ Error IndexedInstrProfReader::readHeader() {
"corrupted binary ids");
}
+ if (GET_VERSION(Header->formatVersion()) >= 12) {
+ uint64_t VTableNamesOffset =
+ endian::byte_swap<uint64_t, llvm::endianness::little>(
+ Header->VTableNamesOffset);
+ const unsigned char *Ptr = Start + VTableNamesOffset;
+
+ CompressedVTableNamesLen =
+ support::endian::readNext<uint64_t, llvm::endianness::little,
+ unaligned>(Ptr);
+
+ // Writer first writes the length of compressed string, and then the actual
+ // content.
+ VTableNamePtr = (const char *)Ptr;
+ if (VTableNamePtr > (const char *)DataBuffer->getBufferEnd())
+ return make_error<InstrProfError>(instrprof_error::truncated);
+ }
+
if (GET_VERSION(Header->formatVersion()) >= 10 &&
Header->formatVersion() & VARIANT_MASK_TEMPORAL_PROF) {
uint64_t TemporalProfTracesOffset =
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index d65f8fe50313..3e0a0e0d7011 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -455,12 +455,11 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
Header.MemProfOffset = 0;
Header.BinaryIdOffset = 0;
Header.TemporalProfTracesOffset = 0;
- int N = sizeof(IndexedInstrProf::Header) / sizeof(uint64_t);
+ Header.VTableNamesOffset = 0;
- // Only write out all the fields except 'HashOffset', 'MemProfOffset',
- // 'BinaryIdOffset' and `TemporalProfTracesOffset`. We need to remember the
- // offset of these fields to allow back patching later.
- for (int I = 0; I < N - 4; I++)
+ // Only write out the first four fields. We need to remember the offset of the
+ // remaining fields to allow back patching later.
+ for (int I = 0; I < 4; I++)
OS.write(reinterpret_cast<uint64_t *>(&Header)[I]);
// Save the location of Header.HashOffset field in \c OS.
@@ -484,6 +483,9 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
uint64_t TemporalProfTracesOffset = OS.tell();
OS.write(0);
+ uint64_t VTableNamesOffset = OS.tell();
+ OS.write(0);
+
// Reserve space to write profile summary data.
uint32_t NumEntries = ProfileSummaryBuilder::DefaultCutoffs.size();
uint32_t SummarySize = Summary::getSize(Summary::NumKinds, NumEntries);
@@ -604,6 +606,31 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
OS.writeByte(0);
}
+ uint64_t VTableNamesSectionStart = OS.tell();
+
+ // Use a dummy (and uncompressed) string as compressed vtable names and get
+ // the necessary profile format change in place for version 12.
+ // TODO: Store the list of vtable names in InstrProfWriter and use the
+ // real compressed name.
+ std::string CompressedVTableNames = "VTableNames";
+
+ uint64_t CompressedStringLen = CompressedVTableNames.length();
+
+ // Record the length of compressed string.
+ OS.write(CompressedStringLen);
+
+ // Write the chars in compressed strings.
+ for (auto &c : CompressedVTableNames)
+ OS.writeByte(static_cast<uint8_t>(c));
+
+ // Pad up to a multiple of 8.
+ // InstrProfReader would read bytes according to 'CompressedStringLen'.
+ uint64_t PaddedLength = alignTo(CompressedStringLen, 8);
+
+ for (uint64_t K = CompressedStringLen; K < PaddedLength; K++) {
+ OS.writeByte(0);
+ }
+
uint64_t TemporalProfTracesSectionStart = 0;
if (static_cast<bool>(ProfileKind & InstrProfKind::TemporalProfile)) {
TemporalProfTracesSectionStart = OS.tell();
@@ -647,6 +674,7 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
// Patch the Header.TemporalProfTracesOffset (=0 for profiles without
// traces).
{TemporalProfTracesOffset, &TemporalProfTracesSectionStart, 1},
+ {VTableNamesOffset, &VTableNamesSectionStart, 1},
// Patch the summary data.
{SummaryOffset, reinterpret_cast<uint64_t *>(TheSummary.get()),
(int)(SummarySize / sizeof(uint64_t))},
@@ -699,7 +727,8 @@ Error InstrProfWriter::validateRecord(const InstrProfRecord &Func) {
std::unique_ptr<InstrProfValueData[]> VD = Func.getValueForSite(VK, S);
DenseSet<uint64_t> SeenValues;
for (uint32_t I = 0; I < ND; I++)
- if ((VK != IPVK_IndirectCallTarget) && !SeenValues.insert(VD[I].Value).second)
+ if ((VK != IPVK_IndirectCallTarget && VK != IPVK_VTableTarget) &&
+ !SeenValues.insert(VD[I].Value).second)
return make_error<InstrProfError>(instrprof_error::invalid_prof);
}
}
@@ -747,7 +776,7 @@ void InstrProfWriter::writeRecordInText(StringRef Name, uint64_t Hash,
OS << ND << "\n";
std::unique_ptr<InstrProfValueData[]> VD = Func.getValueForSite(VK, S);
for (uint32_t I = 0; I < ND; I++) {
- if (VK == IPVK_IndirectCallTarget)
+ if (VK == IPVK_IndirectCallTarget || VK == IPVK_VTableTarget)
OS << Symtab.getFuncOrVarNameIfDefined(VD[I].Value) << ":"
<< VD[I].Count << "\n";
else
diff --git a/llvm/lib/Support/BlockFrequency.cpp b/llvm/lib/Support/BlockFrequency.cpp
index 329f1e12cdc2..7d5498e7cb99 100644
--- a/llvm/lib/Support/BlockFrequency.cpp
+++ b/llvm/lib/Support/BlockFrequency.cpp
@@ -13,6 +13,8 @@
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/ScaledNumber.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -45,3 +47,18 @@ std::optional<BlockFrequency> BlockFrequency::mul(uint64_t Factor) const {
return {};
return BlockFrequency(ResultFrequency);
}
+
+void llvm::printRelativeBlockFreq(raw_ostream &OS, BlockFrequency EntryFreq,
+ BlockFrequency Freq) {
+ if (Freq == BlockFrequency(0)) {
+ OS << "0";
+ return;
+ }
+ if (EntryFreq == BlockFrequency(0)) {
+ OS << "<invalid BFI>";
+ return;
+ }
+ ScaledNumber<uint64_t> Block(Freq.getFrequency(), 0);
+ ScaledNumber<uint64_t> Entry(EntryFreq.getFrequency(), 0);
+ OS << Block / Entry;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index a99856dcc943..f147ded2ab70 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -119,8 +119,8 @@ void AArch64Arm64ECCallLowering::getThunkType(FunctionType *FT,
getThunkArgTypes(FT, AttrList, TT, Out, Arm64ArgTypes, X64ArgTypes,
HasSretPtr);
- Arm64Ty = FunctionType::get(Arm64RetTy, Arm64ArgTypes,
- TT == ThunkType::Entry && FT->isVarArg());
+ Arm64Ty = FunctionType::get(Arm64RetTy, Arm64ArgTypes, false);
+
X64Ty = FunctionType::get(X64RetTy, X64ArgTypes, false);
}
@@ -158,13 +158,13 @@ void AArch64Arm64ECCallLowering::getThunkArgTypes(
X64ArgTypes.push_back(I64Ty);
}
+ // x4
+ Arm64ArgTypes.push_back(PtrTy);
+ X64ArgTypes.push_back(PtrTy);
+ // x5
+ Arm64ArgTypes.push_back(I64Ty);
if (TT != ThunkType::Entry) {
- // x4
- Arm64ArgTypes.push_back(PtrTy);
- X64ArgTypes.push_back(PtrTy);
- // x5
- Arm64ArgTypes.push_back(I64Ty);
- // FIXME: x5 isn't actually passed/used by the x64 side; revisit once we
+ // FIXME: x5 isn't actually used by the x64 side; revisit once we
// have proper isel for varargs
X64ArgTypes.push_back(I64Ty);
}
@@ -473,10 +473,11 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) {
bool TransformDirectToSRet = X64RetType->isVoidTy() && !RetTy->isVoidTy();
unsigned ThunkArgOffset = TransformDirectToSRet ? 2 : 1;
+ unsigned PassthroughArgSize = F->isVarArg() ? 5 : Thunk->arg_size();
// Translate arguments to call.
SmallVector<Value *> Args;
- for (unsigned i = ThunkArgOffset, e = Thunk->arg_size(); i != e; ++i) {
+ for (unsigned i = ThunkArgOffset, e = PassthroughArgSize; i != e; ++i) {
Value *Arg = Thunk->getArg(i);
Type *ArgTy = Arm64Ty->getParamType(i - ThunkArgOffset);
if (ArgTy->isArrayTy() || ArgTy->isStructTy() ||
@@ -493,6 +494,22 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) {
Args.push_back(Arg);
}
+ if (F->isVarArg()) {
+ // The 5th argument to variadic entry thunks is used to model the x64 sp
+ // which is passed to the thunk in x4, this can be passed to the callee as
+ // the variadic argument start address after skipping over the 32 byte
+ // shadow store.
+
+ // The EC thunk CC will assign any argument marked as InReg to x4.
+ Thunk->addParamAttr(5, Attribute::InReg);
+ Value *Arg = Thunk->getArg(5);
+ Arg = IRB.CreatePtrAdd(Arg, IRB.getInt64(0x20));
+ Args.push_back(Arg);
+
+ // Pass in a zero variadic argument size (in x5).
+ Args.push_back(IRB.getInt64(0));
+ }
+
// Call the function passed to the thunk.
Value *Callee = Thunk->getArg(0);
Callee = IRB.CreateBitCast(Callee, PtrTy);
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index 78ea4a5180f7..8e67f0f5c881 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -213,6 +213,9 @@ def CC_AArch64_Arm64EC_VarArg : CallingConv<[
// address is passed in X9.
let Entry = 1 in
def CC_AArch64_Arm64EC_Thunk : CallingConv<[
+ // ARM64EC-specific: the InReg attribute can be used to access the x64 sp passed into entry thunks in x4 from the IR.
+ CCIfInReg<CCIfType<[i64], CCAssignToReg<[X4]>>>,
+
// Byval aggregates are passed by pointer
CCIfByVal<CCPassIndirect<i64>>,
diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index 635beeed0df8..49bcab588e52 100644
--- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -5179,7 +5179,8 @@ FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo,
const TargetLibraryInfo *LibInfo) {
SMEAttrs CallerAttrs(*FuncInfo.Fn);
- if (CallerAttrs.hasZAState() || CallerAttrs.hasStreamingInterfaceOrBody() ||
+ if (CallerAttrs.hasZAState() || CallerAttrs.hasZT0State() ||
+ CallerAttrs.hasStreamingInterfaceOrBody() ||
CallerAttrs.hasStreamingCompatibleInterface())
return nullptr;
return new AArch64FastISel(FuncInfo, LibInfo);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3b92e95d7c28..c21bc3a4abbc 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -271,11 +271,9 @@ static bool isMergePassthruOpcode(unsigned Opc) {
static bool isZeroingInactiveLanes(SDValue Op) {
switch (Op.getOpcode()) {
default:
- // We guarantee i1 splat_vectors to zero the other lanes by
- // implementing it with ptrue and possibly a punpklo for nxv1i1.
- if (ISD::isConstantSplatVectorAllOnes(Op.getNode()))
- return true;
return false;
+ // We guarantee i1 splat_vectors to zero the other lanes
+ case ISD::SPLAT_VECTOR:
case AArch64ISD::PTRUE:
case AArch64ISD::SETCC_MERGE_ZERO:
return true;
@@ -25892,7 +25890,8 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
auto CallerAttrs = SMEAttrs(*Inst.getFunction());
auto CalleeAttrs = SMEAttrs(*Base);
if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
- CallerAttrs.requiresLazySave(CalleeAttrs))
+ CallerAttrs.requiresLazySave(CalleeAttrs) ||
+ CallerAttrs.requiresPreservingZT0(CalleeAttrs))
return true;
}
return false;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index e73bc0d89e4c..b01a8cd00025 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6667,31 +6667,29 @@ def : Pat<(vector_extract (v8bf16 V128:$Rn), VectorIndexH:$idx),
// All concat_vectors operations are canonicalised to act on i64 vectors for
// AArch64. In the general case we need an instruction, which had just as well be
// INS.
-class ConcatPat<ValueType DstTy, ValueType SrcTy>
- : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)),
- (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1,
- (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>;
-
-def : ConcatPat<v2i64, v1i64>;
-def : ConcatPat<v2f64, v1f64>;
-def : ConcatPat<v4i32, v2i32>;
-def : ConcatPat<v4f32, v2f32>;
-def : ConcatPat<v8i16, v4i16>;
-def : ConcatPat<v8f16, v4f16>;
-def : ConcatPat<v8bf16, v4bf16>;
-def : ConcatPat<v16i8, v8i8>;
-
-// If the high lanes are undef, though, we can just ignore them:
-class ConcatUndefPat<ValueType DstTy, ValueType SrcTy>
- : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)),
- (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>;
-
-def : ConcatUndefPat<v2i64, v1i64>;
-def : ConcatUndefPat<v2f64, v1f64>;
-def : ConcatUndefPat<v4i32, v2i32>;
-def : ConcatUndefPat<v4f32, v2f32>;
-def : ConcatUndefPat<v8i16, v4i16>;
-def : ConcatUndefPat<v16i8, v8i8>;
+multiclass ConcatPat<ValueType DstTy, ValueType SrcTy> {
+ def : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)),
+ (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1,
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>;
+
+ // If the high lanes are zero we can instead emit a d->d register mov, which
+ // will implicitly clear the upper bits.
+ def : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), immAllZerosV)),
+ (SUBREG_TO_REG (i64 0), (FMOVDr V64:$Rn), dsub)>;
+
+ // If the high lanes are undef we can just ignore them:
+ def : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)),
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>;
+}
+
+defm : ConcatPat<v2i64, v1i64>;
+defm : ConcatPat<v2f64, v1f64>;
+defm : ConcatPat<v4i32, v2i32>;
+defm : ConcatPat<v4f32, v2f32>;
+defm : ConcatPat<v8i16, v4i16>;
+defm : ConcatPat<v8f16, v4f16>;
+defm : ConcatPat<v8bf16, v4bf16>;
+defm : ConcatPat<v16i8, v8i8>;
//----------------------------------------------------------------------------
// AdvSIMD across lanes instructions
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index 87aa3b98d938..6865850cf04f 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -127,6 +127,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
bool visitINSERT(MachineInstr &MI);
bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
bool visitINSvi64lane(MachineInstr &MI);
+ bool visitFMOVDr(MachineInstr &MI);
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override {
@@ -670,6 +671,23 @@ bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {
return true;
}
+bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) {
+ // An FMOVDr sets the high 64-bits to zero implicitly, similar to ORR for GPR.
+ MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
+ if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
+ return false;
+
+ // Let's remove MIs for high 64-bits.
+ Register OldDef = MI.getOperand(0).getReg();
+ Register NewDef = MI.getOperand(1).getReg();
+ MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
+ MRI->replaceRegWith(OldDef, NewDef);
+ LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");
+ MI.eraseFromParent();
+
+ return true;
+}
+
bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -748,6 +766,9 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
case AArch64::INSvi64lane:
Changed |= visitINSvi64lane(MI);
break;
+ case AArch64::FMOVDr:
+ Changed |= visitFMOVDr(MI);
+ break;
}
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td
index 5163de280f2e..b75264602dbc 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td
@@ -309,7 +309,6 @@ def M4WriteFMAC3H : SchedWriteRes<[M4UnitFMACH]> { let Latency = 3; }
def M4WriteFMAC3 : SchedWriteRes<[M4UnitFMAC]> { let Latency = 3; }
def M4WriteFMAC4 : SchedWriteRes<[M4UnitFMAC]> { let Latency = 4; }
def M4WriteFMAC4H : SchedWriteRes<[M4UnitFMACH]> { let Latency = 4; }
-def M4WriteFMAC5 : SchedWriteRes<[M4UnitFMAC]> { let Latency = 5; }
def M4WriteFSQR7H : SchedWriteRes<[M4UnitFSQRH]> { let Latency = 7;
let ReleaseAtCycles = [6]; }
@@ -495,8 +494,7 @@ def M4WriteMOVI : SchedWriteVariant<[SchedVar<IsZeroFPIdiomPred, [M4WriteZ0]>
// Fast forwarding.
def M4ReadAESM1 : SchedReadAdvance<+1, [M4WriteNCRY1]>;
def M4ReadFMACM1 : SchedReadAdvance<+1, [M4WriteFMAC4,
- M4WriteFMAC4H,
- M4WriteFMAC5]>;
+ M4WriteFMAC4H]>;
def M4ReadNMULM1 : SchedReadAdvance<+1, [M4WriteNMUL3]>;
def M4ReadNMULP2 : SchedReadAdvance<-2, [M4WriteNMUL3]>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td
index 2ccbe1614dcd..6b5a6da76b3a 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td
@@ -338,7 +338,6 @@ def M5WriteFDIV12 : SchedWriteRes<[M5UnitFDIV]> { let Latency = 12;
def M5WriteFMAC3 : SchedWriteRes<[M5UnitFMAC]> { let Latency = 3; }
def M5WriteFMAC4 : SchedWriteRes<[M5UnitFMAC]> { let Latency = 4; }
-def M5WriteFMAC5 : SchedWriteRes<[M5UnitFMAC]> { let Latency = 5; }
def M5WriteFSQR5 : SchedWriteRes<[M5UnitFSQR]> { let Latency = 5;
let ReleaseAtCycles = [2]; }
@@ -530,8 +529,7 @@ def M5WriteMOVI : SchedWriteVariant<[SchedVar<IsZeroFPIdiomPred, [M5WriteZ0]>
// Fast forwarding.
def M5ReadFM1 : SchedReadAdvance<+1, [M5WriteF2]>;
def M5ReadAESM2 : SchedReadAdvance<+2, [M5WriteNCRY2]>;
-def M5ReadFMACM1 : SchedReadAdvance<+1, [M5WriteFMAC4,
- M5WriteFMAC5]>;
+def M5ReadFMACM1 : SchedReadAdvance<+1, [M5WriteFMAC4]>;
def M5ReadNMULM1 : SchedReadAdvance<+1, [M5WriteNMUL3]>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index 3dc3d31a34e8..26dbad713594 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -535,7 +535,8 @@ bool AArch64CallLowering::fallBackToDAGISel(const MachineFunction &MF) const {
}
SMEAttrs Attrs(F);
- if (Attrs.hasZAState() || Attrs.hasStreamingInterfaceOrBody() ||
+ if (Attrs.hasZAState() || Attrs.hasZT0State() ||
+ Attrs.hasStreamingInterfaceOrBody() ||
Attrs.hasStreamingCompatibleInterface())
return true;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 60e046bc6cf4..91323e456a5e 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -52,6 +52,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
const LLT v16s8 = LLT::fixed_vector(16, 8);
const LLT v8s8 = LLT::fixed_vector(8, 8);
const LLT v4s8 = LLT::fixed_vector(4, 8);
+ const LLT v2s8 = LLT::fixed_vector(2, 8);
const LLT v8s16 = LLT::fixed_vector(8, 16);
const LLT v4s16 = LLT::fixed_vector(4, 16);
const LLT v2s16 = LLT::fixed_vector(2, 16);
@@ -387,8 +388,14 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.clampMaxNumElements(0, s32, 4)
.clampMaxNumElements(0, s64, 2)
.clampMaxNumElements(0, p0, 2)
+ // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
+ .bitcastIf(typeInSet(0, {v4s8}),
+ [=](const LegalityQuery &Query) {
+ const LLT VecTy = Query.Types[0];
+ return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
+ })
.customIf(IsPtrVecPred)
- .scalarizeIf(typeIs(0, v2s16), 0);
+ .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0);
getActionDefinitionsBuilder(G_STORE)
.customIf([=](const LegalityQuery &Query) {
@@ -422,8 +429,14 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.clampMaxNumElements(0, s64, 2)
.clampMaxNumElements(0, p0, 2)
.lowerIfMemSizeNotPow2()
+ // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
+ .bitcastIf(typeInSet(0, {v4s8}),
+ [=](const LegalityQuery &Query) {
+ const LLT VecTy = Query.Types[0];
+ return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
+ })
.customIf(IsPtrVecPred)
- .scalarizeIf(typeIs(0, v2s16), 0);
+ .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0);
getActionDefinitionsBuilder(G_INDEXED_STORE)
// Idx 0 == Ptr, Idx 1 == Val
@@ -1074,6 +1087,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
{s16, v8s16},
{s32, v2s32},
{s32, v4s32}})
+ .moreElementsIf(
+ [=](const LegalityQuery &Query) {
+ return Query.Types[1].isVector() &&
+ Query.Types[1].getElementType() != s8 &&
+ Query.Types[1].getNumElements() & 1;
+ },
+ LegalizeMutations::moreElementsToNextPow2(1))
.clampMaxNumElements(1, s64, 2)
.clampMaxNumElements(1, s32, 4)
.clampMaxNumElements(1, s16, 8)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index e3f724850795..57769fe998d1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2337,8 +2337,6 @@ private:
ScheduleDAGMI *DAG;
- std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations;
-
// Organize lists of SchedGroups by their SyncID. SchedGroups /
// SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
// between then.
@@ -2381,10 +2379,7 @@ public:
AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial;
IGroupLPDAGMutation() = default;
- IGroupLPDAGMutation(
- AMDGPU::SchedulingPhase Phase,
- std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations)
- : SavedMutations(SavedMutations), Phase(Phase) {}
+ IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {}
};
unsigned SchedGroup::NumSchedGroups = 0;
@@ -2602,13 +2597,6 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
PS.solve();
return;
}
-
- if (!SavedMutations)
- return;
-
- // We did not apply a mutation, fall back to SavedMutations
- for (auto &m : *SavedMutations)
- m->apply(DAG);
}
void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
@@ -2707,10 +2695,9 @@ namespace llvm {
/// same scheduling region (e.g. pre and post-RA scheduling / multiple
/// scheduling "phases"), we can reenter this mutation framework more than once
/// for a given region.
-std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(
- AMDGPU::SchedulingPhase Phase,
- std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations) {
- return std::make_unique<IGroupLPDAGMutation>(Phase, SavedMutations);
+std::unique_ptr<ScheduleDAGMutation>
+createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) {
+ return std::make_unique<IGroupLPDAGMutation>(Phase);
}
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
index 46ef4d702d00..aff7096f26d6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
@@ -20,9 +20,8 @@ namespace AMDGPU {
enum class SchedulingPhase { Initial, PreRAReentry, PostRA };
} // namespace AMDGPU
-std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(
- AMDGPU::SchedulingPhase Phase,
- std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations);
+std::unique_ptr<ScheduleDAGMutation>
+createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase);
} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 0d830df1f1f1..76e843455bab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -461,8 +461,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
- DAG->addMutation(
- createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial, nullptr));
+ DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
return DAG;
@@ -472,8 +471,7 @@ static ScheduleDAGInstrs *
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
ScheduleDAGMILive *DAG =
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
- DAG->addMutation(
- createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial, nullptr));
+ DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
return DAG;
}
@@ -937,7 +935,7 @@ public:
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
DAG->addMutation(
- createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA, nullptr));
+ createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
DAG->addMutation(createVOPDPairingMutation());
return DAG;
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index b7b471d8dc7b..18d51087ff5f 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1685,24 +1685,48 @@ public:
private:
struct OperandInfoTy {
SMLoc Loc;
- int64_t Id;
+ int64_t Val;
bool IsSymbolic = false;
bool IsDefined = false;
- OperandInfoTy(int64_t Id_) : Id(Id_) {}
+ OperandInfoTy(int64_t Val) : Val(Val) {}
};
+ struct StructuredOpField : OperandInfoTy {
+ StringLiteral Id;
+ StringLiteral Desc;
+ unsigned Width;
+ bool IsDefined = false;
+
+ StructuredOpField(StringLiteral Id, StringLiteral Desc, unsigned Width,
+ int64_t Default)
+ : OperandInfoTy(Default), Id(Id), Desc(Desc), Width(Width) {}
+ virtual ~StructuredOpField() = default;
+
+ bool Error(AMDGPUAsmParser &Parser, const Twine &Err) const {
+ Parser.Error(Loc, "invalid " + Desc + ": " + Err);
+ return false;
+ }
+
+ virtual bool validate(AMDGPUAsmParser &Parser) const {
+ if (IsSymbolic && Val == OPR_ID_UNSUPPORTED)
+ return Error(Parser, "not supported on this GPU");
+ if (!isUIntN(Width, Val))
+ return Error(Parser, "only " + Twine(Width) + "-bit values are legal");
+ return true;
+ }
+ };
+
+ ParseStatus parseStructuredOpFields(ArrayRef<StructuredOpField *> Fields);
+ bool validateStructuredOpFields(ArrayRef<const StructuredOpField *> Fields);
+
bool parseSendMsgBody(OperandInfoTy &Msg, OperandInfoTy &Op, OperandInfoTy &Stream);
bool validateSendMsg(const OperandInfoTy &Msg,
const OperandInfoTy &Op,
const OperandInfoTy &Stream);
- bool parseHwregBody(OperandInfoTy &HwReg,
- OperandInfoTy &Offset,
- OperandInfoTy &Width);
- bool validateHwreg(const OperandInfoTy &HwReg,
- const OperandInfoTy &Offset,
- const OperandInfoTy &Width);
+ ParseStatus parseHwregFunc(OperandInfoTy &HwReg, OperandInfoTy &Offset,
+ OperandInfoTy &Width);
SMLoc getFlatOffsetLoc(const OperandVector &Operands) const;
SMLoc getSMEMOffsetLoc(const OperandVector &Operands) const;
@@ -7197,71 +7221,44 @@ bool AMDGPUOperand::isDepCtr() const { return isS16Imm(); }
// hwreg
//===----------------------------------------------------------------------===//
-bool
-AMDGPUAsmParser::parseHwregBody(OperandInfoTy &HwReg,
- OperandInfoTy &Offset,
- OperandInfoTy &Width) {
+ParseStatus AMDGPUAsmParser::parseHwregFunc(OperandInfoTy &HwReg,
+ OperandInfoTy &Offset,
+ OperandInfoTy &Width) {
using namespace llvm::AMDGPU::Hwreg;
+ if (!trySkipId("hwreg", AsmToken::LParen))
+ return ParseStatus::NoMatch;
+
// The register may be specified by name or using a numeric code
HwReg.Loc = getLoc();
if (isToken(AsmToken::Identifier) &&
- (HwReg.Id = getHwregId(getTokenStr(), getSTI())) != OPR_ID_UNKNOWN) {
+ (HwReg.Val = getHwregId(getTokenStr(), getSTI())) != OPR_ID_UNKNOWN) {
HwReg.IsSymbolic = true;
lex(); // skip register name
- } else if (!parseExpr(HwReg.Id, "a register name")) {
- return false;
+ } else if (!parseExpr(HwReg.Val, "a register name")) {
+ return ParseStatus::Failure;
}
if (trySkipToken(AsmToken::RParen))
- return true;
+ return ParseStatus::Success;
// parse optional params
if (!skipToken(AsmToken::Comma, "expected a comma or a closing parenthesis"))
- return false;
+ return ParseStatus::Failure;
Offset.Loc = getLoc();
- if (!parseExpr(Offset.Id))
- return false;
+ if (!parseExpr(Offset.Val))
+ return ParseStatus::Failure;
if (!skipToken(AsmToken::Comma, "expected a comma"))
- return false;
+ return ParseStatus::Failure;
Width.Loc = getLoc();
- return parseExpr(Width.Id) &&
- skipToken(AsmToken::RParen, "expected a closing parenthesis");
-}
-
-bool
-AMDGPUAsmParser::validateHwreg(const OperandInfoTy &HwReg,
- const OperandInfoTy &Offset,
- const OperandInfoTy &Width) {
-
- using namespace llvm::AMDGPU::Hwreg;
+ if (!parseExpr(Width.Val) ||
+ !skipToken(AsmToken::RParen, "expected a closing parenthesis"))
+ return ParseStatus::Failure;
- if (HwReg.IsSymbolic) {
- if (HwReg.Id == OPR_ID_UNSUPPORTED) {
- Error(HwReg.Loc,
- "specified hardware register is not supported on this GPU");
- return false;
- }
- } else {
- if (!isValidHwreg(HwReg.Id)) {
- Error(HwReg.Loc,
- "invalid code of hardware register: only 6-bit values are legal");
- return false;
- }
- }
- if (!isValidHwregOffset(Offset.Id)) {
- Error(Offset.Loc, "invalid bit offset: only 5-bit values are legal");
- return false;
- }
- if (!isValidHwregWidth(Width.Id)) {
- Error(Width.Loc,
- "invalid bitfield width: only values from 1 to 32 are legal");
- return false;
- }
- return true;
+ return ParseStatus::Success;
}
ParseStatus AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
@@ -7270,24 +7267,40 @@ ParseStatus AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
int64_t ImmVal = 0;
SMLoc Loc = getLoc();
- if (trySkipId("hwreg", AsmToken::LParen)) {
- OperandInfoTy HwReg(OPR_ID_UNKNOWN);
- OperandInfoTy Offset(HwregOffset::Default);
- OperandInfoTy Width(HwregSize::Default);
- if (parseHwregBody(HwReg, Offset, Width) &&
- validateHwreg(HwReg, Offset, Width)) {
- ImmVal = HwregEncoding::encode(HwReg.Id, Offset.Id, Width.Id);
- } else {
- return ParseStatus::Failure;
+ StructuredOpField HwReg("id", "hardware register", HwregId::Width,
+ HwregId::Default);
+ StructuredOpField Offset("offset", "bit offset", HwregOffset::Width,
+ HwregOffset::Default);
+ struct : StructuredOpField {
+ using StructuredOpField::StructuredOpField;
+ bool validate(AMDGPUAsmParser &Parser) const override {
+ if (!isUIntN(Width, Val - 1))
+ return Error(Parser, "only values from 1 to 32 are legal");
+ return true;
}
- } else if (parseExpr(ImmVal, "a hwreg macro")) {
- if (ImmVal < 0 || !isUInt<16>(ImmVal))
- return Error(Loc, "invalid immediate: only 16-bit values are legal");
- } else {
- return ParseStatus::Failure;
+ } Width("size", "bitfield width", HwregSize::Width, HwregSize::Default);
+ ParseStatus Res = parseStructuredOpFields({&HwReg, &Offset, &Width});
+
+ if (Res.isNoMatch())
+ Res = parseHwregFunc(HwReg, Offset, Width);
+
+ if (Res.isSuccess()) {
+ if (!validateStructuredOpFields({&HwReg, &Offset, &Width}))
+ return ParseStatus::Failure;
+ ImmVal = HwregEncoding::encode(HwReg.Val, Offset.Val, Width.Val);
}
- Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, AMDGPUOperand::ImmTyHwreg));
+ if (Res.isNoMatch() &&
+ parseExpr(ImmVal, "a hwreg macro, structured immediate"))
+ Res = ParseStatus::Success;
+
+ if (!Res.isSuccess())
+ return ParseStatus::Failure;
+
+ if (!isUInt<16>(ImmVal))
+ return Error(Loc, "invalid immediate: only 16-bit values are legal");
+ Operands.push_back(
+ AMDGPUOperand::CreateImm(this, ImmVal, Loc, AMDGPUOperand::ImmTyHwreg));
return ParseStatus::Success;
}
@@ -7307,10 +7320,10 @@ AMDGPUAsmParser::parseSendMsgBody(OperandInfoTy &Msg,
Msg.Loc = getLoc();
if (isToken(AsmToken::Identifier) &&
- (Msg.Id = getMsgId(getTokenStr(), getSTI())) != OPR_ID_UNKNOWN) {
+ (Msg.Val = getMsgId(getTokenStr(), getSTI())) != OPR_ID_UNKNOWN) {
Msg.IsSymbolic = true;
lex(); // skip message name
- } else if (!parseExpr(Msg.Id, "a message name")) {
+ } else if (!parseExpr(Msg.Val, "a message name")) {
return false;
}
@@ -7318,16 +7331,16 @@ AMDGPUAsmParser::parseSendMsgBody(OperandInfoTy &Msg,
Op.IsDefined = true;
Op.Loc = getLoc();
if (isToken(AsmToken::Identifier) &&
- (Op.Id = getMsgOpId(Msg.Id, getTokenStr())) >= 0) {
+ (Op.Val = getMsgOpId(Msg.Val, getTokenStr())) >= 0) {
lex(); // skip operation name
- } else if (!parseExpr(Op.Id, "an operation name")) {
+ } else if (!parseExpr(Op.Val, "an operation name")) {
return false;
}
if (trySkipToken(AsmToken::Comma)) {
Stream.IsDefined = true;
Stream.Loc = getLoc();
- if (!parseExpr(Stream.Id))
+ if (!parseExpr(Stream.Val))
return false;
}
}
@@ -7347,17 +7360,17 @@ AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg,
bool Strict = Msg.IsSymbolic;
if (Strict) {
- if (Msg.Id == OPR_ID_UNSUPPORTED) {
+ if (Msg.Val == OPR_ID_UNSUPPORTED) {
Error(Msg.Loc, "specified message id is not supported on this GPU");
return false;
}
} else {
- if (!isValidMsgId(Msg.Id, getSTI())) {
+ if (!isValidMsgId(Msg.Val, getSTI())) {
Error(Msg.Loc, "invalid message id");
return false;
}
}
- if (Strict && (msgRequiresOp(Msg.Id, getSTI()) != Op.IsDefined)) {
+ if (Strict && (msgRequiresOp(Msg.Val, getSTI()) != Op.IsDefined)) {
if (Op.IsDefined) {
Error(Op.Loc, "message does not support operations");
} else {
@@ -7365,16 +7378,16 @@ AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg,
}
return false;
}
- if (!isValidMsgOp(Msg.Id, Op.Id, getSTI(), Strict)) {
+ if (!isValidMsgOp(Msg.Val, Op.Val, getSTI(), Strict)) {
Error(Op.Loc, "invalid operation id");
return false;
}
- if (Strict && !msgSupportsStream(Msg.Id, Op.Id, getSTI()) &&
+ if (Strict && !msgSupportsStream(Msg.Val, Op.Val, getSTI()) &&
Stream.IsDefined) {
Error(Stream.Loc, "message operation does not support streams");
return false;
}
- if (!isValidMsgStream(Msg.Id, Op.Id, Stream.Id, getSTI(), Strict)) {
+ if (!isValidMsgStream(Msg.Val, Op.Val, Stream.Val, getSTI(), Strict)) {
Error(Stream.Loc, "invalid message stream id");
return false;
}
@@ -7393,7 +7406,7 @@ ParseStatus AMDGPUAsmParser::parseSendMsg(OperandVector &Operands) {
OperandInfoTy Stream(STREAM_ID_NONE_);
if (parseSendMsgBody(Msg, Op, Stream) &&
validateSendMsg(Msg, Op, Stream)) {
- ImmVal = encodeMsg(Msg.Id, Op.Id, Stream.Id);
+ ImmVal = encodeMsg(Msg.Val, Op.Val, Stream.Val);
} else {
return ParseStatus::Failure;
}
@@ -7730,6 +7743,48 @@ AMDGPUAsmParser::getConstLoc(const OperandVector &Operands) const {
return getOperandLoc(Test, Operands);
}
+ParseStatus
+AMDGPUAsmParser::parseStructuredOpFields(ArrayRef<StructuredOpField *> Fields) {
+ if (!trySkipToken(AsmToken::LCurly))
+ return ParseStatus::NoMatch;
+
+ bool First = true;
+ while (!trySkipToken(AsmToken::RCurly)) {
+ if (!First &&
+ !skipToken(AsmToken::Comma, "comma or closing brace expected"))
+ return ParseStatus::Failure;
+
+ StringRef Id = getTokenStr();
+ SMLoc IdLoc = getLoc();
+ if (!skipToken(AsmToken::Identifier, "field name expected") ||
+ !skipToken(AsmToken::Colon, "colon expected"))
+ return ParseStatus::Failure;
+
+ auto I =
+ find_if(Fields, [Id](StructuredOpField *F) { return F->Id == Id; });
+ if (I == Fields.end())
+ return Error(IdLoc, "unknown field");
+ if ((*I)->IsDefined)
+ return Error(IdLoc, "duplicate field");
+
+ // TODO: Support symbolic values.
+ (*I)->Loc = getLoc();
+ if (!parseExpr((*I)->Val))
+ return ParseStatus::Failure;
+ (*I)->IsDefined = true;
+
+ First = false;
+ }
+ return ParseStatus::Success;
+}
+
+bool AMDGPUAsmParser::validateStructuredOpFields(
+ ArrayRef<const StructuredOpField *> Fields) {
+ return all_of(Fields, [this](const StructuredOpField *F) {
+ return F->validate(*this);
+ });
+}
+
//===----------------------------------------------------------------------===//
// swizzle
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 515b9476b25b..074e13317ef8 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1259,6 +1259,10 @@ defm DS_PK_ADD_RTN_F16 : DS_Real_gfx12<0x0aa>;
defm DS_PK_ADD_BF16 : DS_Real_gfx12<0x09b>;
defm DS_PK_ADD_RTN_BF16 : DS_Real_gfx12<0x0ab>;
+// New aliases added in GFX12 without renaming the instructions.
+def : MnemonicAlias<"ds_subrev_u32", "ds_rsub_u32">, Requires<[isGFX12Plus]>;
+def : MnemonicAlias<"ds_subrev_u64", "ds_rsub_u64">, Requires<[isGFX12Plus]>;
+
//===----------------------------------------------------------------------===//
// GFX11.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index cdc9de7f65e3..aebfe154b313 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -409,9 +409,8 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
// Sort recorded regions by pressure - highest at the front
void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
- const auto &ST = MF.getSubtarget<GCNSubtarget>();
- llvm::sort(Regions, [&ST, TargetOcc](const Region *R1, const Region *R2) {
- return R2->MaxPressure.less(ST, R1->MaxPressure, TargetOcc);
+ llvm::sort(Regions, [this, TargetOcc](const Region *R1, const Region *R2) {
+ return R2->MaxPressure.less(MF, R1->MaxPressure, TargetOcc);
});
}
@@ -517,26 +516,25 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
// Minimal Register Strategy
void GCNIterativeScheduler::scheduleMinReg(bool force) {
- const auto &ST = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const auto TgtOcc = MFI->getOccupancy();
sortRegionsByPressure(TgtOcc);
auto MaxPressure = Regions.front()->MaxPressure;
for (auto *R : Regions) {
- if (!force && R->MaxPressure.less(ST, MaxPressure, TgtOcc))
+ if (!force && R->MaxPressure.less(MF, MaxPressure, TgtOcc))
break;
BuildDAG DAG(*R, *this);
const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
const auto RP = getSchedulePressure(*R, MinSchedule);
- LLVM_DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) {
+ LLVM_DEBUG(if (R->MaxPressure.less(MF, RP, TgtOcc)) {
dbgs() << "\nWarning: Pressure becomes worse after minreg!";
printSchedRP(dbgs(), R->MaxPressure, RP);
});
- if (!force && MaxPressure.less(ST, RP, TgtOcc))
+ if (!force && MaxPressure.less(MF, RP, TgtOcc))
break;
scheduleRegion(*R, MinSchedule, RP);
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index fd8f0bebd3be..5c394e6d6296 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -88,9 +88,10 @@ void GCNRegPressure::inc(unsigned Reg,
}
}
-bool GCNRegPressure::less(const GCNSubtarget &ST,
- const GCNRegPressure& O,
+bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
unsigned MaxOccupancy) const {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+
const auto SGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumSGPRs(getSGPRNum()));
const auto VGPROcc =
@@ -104,18 +105,103 @@ bool GCNRegPressure::less(const GCNSubtarget &ST,
const auto Occ = std::min(SGPROcc, VGPROcc);
const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
+
+ // Give first precedence to the better occupancy.
if (Occ != OtherOcc)
return Occ > OtherOcc;
+ unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
+ unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
+
+ // SGPR excess pressure conditions
+ unsigned ExcessSGPR = std::max(static_cast<int>(getSGPRNum() - MaxSGPRs), 0);
+ unsigned OtherExcessSGPR =
+ std::max(static_cast<int>(O.getSGPRNum() - MaxSGPRs), 0);
+
+ auto WaveSize = ST.getWavefrontSize();
+ // The number of virtual VGPRs required to handle excess SGPR
+ unsigned VGPRForSGPRSpills = (ExcessSGPR + (WaveSize - 1)) / WaveSize;
+ unsigned OtherVGPRForSGPRSpills =
+ (OtherExcessSGPR + (WaveSize - 1)) / WaveSize;
+
+ unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs();
+
+ // Unified excess pressure conditions, accounting for VGPRs used for SGPR
+ // spills
+ unsigned ExcessVGPR =
+ std::max(static_cast<int>(getVGPRNum(ST.hasGFX90AInsts()) +
+ VGPRForSGPRSpills - MaxVGPRs),
+ 0);
+ unsigned OtherExcessVGPR =
+ std::max(static_cast<int>(O.getVGPRNum(ST.hasGFX90AInsts()) +
+ OtherVGPRForSGPRSpills - MaxVGPRs),
+ 0);
+ // Arch VGPR excess pressure conditions, accounting for VGPRs used for SGPR
+ // spills
+ unsigned ExcessArchVGPR = std::max(
+ static_cast<int>(getVGPRNum(false) + VGPRForSGPRSpills - MaxArchVGPRs),
+ 0);
+ unsigned OtherExcessArchVGPR =
+ std::max(static_cast<int>(O.getVGPRNum(false) + OtherVGPRForSGPRSpills -
+ MaxArchVGPRs),
+ 0);
+ // AGPR excess pressure conditions
+ unsigned ExcessAGPR = std::max(
+ static_cast<int>(ST.hasGFX90AInsts() ? (getAGPRNum() - MaxArchVGPRs)
+ : (getAGPRNum() - MaxVGPRs)),
+ 0);
+ unsigned OtherExcessAGPR = std::max(
+ static_cast<int>(ST.hasGFX90AInsts() ? (O.getAGPRNum() - MaxArchVGPRs)
+ : (O.getAGPRNum() - MaxVGPRs)),
+ 0);
+
+ bool ExcessRP = ExcessSGPR || ExcessVGPR || ExcessArchVGPR || ExcessAGPR;
+ bool OtherExcessRP = OtherExcessSGPR || OtherExcessVGPR ||
+ OtherExcessArchVGPR || OtherExcessAGPR;
+
+ // Give second precedence to the reduced number of spills to hold the register
+ // pressure.
+ if (ExcessRP || OtherExcessRP) {
+ // The difference in excess VGPR pressure, after including VGPRs used for
+ // SGPR spills
+ int VGPRDiff = ((OtherExcessVGPR + OtherExcessArchVGPR + OtherExcessAGPR) -
+ (ExcessVGPR + ExcessArchVGPR + ExcessAGPR));
+
+ int SGPRDiff = OtherExcessSGPR - ExcessSGPR;
+
+ if (VGPRDiff != 0)
+ return VGPRDiff > 0;
+ if (SGPRDiff != 0) {
+ unsigned PureExcessVGPR =
+ std::max(static_cast<int>(getVGPRNum(ST.hasGFX90AInsts()) - MaxVGPRs),
+ 0) +
+ std::max(static_cast<int>(getVGPRNum(false) - MaxArchVGPRs), 0);
+ unsigned OtherPureExcessVGPR =
+ std::max(
+ static_cast<int>(O.getVGPRNum(ST.hasGFX90AInsts()) - MaxVGPRs),
+ 0) +
+ std::max(static_cast<int>(O.getVGPRNum(false) - MaxArchVGPRs), 0);
+
+ // If we have a special case where there is a tie in excess VGPR, but one
+ // of the pressures has VGPR usage from SGPR spills, prefer the pressure
+ // with SGPR spills.
+ if (PureExcessVGPR != OtherPureExcessVGPR)
+ return SGPRDiff < 0;
+ // If both pressures have the same excess pressure before and after
+ // accounting for SGPR spills, prefer fewer SGPR spills.
+ return SGPRDiff > 0;
+ }
+ }
+
bool SGPRImportant = SGPROcc < VGPROcc;
const bool OtherSGPRImportant = OtherSGPROcc < OtherVGPROcc;
- // if both pressures disagree on what is more important compare vgprs
+ // If both pressures disagree on what is more important compare vgprs.
if (SGPRImportant != OtherSGPRImportant) {
SGPRImportant = false;
}
- // compare large regs pressure
+ // Give third precedence to lower register tuple pressure.
bool SGPRFirst = SGPRImportant;
for (int I = 2; I > 0; --I, SGPRFirst = !SGPRFirst) {
if (SGPRFirst) {
@@ -130,6 +216,8 @@ bool GCNRegPressure::less(const GCNSubtarget &ST,
return VW < OtherVW;
}
}
+
+ // Give final precedence to lower general RP.
return SGPRImportant ? (getSGPRNum() < O.getSGPRNum()):
(getVGPRNum(ST.hasGFX90AInsts()) <
O.getVGPRNum(ST.hasGFX90AInsts()));
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 4100970fe1a9..752f53752fa6 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -74,8 +74,20 @@ struct GCNRegPressure {
return getOccupancy(ST) > O.getOccupancy(ST);
}
- bool less(const GCNSubtarget &ST, const GCNRegPressure& O,
- unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const;
+ /// Compares \p this GCNRegpressure to \p O, returning true if \p this is
+ /// less. Since GCNRegpressure contains different types of pressures, and due
+ /// to target-specific peculiarities (e.g. we care about occupancy rather than
+ /// raw register usage), we determine if \p this GCNRegPressure is less than
+ /// \p O based on the following tiered comparisons (in order of
+ /// precedence):
+ /// 1. Better occupancy
+ /// 2. Less spilling (first preference to VGPR spills, then to SGPR spills)
+ /// 3. Less tuple register pressure (first preference to VGPR tuples if we
+ /// determine that SGPR pressure is not important)
+ /// 4. Less raw register pressure (first preference to VGPR tuples if we
+ /// determine that SGPR pressure is not important)
+ bool less(const MachineFunction &MF, const GCNRegPressure &O,
+ unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const;
bool operator==(const GCNRegPressure &O) const {
return std::equal(&Value[0], &Value[TOTAL_KINDS], O.Value);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 3f3550d3029a..9f419a7fbf68 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -713,8 +713,8 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
return false;
SavedMutations.swap(DAG.Mutations);
- DAG.addMutation(createIGroupLPDAGMutation(
- AMDGPU::SchedulingPhase::PreRAReentry, nullptr));
+ DAG.addMutation(
+ createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry));
InitialOccupancy = DAG.MinOccupancy;
// Aggressivly try to reduce register pressure in the unclustered high RP
@@ -858,8 +858,7 @@ bool GCNSchedStage::initGCNRegion() {
StageID == GCNSchedStageID::ILPInitialSchedule;
DAG.addMutation(createIGroupLPDAGMutation(
IsInitialStage ? AMDGPU::SchedulingPhase::Initial
- : AMDGPU::SchedulingPhase::PreRAReentry,
- &SavedMutations));
+ : AMDGPU::SchedulingPhase::PreRAReentry));
}
return true;
@@ -977,6 +976,7 @@ void GCNSchedStage::checkScheduling() {
unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
+
if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
PressureAfter.getAGPRNum() > MaxVGPRs ||
PressureAfter.getSGPRNum() > MaxSGPRs) {
@@ -1199,9 +1199,8 @@ bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
}
bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
- if (WavesAfter <= MFI.getMinWavesPerEU() &&
- !PressureAfter.less(ST, PressureBefore) &&
- isRegionWithExcessRP()) {
+ if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
+ !PressureAfter.less(MF, PressureBefore)) {
LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
return true;
}
@@ -1573,8 +1572,7 @@ void GCNPostScheduleDAGMILive::schedule() {
if (HasIGLPInstrs) {
SavedMutations.clear();
SavedMutations.swap(Mutations);
- addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA,
- &SavedMutations));
+ addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
}
ScheduleDAGMI::schedule();
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 46fceb8ae22c..a933c16b6ed5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1382,6 +1382,12 @@ public:
return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
}
+ /// \returns Addressable number of architectural VGPRs supported by the
+ /// subtarget.
+ unsigned getAddressableNumArchVGPRs() const {
+ return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this);
+ }
+
/// \returns Addressable number of VGPRs supported by the subtarget.
unsigned getAddressableNumVGPRs() const {
return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index d16d8ebd41a5..634b4aeb30a7 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -775,20 +775,23 @@ void SIFoldOperands::foldOperand(
Register RegSeqDstReg = UseMI->getOperand(0).getReg();
unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
- for (auto &RSUse : make_early_inc_range(MRI->use_nodbg_operands(RegSeqDstReg))) {
- MachineInstr *RSUseMI = RSUse.getParent();
+ // Grab the use operands first
+ SmallVector<MachineOperand *, 4> UsesToProcess;
+ for (auto &Use : MRI->use_nodbg_operands(RegSeqDstReg))
+ UsesToProcess.push_back(&Use);
+ for (auto *RSUse : UsesToProcess) {
+ MachineInstr *RSUseMI = RSUse->getParent();
if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI,
- RSUseMI->getOperandNo(&RSUse), FoldList))
+ RSUseMI->getOperandNo(RSUse), FoldList))
continue;
- if (RSUse.getSubReg() != RegSeqDstSubReg)
+ if (RSUse->getSubReg() != RegSeqDstSubReg)
continue;
- foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(&RSUse), FoldList,
+ foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
CopiesToReplace);
}
-
return;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index d774826c1d08..a8a33a5fecb4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -949,6 +949,8 @@ public:
return AMDGPU::S_WAIT_BVHCNT;
case AMDGPU::S_WAIT_DSCNT_soft:
return AMDGPU::S_WAIT_DSCNT;
+ case AMDGPU::S_WAIT_KMCNT_soft:
+ return AMDGPU::S_WAIT_KMCNT;
default:
return Opcode;
}
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index f62e808b33e4..4069a368f687 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -312,6 +312,10 @@ public:
SIMemOp Op, bool IsVolatile,
bool IsNonTemporal) const = 0;
+ virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
+ return false;
+ };
+
/// Inserts any necessary instructions at position \p Pos relative
/// to instruction \p MI to ensure memory instructions before \p Pos of kind
/// \p Op associated with address spaces \p AddrSpace have completed. Used
@@ -589,6 +593,15 @@ protected:
bool setScope(const MachineBasicBlock::iterator MI,
AMDGPU::CPol::CPol Value) const;
+ // Stores with system scope (SCOPE_SYS) need to wait for:
+ // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
+ // - non-returning-atomics - wait for STORECNT==0
+ // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
+ // since it does not distinguish atomics-with-return from regular stores.
+ // There is no need to wait if memory is cached (mtype != UC).
+ bool
+ insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
+
public:
SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
@@ -603,6 +616,8 @@ public:
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile,
bool IsNonTemporal) const override;
+
+ bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
};
class SIMemoryLegalizer final : public MachineFunctionPass {
@@ -2194,6 +2209,22 @@ bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
return false;
}
+bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
+ const MachineBasicBlock::iterator MI) const {
+ // TODO: implement flag for frontend to give us a hint not to insert waits.
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ const DebugLoc &DL = MI->getDebugLoc();
+
+ BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
+ BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
+ BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
+ BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
+ BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
+
+ return true;
+}
+
bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
@@ -2364,6 +2395,9 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
if (IsVolatile) {
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
+ if (Op == SIMemOp::STORE)
+ Changed |= insertWaitsBeforeSystemScopeStore(MI);
+
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
// request cross address space as only the global address space can be
@@ -2381,6 +2415,15 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
return Changed;
}
+bool SIGfx12CacheControl::expandSystemScopeStore(
+ MachineBasicBlock::iterator &MI) const {
+ MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
+ if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
+ return insertWaitsBeforeSystemScopeStore(MI);
+
+ return false;
+}
+
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
if (AtomicPseudoMIs.empty())
return false;
@@ -2467,6 +2510,10 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
Changed |= CC->enableVolatileAndOrNonTemporal(
MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
MOI.isNonTemporal());
+
+ // GFX12 specific, scope(desired coherence domain in cache hierarchy) is
+ // instruction field, do not confuse it with atomic scope.
+ Changed |= CC->expandSystemScopeStore(MI);
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 53fc2c068624..afc380b42034 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -472,12 +472,11 @@ bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
}
// Move MI before v_or_b32
- auto MBB = MI.getParent();
- MBB->remove(&MI);
- MBB->insert(getParentInst(), &MI);
+ MI.getParent()->remove(&MI);
+ getParentInst()->getParent()->insert(getParentInst(), &MI);
// Add Implicit use of preserved register
- MachineInstrBuilder MIB(*MBB->getParent(), MI);
+ MachineInstrBuilder MIB(*MI.getMF(), MI);
MIB.addReg(getPreservedOperand()->getReg(),
RegState::ImplicitKill,
getPreservedOperand()->getSubReg());
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 0fe2845f8edc..b5de311f8c58 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1601,6 +1601,7 @@ let SubtargetPredicate = isGFX12Plus in {
def S_WAIT_SAMPLECNT_soft : SOPP_Pseudo <"s_soft_wait_samplecnt", (ins s16imm:$simm16), "$simm16">;
def S_WAIT_BVHCNT_soft : SOPP_Pseudo <"s_soft_wait_bvhcnt", (ins s16imm:$simm16), "$simm16">;
def S_WAIT_DSCNT_soft : SOPP_Pseudo <"s_soft_wait_dscnt", (ins s16imm:$simm16), "$simm16">;
+ def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
}
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index ce91e05e5cc8..963dc2882fcc 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1107,10 +1107,12 @@ unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
return IsWave32 ? 1024 : 512;
}
+unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI) { return 256; }
+
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
if (STI->getFeatureBits().test(FeatureGFX90AInsts))
return 512;
- return 256;
+ return getAddressableNumArchVGPRs(STI);
}
unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
@@ -1698,16 +1700,6 @@ int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI) {
return (Idx < 0) ? Idx : Opr[Idx].Encoding;
}
-bool isValidHwreg(int64_t Id) { return 0 <= Id && isUInt<HwregId::Width>(Id); }
-
-bool isValidHwregOffset(int64_t Offset) {
- return 0 <= Offset && isUInt<HwregOffset::Width>(Offset);
-}
-
-bool isValidHwregWidth(int64_t Width) {
- return 0 <= (Width - 1) && isUInt<HwregSize::Width>(Width - 1);
-}
-
StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI) {
int Idx = getOprIdx<const MCSubtargetInfo &>(Id, Opr, OPR_SIZE, STI);
return (Idx < 0) ? "" : Opr[Idx].Name;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 6826cd273195..b2fc7d874fe5 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -295,6 +295,10 @@ unsigned getVGPREncodingGranule(
/// \returns Total number of VGPRs for given subtarget \p STI.
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI);
+/// \returns Addressable number of architectural VGPRs for a given subtarget \p
+/// STI.
+unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI);
+
/// \returns Addressable number of VGPRs for given subtarget \p STI.
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI);
@@ -1065,15 +1069,6 @@ LLVM_READONLY
int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI);
LLVM_READNONE
-bool isValidHwreg(int64_t Id);
-
-LLVM_READNONE
-bool isValidHwregOffset(int64_t Offset);
-
-LLVM_READNONE
-bool isValidHwregWidth(int64_t Width);
-
-LLVM_READNONE
StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI);
} // namespace Hwreg
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index f5424cf48d7a..dbb1977183d1 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -636,8 +636,8 @@ def VOPProfile_Base_CVT_F32_F8_OpSel : VOPProfile<[f32, i32, untyped, untyped]>
let Src1VOP3DPP = Src1RC64;
}
-let SubtargetPredicate = isGFX12Plus, mayRaiseFPException = 0,
- SchedRW = [WriteFloatCvt] in {
+let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts],
+ mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in {
defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>;
defm V_CVT_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_f32_bf8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>;
defm V_CVT_PK_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_fp8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>;
@@ -1422,12 +1422,10 @@ defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
let AssemblerPredicate = isGFX940Plus in
defm V_MOV_B64 : VOP1_Real_gfx9 <0x38>;
-let OtherPredicates = [HasFP8ConversionInsts] in {
defm V_CVT_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x54>;
defm V_CVT_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x55>;
defm V_CVT_PK_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>;
defm V_CVT_PK_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x57>;
-}
//===----------------------------------------------------------------------===//
// GFX10
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 7198a4022dae..334cfad478f1 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1647,9 +1647,7 @@ defm V_CVT_PKNORM_U16_F16 : VOP3OpSel_Real_gfx9 <0x29a>;
defm V_LSHL_ADD_U64 : VOP3_Real_vi <0x208>;
-let OtherPredicates = [HasFP8ConversionInsts] in {
defm V_CVT_PK_FP8_F32 : VOP3OpSel_Real_gfx9 <0x2a2>;
defm V_CVT_PK_BF8_F32 : VOP3OpSel_Real_gfx9 <0x2a3>;
defm V_CVT_SR_FP8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a4>;
defm V_CVT_SR_BF8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a5>;
-}
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 80d7d96a5e3c..918bdb9506b0 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -670,6 +670,7 @@ class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
let SubtargetPredicate = HasSDWA9;
let AssemblerPredicate = HasSDWA9;
+ let OtherPredicates = ps.OtherPredicates;
let AsmVariantName = !if(ps.Pfl.HasExtSDWA9, AMDGPUAsmVariants.SDWA9,
AMDGPUAsmVariants.Disable);
let DecoderNamespace = "GFX9";
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index f0b69b0b0980..e78ea63c3b4a 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -1468,15 +1468,21 @@ void ARMExpandPseudo::CMSESaveClearFPRegsV8(
if (passesFPReg)
assert(STI->hasFPRegs() && "Subtarget needs fpregs");
- // Lazy store all fp registers to the stack.
+ // Lazy store all fp registers to the stack
// This executes as NOP in the absence of floating-point support.
- MachineInstrBuilder VLSTM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM))
- .addReg(ARM::SP)
- .add(predOps(ARMCC::AL));
- for (auto R : {ARM::VPR, ARM::FPSCR, ARM::FPSCR_NZCV, ARM::Q0, ARM::Q1,
- ARM::Q2, ARM::Q3, ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7})
- VLSTM.addReg(R, RegState::Implicit |
- (LiveRegs.contains(R) ? 0 : RegState::Undef));
+ MachineInstrBuilder VLSTM =
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM))
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL))
+ .addImm(0); // Represents a pseudo register list, has no effect on
+ // the encoding.
+ // Mark non-live registers as undef
+ for (MachineOperand &MO : VLSTM->implicit_operands()) {
+ if (MO.isReg() && !MO.isDef()) {
+ Register Reg = MO.getReg();
+ MO.setIsUndef(!LiveRegs.contains(Reg));
+ }
+ }
// Restore all arguments
for (const auto &Regs : ClearedFPRegs) {
@@ -1563,14 +1569,20 @@ void ARMExpandPseudo::CMSESaveClearFPRegsV81(MachineBasicBlock &MBB,
.addImm(CMSE_FP_SAVE_SIZE >> 2)
.add(predOps(ARMCC::AL));
- // Lazy store all FP registers to the stack
- MachineInstrBuilder VLSTM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM))
- .addReg(ARM::SP)
- .add(predOps(ARMCC::AL));
- for (auto R : {ARM::VPR, ARM::FPSCR, ARM::FPSCR_NZCV, ARM::Q0, ARM::Q1,
- ARM::Q2, ARM::Q3, ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7})
- VLSTM.addReg(R, RegState::Implicit |
- (LiveRegs.contains(R) ? 0 : RegState::Undef));
+ // Lazy store all fp registers to the stack.
+ MachineInstrBuilder VLSTM =
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM))
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL))
+              .addImm(0); // Represents a pseudo register list, has no effect on
+ // the encoding.
+ // Mark non-live registers as undef
+ for (MachineOperand &MO : VLSTM->implicit_operands()) {
+ if (MO.isReg() && MO.isImplicit() && !MO.isDef()) {
+ Register Reg = MO.getReg();
+ MO.setIsUndef(!LiveRegs.contains(Reg));
+ }
+ }
} else {
// Push all the callee-saved registers (s16-s31).
MachineInstrBuilder VPUSH =
@@ -1673,9 +1685,12 @@ void ARMExpandPseudo::CMSERestoreFPRegsV8(
// Lazy load fp regs from stack.
// This executes as NOP in the absence of floating-point support.
- MachineInstrBuilder VLLDM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM))
- .addReg(ARM::SP)
- .add(predOps(ARMCC::AL));
+ MachineInstrBuilder VLLDM =
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM))
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL))
+              .addImm(0); // Represents a pseudo register list, has no effect on
+ // the encoding.
if (STI->fixCMSE_CVE_2021_35465()) {
auto Bundler = MIBundleBuilder(MBB, VLLDM);
@@ -1757,7 +1772,9 @@ void ARMExpandPseudo::CMSERestoreFPRegsV81(
// Load FP registers from stack.
BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM))
.addReg(ARM::SP)
- .add(predOps(ARMCC::AL));
+ .add(predOps(ARMCC::AL))
+        .addImm(0); // Represents a pseudo register list, has no effect on the
+ // encoding.
// Pop the stack space
BuildMI(MBB, MBBI, DL, TII->get(ARM::tADDspi), ARM::SP)
diff --git a/llvm/lib/Target/ARM/ARMInstrFormats.td b/llvm/lib/Target/ARM/ARMInstrFormats.td
index 14e315534570..404085820a66 100644
--- a/llvm/lib/Target/ARM/ARMInstrFormats.td
+++ b/llvm/lib/Target/ARM/ARMInstrFormats.td
@@ -1749,6 +1749,37 @@ class AXSI4<dag oops, dag iops, IndexMode im, InstrItinClass itin,
let Inst{8} = 0; // Single precision
}
+// Single Precision with fixed registers.
+// For when the registers-to-be-stored/loaded are fixed, e.g. VLLDM and VLSTM
+class AXSI4FR<string asm, bit et, bit load>
+ : InstARM<AddrMode4, 4, IndexModeNone, VFPLdStMulFrm, VFPDomain, "", NoItinerary> {
+ // Instruction operands.
+ bits<4> Rn;
+ bits<13> regs; // Does not affect encoding, for assembly/disassembly only.
+ list<Predicate> Predicates = [HasVFP2];
+ let OutOperandList = (outs);
+ let InOperandList = (ins GPRnopc:$Rn, pred:$p, dpr_reglist:$regs);
+ let AsmString = asm;
+ let Pattern = [];
+ let DecoderNamespace = "VFP";
+ // Encode instruction operands.
+ let Inst{19-16} = Rn;
+ let Inst{31-28} = 0b1110;
+ let Inst{27-25} = 0b110;
+ let Inst{24} = 0b0;
+ let Inst{23} = 0b0;
+ let Inst{22} = 0b0;
+ let Inst{21} = 0b1;
+ let Inst{20} = load; // Distinguishes vlldm from vlstm
+ let Inst{15-12} = 0b0000;
+ let Inst{11-9} = 0b101;
+ let Inst{8} = 0; // Single precision
+ let Inst{7} = et; // encoding type, 0 for T1 and 1 for T2.
+ let Inst{6-0} = 0b0000000;
+ let mayLoad = load;
+ let mayStore = !eq(load, 0);
+}
+
// Double precision, unary
class ADuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td
index 55d3efbd9b9a..3094a4db2b4d 100644
--- a/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -313,29 +313,51 @@ def : MnemonicAlias<"vstm", "vstmia">;
//===----------------------------------------------------------------------===//
// Lazy load / store multiple Instructions
//
-def VLLDM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
- NoItinerary, "vlldm${p}\t$Rn", "", []>,
+// VLLDM and VLSTM:
+// 2 encoding options:
+// T1 (bit 7 is 0):
+// T1 takes an optional dpr_reglist, must be '{d0-d15}' (exactly)
+// T1 require v8-M.Main, secure state, target with 16 D registers (or with no D registers - NOP)
+// T2 (bit 7 is 1):
+// T2 takes a mandatory dpr_reglist, must be '{d0-d31}' (exactly)
+// T2 require v8.1-M.Main, secure state, target with 16/32 D registers (or with no D registers - NOP)
+// (source: Arm v8-M ARM, DDI0553B.v ID16122022)
+
+def VLLDM : AXSI4FR<"vlldm${p}\t$Rn, $regs", 0, 1>,
Requires<[HasV8MMainline, Has8MSecExt]> {
- let Inst{24-23} = 0b00;
- let Inst{22} = 0;
- let Inst{21} = 1;
- let Inst{20} = 1;
- let Inst{15-12} = 0;
- let Inst{7-0} = 0;
- let mayLoad = 1;
- let Defs = [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, VPR, FPSCR, FPSCR_NZCV];
-}
-
-def VLSTM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
- NoItinerary, "vlstm${p}\t$Rn", "", []>,
+ let Defs = [VPR, FPSCR, FPSCR_NZCV, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15];
+ let DecoderMethod = "DecodeLazyLoadStoreMul";
+}
+// T1: assembly does not contain the register list.
+def : InstAlias<"vlldm${p}\t$Rn", (VLLDM GPRnopc:$Rn, pred:$p, 0)>,
+ Requires<[HasV8MMainline, Has8MSecExt]>;
+// T2: assembly must contain the register list.
+// The register list has no effect on the encoding, it is for assembly/disassembly purposes only.
+def VLLDM_T2 : AXSI4FR<"vlldm${p}\t$Rn, $regs", 1, 1>,
+ Requires<[HasV8_1MMainline, Has8MSecExt]> {
+ let Defs = [VPR, FPSCR, FPSCR_NZCV, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15,
+ D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, D31];
+ let DecoderMethod = "DecodeLazyLoadStoreMul";
+}
+// T1: assembly contains the register list.
+// The register list has no effect on the encoding, it is for assembly/disassembly purposes only.
+def VLSTM : AXSI4FR<"vlstm${p}\t$Rn, $regs", 0, 0>,
Requires<[HasV8MMainline, Has8MSecExt]> {
- let Inst{24-23} = 0b00;
- let Inst{22} = 0;
- let Inst{21} = 1;
- let Inst{20} = 0;
- let Inst{15-12} = 0;
- let Inst{7-0} = 0;
- let mayStore = 1;
+ let Defs = [VPR, FPSCR, FPSCR_NZCV];
+ let Uses = [VPR, FPSCR, FPSCR_NZCV, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15];
+ let DecoderMethod = "DecodeLazyLoadStoreMul";
+}
+// T1: assembly does not contain the register list.
+def : InstAlias<"vlstm${p}\t$Rn", (VLSTM GPRnopc:$Rn, pred:$p, 0)>,
+ Requires<[HasV8MMainline, Has8MSecExt]>;
+// T2: assembly must contain the register list.
+// The register list has no effect on the encoding, it is for assembly/disassembly purposes only.
+def VLSTM_T2 : AXSI4FR<"vlstm${p}\t$Rn, $regs", 1, 0>,
+ Requires<[HasV8_1MMainline, Has8MSecExt]> {
+ let Defs = [VPR, FPSCR, FPSCR_NZCV];
+ let Uses = [VPR, FPSCR, FPSCR_NZCV, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15,
+ D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, D31];
+ let DecoderMethod = "DecodeLazyLoadStoreMul";
}
def : InstAlias<"vpush${p} $r", (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r), 0>,
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 37bfb76a494d..5efbaf0d4106 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -450,11 +450,12 @@ class ARMAsmParser : public MCTargetAsmParser {
bool validatetSTMRegList(const MCInst &Inst, const OperandVector &Operands,
unsigned ListNo);
- int tryParseRegister();
+ int tryParseRegister(bool AllowOutofBoundReg = false);
bool tryParseRegisterWithWriteBack(OperandVector &);
int tryParseShiftRegister(OperandVector &);
bool parseRegisterList(OperandVector &, bool EnforceOrder = true,
- bool AllowRAAC = false);
+ bool AllowRAAC = false,
+ bool AllowOutOfBoundReg = false);
bool parseMemory(OperandVector &);
bool parseOperand(OperandVector &, StringRef Mnemonic);
bool parseImmExpr(int64_t &Out);
@@ -4072,7 +4073,7 @@ ParseStatus ARMAsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc,
/// Try to parse a register name. The token must be an Identifier when called,
/// and if it is a register name the token is eaten and the register number is
/// returned. Otherwise return -1.
-int ARMAsmParser::tryParseRegister() {
+int ARMAsmParser::tryParseRegister(bool AllowOutOfBoundReg) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
if (Tok.isNot(AsmToken::Identifier)) return -1;
@@ -4116,7 +4117,8 @@ int ARMAsmParser::tryParseRegister() {
}
// Some FPUs only have 16 D registers, so D16-D31 are invalid
- if (!hasD32() && RegNum >= ARM::D16 && RegNum <= ARM::D31)
+ if (!AllowOutOfBoundReg && !hasD32() && RegNum >= ARM::D16 &&
+ RegNum <= ARM::D31)
return -1;
Parser.Lex(); // Eat identifier token.
@@ -4456,7 +4458,7 @@ insertNoDuplicates(SmallVectorImpl<std::pair<unsigned, unsigned>> &Regs,
/// Parse a register list.
bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder,
- bool AllowRAAC) {
+ bool AllowRAAC, bool AllowOutOfBoundReg) {
MCAsmParser &Parser = getParser();
if (Parser.getTok().isNot(AsmToken::LCurly))
return TokError("Token is not a Left Curly Brace");
@@ -4510,7 +4512,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder,
return Error(RegLoc, "pseudo-register not allowed");
Parser.Lex(); // Eat the minus.
SMLoc AfterMinusLoc = Parser.getTok().getLoc();
- int EndReg = tryParseRegister();
+ int EndReg = tryParseRegister(AllowOutOfBoundReg);
if (EndReg == -1)
return Error(AfterMinusLoc, "register expected");
if (EndReg == ARM::RA_AUTH_CODE)
@@ -4545,7 +4547,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder,
RegLoc = Parser.getTok().getLoc();
int OldReg = Reg;
const AsmToken RegTok = Parser.getTok();
- Reg = tryParseRegister();
+ Reg = tryParseRegister(AllowOutOfBoundReg);
if (Reg == -1)
return Error(RegLoc, "register expected");
if (!AllowRAAC && Reg == ARM::RA_AUTH_CODE)
@@ -6085,8 +6087,11 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
}
case AsmToken::LBrac:
return parseMemory(Operands);
- case AsmToken::LCurly:
- return parseRegisterList(Operands, !Mnemonic.starts_with("clr"));
+ case AsmToken::LCurly: {
+ bool AllowOutOfBoundReg = Mnemonic == "vlldm" || Mnemonic == "vlstm";
+ return parseRegisterList(Operands, !Mnemonic.starts_with("clr"), false,
+ AllowOutOfBoundReg);
+ }
case AsmToken::Dollar:
case AsmToken::Hash: {
// #42 -> immediate
@@ -7596,6 +7601,33 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
const unsigned Opcode = Inst.getOpcode();
switch (Opcode) {
+ case ARM::VLLDM:
+ case ARM::VLLDM_T2:
+ case ARM::VLSTM:
+ case ARM::VLSTM_T2: {
+    // Since in some cases both T1 and T2 are valid, tablegen cannot always
+ // pick the correct instruction.
+ if (Operands.size() == 4) { // a register list has been provided
+ ARMOperand &Op = static_cast<ARMOperand &>(
+ *Operands[3]); // the register list, a dpr_reglist
+ assert(Op.isDPRRegList());
+ auto &RegList = Op.getRegList();
+ // T2 requires v8.1-M.Main (cannot be handled by tablegen)
+ if (RegList.size() == 32 && !hasV8_1MMainline()) {
+ return Error(Op.getEndLoc(), "T2 version requires v8.1-M.Main");
+ }
+ // When target has 32 D registers, T1 is undefined.
+ if (hasD32() && RegList.size() != 32) {
+ return Error(Op.getEndLoc(), "operand must be exactly {d0-d31}");
+ }
+ // When target has 16 D registers, both T1 and T2 are valid.
+ if (!hasD32() && (RegList.size() != 16 && RegList.size() != 32)) {
+ return Error(Op.getEndLoc(),
+ "operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)");
+ }
+ }
+ return false;
+ }
case ARM::t2IT: {
// Encoding is unpredictable if it ever results in a notional 'NV'
// predicate. Since we don't parse 'NV' directly this means an 'AL'
@@ -8731,6 +8763,32 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
}
switch (Inst.getOpcode()) {
+ case ARM::VLLDM:
+ case ARM::VLSTM: {
+    // In some cases both T1 and T2 are valid, causing tablegen to pick T1
+ // of T2
+ if (Operands.size() == 4) { // a register list has been provided
+ ARMOperand &Op = static_cast<ARMOperand &>(
+ *Operands[3]); // the register list, a dpr_reglist
+ assert(Op.isDPRRegList());
+ auto &RegList = Op.getRegList();
+ // When the register list is {d0-d31} the instruction has to be the T2
+ // variant
+ if (RegList.size() == 32) {
+ const unsigned Opcode =
+ (Inst.getOpcode() == ARM::VLLDM) ? ARM::VLLDM_T2 : ARM::VLSTM_T2;
+ MCInst TmpInst;
+ TmpInst.setOpcode(Opcode);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(Inst.getOperand(2));
+ TmpInst.addOperand(Inst.getOperand(3));
+ Inst = TmpInst;
+ return true;
+ }
+ }
+ return false;
+ }
// Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction.
case ARM::LDRT_POST:
case ARM::LDRBT_POST: {
diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 604f22d71119..705f3cbce12f 100644
--- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -700,6 +700,9 @@ DecodeMVEOverlappingLongShift(MCInst &Inst, unsigned Insn, uint64_t Address,
static DecodeStatus DecodeT2AddSubSPImm(MCInst &Inst, unsigned Insn,
uint64_t Address,
const MCDisassembler *Decoder);
+static DecodeStatus DecodeLazyLoadStoreMul(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const MCDisassembler *Decoder);
#include "ARMGenDisassemblerTables.inc"
@@ -7030,3 +7033,23 @@ static DecodeStatus DecodeT2AddSubSPImm(MCInst &Inst, unsigned Insn,
return DS;
}
+
+static DecodeStatus DecodeLazyLoadStoreMul(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ const unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+ // Adding Rn, holding memory location to save/load to/from, the only argument
+ // that is being encoded.
+ // '$Rn' in the assembly.
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ // An optional predicate, '$p' in the assembly.
+ DecodePredicateOperand(Inst, ARMCC::AL, Address, Decoder);
+ // An immediate that represents a floating point registers list. '$regs' in
+ // the assembly.
+ Inst.addOperand(MCOperand::createImm(0)); // Arbitrary value, has no effect.
+
+ return S;
+}
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
index fbd067d79af0..24e627cd9a4e 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
@@ -91,6 +91,38 @@ void ARMInstPrinter::printInst(const MCInst *MI, uint64_t Address,
unsigned Opcode = MI->getOpcode();
switch (Opcode) {
+ case ARM::VLLDM: {
+ const MCOperand &Reg = MI->getOperand(0);
+ O << '\t' << "vlldm" << '\t';
+ printRegName(O, Reg.getReg());
+ O << ", "
+ << "{d0 - d15}";
+ return;
+ }
+ case ARM::VLLDM_T2: {
+ const MCOperand &Reg = MI->getOperand(0);
+ O << '\t' << "vlldm" << '\t';
+ printRegName(O, Reg.getReg());
+ O << ", "
+ << "{d0 - d31}";
+ return;
+ }
+ case ARM::VLSTM: {
+ const MCOperand &Reg = MI->getOperand(0);
+ O << '\t' << "vlstm" << '\t';
+ printRegName(O, Reg.getReg());
+ O << ", "
+ << "{d0 - d15}";
+ return;
+ }
+ case ARM::VLSTM_T2: {
+ const MCOperand &Reg = MI->getOperand(0);
+ O << '\t' << "vlstm" << '\t';
+ printRegName(O, Reg.getReg());
+ O << ", "
+ << "{d0 - d31}";
+ return;
+ }
// Check for MOVs and print canonical forms, instead.
case ARM::MOVsr: {
// FIXME: Thumb variants?
diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
index bbb564356602..66a9dc46bcbf 100644
--- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
+++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
@@ -51,7 +51,7 @@ void ComputedShaderFlags::print(raw_ostream &OS) const {
if (FlagVal == 0)
return;
OS << "; Note: shader requires additional functionality:\n";
-#define SHADER_FLAG(bit, FlagName, Str) \
+#define SHADER_FEATURE_FLAG(bit, FlagName, Str) \
if (FlagName) \
OS << "; " Str "\n";
#include "llvm/BinaryFormat/DXContainerConstants.def"
diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.h b/llvm/lib/Target/DirectX/DXILShaderFlags.h
index 4f51873a2d0b..574a7b090f52 100644
--- a/llvm/lib/Target/DirectX/DXILShaderFlags.h
+++ b/llvm/lib/Target/DirectX/DXILShaderFlags.h
@@ -29,17 +29,17 @@ class GlobalVariable;
namespace dxil {
struct ComputedShaderFlags {
-#define SHADER_FLAG(bit, FlagName, Str) bool FlagName : 1;
+#define SHADER_FEATURE_FLAG(bit, FlagName, Str) bool FlagName : 1;
#include "llvm/BinaryFormat/DXContainerConstants.def"
-#define SHADER_FLAG(bit, FlagName, Str) FlagName = false;
+#define SHADER_FEATURE_FLAG(bit, FlagName, Str) FlagName = false;
ComputedShaderFlags() {
#include "llvm/BinaryFormat/DXContainerConstants.def"
}
operator uint64_t() const {
uint64_t FlagValue = 0;
-#define SHADER_FLAG(bit, FlagName, Str) \
+#define SHADER_FEATURE_FLAG(bit, FlagName, Str) \
FlagValue |= \
FlagName ? static_cast<uint64_t>(dxbc::FeatureFlags::FlagName) : 0ull;
#include "llvm/BinaryFormat/DXContainerConstants.def"
diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt
index 2870f0bb6ad3..a22a5c11e6ab 100644
--- a/llvm/lib/Target/Hexagon/CMakeLists.txt
+++ b/llvm/lib/Target/Hexagon/CMakeLists.txt
@@ -51,7 +51,6 @@ add_llvm_target(HexagonCodeGen
HexagonOptAddrMode.cpp
HexagonOptimizeSZextends.cpp
HexagonPeephole.cpp
- HexagonPostIncOpt.cpp
HexagonRDFOpt.cpp
HexagonRegisterInfo.cpp
HexagonSelectionDAGInfo.cpp
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 91cc9307786b..619c7dc69f9b 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -1655,13 +1655,6 @@ bool HexagonInstrInfo::isPostIncrement(const MachineInstr &MI) const {
return getAddrMode(MI) == HexagonII::PostInc;
}
-bool HexagonInstrInfo::isPostIncWithImmOffset(const MachineInstr &MI) const {
- unsigned BasePos, OffsetPos;
- if (!getBaseAndOffsetPosition(MI, BasePos, OffsetPos))
- return false;
- return isPostIncrement(MI) && MI.getOperand(OffsetPos).isImm();
-}
-
// Returns true if an instruction is predicated irrespective of the predicate
// sense. For example, all of the following will return true.
// if (p0) R1 = add(R2, R3)
@@ -2443,55 +2436,6 @@ bool HexagonInstrInfo::isLoopN(const MachineInstr &MI) const {
Opcode == Hexagon::J2_loop1rext;
}
-bool HexagonInstrInfo::isCircBufferInstr(const MachineInstr &MI) const {
- switch (MI.getOpcode()) {
- default:
- return false;
- case Hexagon::L2_loadalignb_pci:
- case Hexagon::L2_loadalignb_pcr:
- case Hexagon::L2_loadalignh_pci:
- case Hexagon::L2_loadalignh_pcr:
- case Hexagon::L2_loadbsw2_pci:
- case Hexagon::L2_loadbsw2_pcr:
- case Hexagon::L2_loadbsw4_pci:
- case Hexagon::L2_loadbsw4_pcr:
- case Hexagon::L2_loadbzw2_pci:
- case Hexagon::L2_loadbzw2_pcr:
- case Hexagon::L2_loadbzw4_pci:
- case Hexagon::L2_loadbzw4_pcr:
- case Hexagon::L2_loadrb_pci:
- case Hexagon::L2_loadrb_pcr:
- case Hexagon::L2_loadrd_pci:
- case Hexagon::L2_loadrd_pcr:
- case Hexagon::L2_loadrh_pci:
- case Hexagon::L2_loadrh_pcr:
- case Hexagon::L2_loadri_pci:
- case Hexagon::L2_loadri_pcr:
- case Hexagon::L2_loadrub_pci:
- case Hexagon::L2_loadrub_pcr:
- case Hexagon::L2_loadruh_pci:
- case Hexagon::L2_loadruh_pcr:
- case Hexagon::S2_storerbnew_pci:
- case Hexagon::S2_storerbnew_pcr:
- case Hexagon::S2_storerb_pci:
- case Hexagon::S2_storerb_pcr:
- case Hexagon::S2_storerd_pci:
- case Hexagon::S2_storerd_pcr:
- case Hexagon::S2_storerf_pci:
- case Hexagon::S2_storerf_pcr:
- case Hexagon::S2_storerhnew_pci:
- case Hexagon::S2_storerhnew_pcr:
- case Hexagon::S2_storerh_pci:
- case Hexagon::S2_storerh_pcr:
- case Hexagon::S2_storerinew_pci:
- case Hexagon::S2_storerinew_pcr:
- case Hexagon::S2_storeri_pci:
- case Hexagon::S2_storeri_pcr:
- return true;
- }
- return false;
-}
-
bool HexagonInstrInfo::isMemOp(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default: return false;
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
index 65783c560321..e496995d3ff1 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -434,8 +434,6 @@ public:
bool predCanBeUsedAsDotNew(const MachineInstr &MI, Register PredReg) const;
bool PredOpcodeHasJMP_c(unsigned Opcode) const;
bool predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const;
- bool isPostIncWithImmOffset(const MachineInstr &MI) const;
- bool isCircBufferInstr(const MachineInstr &MI) const;
unsigned getAddrMode(const MachineInstr &MI) const;
MachineOperand *getBaseAndOffset(const MachineInstr &MI, int64_t &Offset,
diff --git a/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp b/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp
deleted file mode 100644
index 4c845f24f76a..000000000000
--- a/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp
+++ /dev/null
@@ -1,689 +0,0 @@
-//===-- HexagonPostIncOpt.cpp - Hexagon Post Increment Optimization Pass --===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// Convert post-inc addressing mode into base-offset addressing mode.
-// Ex:
-// original loop:
-// v1 = phi(v0, v3)
-// v2,v3 = post_load v1, 4
-
-// Often, unroller creates below form of post-increments:
-// v1 = phi(v0, v3')
-// v2,v3 = post_load v1, 4
-// v2',v3'= post_load v3, 4
-
-// This can be optimized in two ways
-
-// 1.
-// v1 = phi(v0, v3')
-// v2,v3' = post_load v1, 8
-// v2' = load v3', -4
-//
-// 2.
-// v1 = phi(v0, v3')
-// v2,v3' = post_load v1, 8
-// v2' = load v1, 4
-//
-// Option 2 is favored as we can packetize two memory operations in a single
-// packet. However, this is not always favorable due to memory dependences
-// and in cases where we form a bigger chain of post-increment ops that will
-// create more spills as we can not execute post-increment ops with out
-// executing base-offset instructions.
-//===----------------------------------------------------------------------===//
-#include "HexagonInstrInfo.h"
-#include "HexagonSubtarget.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/ScheduleDAGInstrs.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CodeGen.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "hexagon-postincopt"
-
-static cl::opt<unsigned> PostIncChainThreshold(
- "post-inc-chain-threshold", cl::Hidden, cl::init(4),
- cl::desc("Limit the number of post-inc instructions in a chain."));
-
-static cl::opt<bool> PreferPostIncStore(
- "prefer-post-inc-store", cl::Hidden, cl::init(true),
- cl::desc("Prefer post-inc store in a list of loads and stores."));
-
-namespace llvm {
-void initializeHexagonPostIncOptPass(PassRegistry &);
-FunctionPass *createHexagonPostIncOpt();
-} // namespace llvm
-
-namespace {
-
-class HexagonPostIncOpt : public MachineFunctionPass {
- MachineLoopInfo *MLI = nullptr;
- const HexagonInstrInfo *HII = nullptr;
- const TargetRegisterInfo *TRI = nullptr;
- const MachineRegisterInfo *MRI = nullptr;
- const HexagonSubtarget *HST = nullptr;
-
-public:
- static char ID;
-
- HexagonPostIncOpt() : MachineFunctionPass(ID) {
- initializeHexagonPostIncOptPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AAResultsWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addRequired<MachineLoopInfo>();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
- StringRef getPassName() const override { return "Hexagon Post-Inc-Opt Pass"; }
-
- bool runOnMachineFunction(MachineFunction &Fn) override;
-
-private:
- bool translatePostIncsInLoop(MachineBasicBlock &MBB);
- void replacePostIncWithBaseOffset(MachineBasicBlock &MBB) const;
- void replacePostIncWithBaseOffset(MachineInstr &MI) const;
- bool isPostIncInsn(MachineInstr &MI) const;
- void foldAdds(MachineBasicBlock &MBB) const;
- void updateBaseAndOffset(MachineInstr &MI, MachineInstr &AddMI) const;
- void removeDeadInstructions(MachineBasicBlock &MBB) const;
-
- void generatePostInc(MachineBasicBlock &MBB);
- bool canReplaceWithPostInc(MachineInstr *MI, MachineInstr *AddMI) const;
- void replaceWithPostInc(MachineInstr *MI, MachineInstr *AddMI) const;
-
- bool isValidOffset(const MachineInstr &MI, int64_t Offset) const;
- bool isValidPostIncValue(const MachineInstr &MI, int IncVal) const;
-};
-
-class HexagonPostIncOptSchedDAG : public ScheduleDAGInstrs {
- HexagonPostIncOpt &Pass;
-
-public:
- HexagonPostIncOptSchedDAG(HexagonPostIncOpt &P, MachineFunction &MF,
- MachineLoopInfo *MLI)
- : ScheduleDAGInstrs(MF, MLI, false), Pass(P){};
- void schedule() override;
- ScheduleDAGTopologicalSort &getTopo() { return Topo; };
-};
-
-} // End anonymous namespace.
-
-char HexagonPostIncOpt::ID = 0;
-
-INITIALIZE_PASS_BEGIN(HexagonPostIncOpt, DEBUG_TYPE,
- "Hexagon Post-Inc-Opt Pass", false, false)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_END(HexagonPostIncOpt, DEBUG_TYPE, "Hexagon Post-Inc-Opt Pass",
- false, false)
-
-/// Return true if MIA dominates MIB.
-static bool dominates(MachineInstr *MIA, MachineInstr *MIB) {
- if (MIA->getParent() != MIB->getParent())
- return false; // Don't know since machine dominator tree is out of date.
-
- MachineBasicBlock *MBB = MIA->getParent();
- MachineBasicBlock::iterator I = MBB->instr_begin();
- // Iterate over the basic block until MIA or MIB is found.
- for (; &*I != MIA && &*I != MIB; ++I)
- ;
-
- // MIA dominates MIB if MIA is found first.
- return &*I == MIA;
-}
-
-// Return the Phi register value that comes from the loop block.
-static unsigned getLoopPhiReg(MachineInstr *Phi, MachineBasicBlock *LoopBB) {
- for (unsigned i = 1, e = Phi->getNumOperands(); i != e; i += 2)
- if (Phi->getOperand(i + 1).getMBB() == LoopBB)
- return Phi->getOperand(i).getReg();
- return UINT_MAX;
-}
-
-static bool isAddWithImmValue(const MachineInstr &MI) {
- // FIXME: For now, only deal with adds that have strict immediate values.
- // Some A2_addi instructions can be of the form.
- // %338:intregs = A2_addi %7:intregs, @_ZL7phs_tbl + 16
- return MI.getOpcode() == Hexagon::A2_addi && MI.getOperand(2).isImm();
-}
-
-// Compute the number of 'real' instructions in the basic block by
-// ignoring terminators.
-static unsigned getBasicBlockSize(MachineBasicBlock &MBB) {
- unsigned size = 0;
- for (auto &I : make_range(MBB.begin(), MBB.getFirstTerminator()))
- if (!I.isDebugInstr())
- size++;
- return size;
-}
-
-// Setup Post increment Schedule DAG.
-static void initPISchedDAG(HexagonPostIncOptSchedDAG &PIDAG,
- MachineBasicBlock &MBB) {
- PIDAG.startBlock(&MBB);
- PIDAG.enterRegion(&MBB, MBB.begin(), MBB.getFirstTerminator(),
- getBasicBlockSize(MBB));
- // Build the graph.
- PIDAG.schedule();
- // exitRegion() is an empty function in base class. So, safe to call it here.
- PIDAG.exitRegion();
-}
-
-// Check if post-increment candidate has any memory dependence on any
-// instruction in the chain.
-static bool hasMemoryDependency(SUnit *PostIncSU,
- SmallVector<MachineInstr *, 4> &UseList) {
-
- // FIXME: Fine tune the order dependence. Probably can only consider memory
- // related OrderKind.
- for (auto &Dep : PostIncSU->Succs)
- if (Dep.getKind() == SDep::Order)
- if (std::find(UseList.begin(), UseList.end(),
- Dep.getSUnit()->getInstr()) != UseList.end())
- return true;
-
- return false;
-}
-
-// Fold an add with immediate into either an add or a load or a store.
-void HexagonPostIncOpt::foldAdds(MachineBasicBlock &MBB) const {
- LLVM_DEBUG(dbgs() << "#Fold add instructions in this block.\n");
- for (auto &MI : make_range(MBB.getFirstNonPHI(), MBB.getFirstTerminator())) {
- if (!isAddWithImmValue(MI))
- continue;
- unsigned DefReg = MI.getOperand(0).getReg();
- unsigned AddReg = MI.getOperand(1).getReg();
- int64_t AddImm = MI.getOperand(2).getImm();
-
- SmallVector<MachineInstr *, 4> UseList;
- // Gather the uses of add instruction's def reg.
- for (auto &MO : make_range(MRI->use_begin(DefReg), MRI->use_end())) {
- MachineInstr *UseMI = MO.getParent();
- // Deal with only the instuctions that belong to this block.
- // If we cross this block, the generation of post-increment logic
- // will not be able to transform to post-inc due to dominance.
- if (UseMI->getParent() == &MBB)
- UseList.push_back(UseMI);
- }
-
- if (UseList.empty())
- continue;
-
- LLVM_DEBUG({
- dbgs() << "Current instruction considered for folding \n";
- MI.dump();
- });
-
- for (auto UseMI : UseList) {
- if (isAddWithImmValue(*UseMI)) {
- int64_t NewImm = AddImm + UseMI->getOperand(2).getImm();
- // Fold if the new immediate is with in the range.
- if (HII->isValidOffset(UseMI->getOpcode(), NewImm, TRI, false)) {
- LLVM_DEBUG({
- UseMI->dump();
- dbgs() << "\t is folded in to \n";
- });
- UseMI->getOperand(1).setReg(AddReg);
- UseMI->getOperand(2).setImm(NewImm);
- LLVM_DEBUG(UseMI->dump());
- }
- } else if (HII->isBaseImmOffset(*UseMI)) {
- LLVM_DEBUG({
- UseMI->dump();
- dbgs() << "\t is folded in to \n";
- });
- updateBaseAndOffset(*UseMI, MI);
- LLVM_DEBUG(UseMI->dump());
- }
- LLVM_DEBUG(dbgs() << "\n");
- }
- }
- removeDeadInstructions(MBB);
- LLVM_DEBUG(dbgs() << "#End of the fold instructions logic.\n");
-}
-
-void HexagonPostIncOpt::updateBaseAndOffset(MachineInstr &MI,
- MachineInstr &AddMI) const {
- assert(HII->isBaseImmOffset(MI));
- unsigned BasePos, OffsetPos;
- if (!HII->getBaseAndOffsetPosition(MI, BasePos, OffsetPos))
- return;
-
- MachineOperand &OffsetOp = MI.getOperand(OffsetPos);
- MachineOperand &BaseOp = MI.getOperand(BasePos);
-
- if (BaseOp.getReg() != AddMI.getOperand(0).getReg())
- return;
-
- unsigned IncBase = AddMI.getOperand(1).getReg();
- int64_t IncValue = AddMI.getOperand(2).getImm();
-
- int64_t NewOffset = OffsetOp.getImm() + IncValue;
- if (!isValidOffset(MI, NewOffset))
- return;
-
- OffsetOp.setImm(NewOffset);
- BaseOp.setReg(IncBase);
-}
-
-void HexagonPostIncOpt::removeDeadInstructions(MachineBasicBlock &MBB) const {
- // For MBB, check that the value defined by each instruction is used.
- // If not, delete it.
- for (MachineBasicBlock::reverse_instr_iterator MI = MBB.instr_rbegin(),
- ME = MBB.instr_rend();
- MI != ME;) {
- // From DeadMachineInstructionElem. Don't delete inline assembly.
- if (MI->isInlineAsm()) {
- ++MI;
- continue;
- }
- bool SawStore = false;
- // Check if it's safe to remove the instruction due to side effects.
- if (!MI->isSafeToMove(nullptr, SawStore)) {
- ++MI;
- continue;
- }
- unsigned Uses = 0;
- for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
- MOE = MI->operands_end();
- MOI != MOE; ++MOI) {
- if (!MOI->isReg() || !MOI->isDef())
- continue;
- unsigned reg = MOI->getReg();
- // Assume physical registers are used.
- if (Register::isPhysicalRegister(reg)) {
- Uses++;
- continue;
- }
- if (MRI->use_begin(reg) != MRI->use_end())
- Uses++;
- }
- if (!Uses) {
- MI++->eraseFromParent();
- continue;
- }
- ++MI;
- }
-}
-
-bool HexagonPostIncOpt::isPostIncInsn(MachineInstr &MI) const {
- // Predicated post-increments are not yet handled. (ISel is not generating
- // them yet). Circular buffer instructions should not be handled.
- return (HII->isPostIncWithImmOffset(MI) && !HII->isPredicated(MI) &&
- !HII->isCircBufferInstr(MI));
-}
-
-/// For instructions with a base and offset, return true if the new Offset
-/// is a valid value with the correct alignment.
-bool HexagonPostIncOpt::isValidOffset(const MachineInstr &MI,
- int64_t Offset) const {
- if (!HII->isValidOffset(MI.getOpcode(), Offset, TRI, false))
- return false;
- unsigned AlignMask = HII->getMemAccessSize(MI) - 1;
- return (Offset & AlignMask) == 0;
-}
-
-bool HexagonPostIncOpt::isValidPostIncValue(const MachineInstr &MI,
- int IncVal) const {
- unsigned AlignMask = HII->getMemAccessSize(MI) - 1;
- if ((IncVal & AlignMask) != 0)
- return false;
-
- // Number of total bits in the instruction used to encode Inc value.
- unsigned IncBits = 4;
- // For HVX instructions, the offset is 3.
- if (HexagonII::isCVI(MI.getDesc()))
- IncBits = 3;
-
- IncBits += Log2_32(HII->getMemAccessSize(MI));
- if (HII->getMemAccessSize(MI) > 8)
- IncBits = 16;
-
- int MinValidVal = -1U << (IncBits - 1);
- int MaxValidVal = ~(-1U << (IncBits - 1));
- return (IncVal >= MinValidVal && IncVal <= MaxValidVal);
-}
-
-void HexagonPostIncOptSchedDAG::schedule() {
- AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
- buildSchedGraph(AA);
-}
-
-// Replace post-increment operations with base+offset counterpart.
-void HexagonPostIncOpt::replacePostIncWithBaseOffset(
- MachineBasicBlock &MBB) const {
- LLVM_DEBUG(dbgs() << "#Replacing post-increment instructions with "
- "base+offset counterparts.\n");
-
- SmallVector<MachineInstr *, 4> MIList;
- for (auto &MI : make_range(MBB.getFirstNonPHI(), MBB.getFirstTerminator())) {
- // Check for eligible post-inc candidates.
- if (!isPostIncInsn(MI))
- continue;
- MIList.push_back(&MI);
- }
-
- for (auto MI : MIList)
- replacePostIncWithBaseOffset(*MI);
-
- LLVM_DEBUG(dbgs() << "#Done with replacing post-increment instructions.\n");
-}
-
-void HexagonPostIncOpt::replacePostIncWithBaseOffset(MachineInstr &MI) const {
- short NewOpcode = HII->changeAddrMode_pi_io(MI.getOpcode());
- if (NewOpcode < 0)
- return;
-
- unsigned BasePos = 0, OffsetPos = 0;
- if (!HII->getBaseAndOffsetPosition(MI, BasePos, OffsetPos))
- return;
- const MachineOperand &PostIncOffset = MI.getOperand(OffsetPos);
- const MachineOperand &PostIncBase = MI.getOperand(BasePos);
-
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
- MachineOperand *PostIncDest;
- MachineInstrBuilder MIB;
- if (MI.mayLoad()) {
- PostIncDest = &MI.getOperand(1);
- const MachineOperand &LDValue = MI.getOperand(0);
- MIB = BuildMI(MBB, MI, DL, HII->get(NewOpcode));
- MIB.add(LDValue).add(PostIncBase).addImm(0);
- } else {
- PostIncDest = &MI.getOperand(0);
- const MachineOperand &STValue = MI.getOperand(3);
- MIB = BuildMI(MBB, MI, DL, HII->get(NewOpcode));
- MIB.add(PostIncBase).addImm(0).add(STValue);
- }
-
- // Transfer memoperands.
- MIB->cloneMemRefs(*MBB.getParent(), MI);
-
- // Create an add instruction for the post-inc addition of offset.
- MachineInstrBuilder MIBA = BuildMI(MBB, MI, DL, HII->get(Hexagon::A2_addi));
- MIBA.add(*PostIncDest).add(PostIncBase).add(PostIncOffset);
-
- LLVM_DEBUG({
- dbgs() << "\n";
- MI.dump();
- dbgs() << "\tis tranformed to \n";
- MIB->dump();
- MIBA->dump();
- dbgs() << "\n\n";
- });
-
- MI.eraseFromParent();
-}
-
-void HexagonPostIncOpt::generatePostInc(MachineBasicBlock &MBB) {
- LLVM_DEBUG(dbgs() << "# Generate Post-inc and update uses if needed.\n");
- MachineBasicBlock::iterator MII = MBB.getFirstNonPHI();
- MachineBasicBlock::iterator MIE = MBB.instr_begin();
- bool isOK = true;
- while (MII != MIE) {
- MachineInstr *Phi = &*std::prev(MII);
- MII = std::prev(MII);
- unsigned LoopVal = getLoopPhiReg(Phi, &MBB);
- if (LoopVal == UINT_MAX)
- continue;
- MachineInstr *LoopInst = MRI->getVRegDef(LoopVal);
- if (!isAddWithImmValue(*LoopInst))
- continue;
-
- if (LoopInst->getOpcode() != Hexagon::A2_addi)
- continue;
-
- unsigned AddReg = LoopInst->getOperand(1).getReg();
- int64_t AddImm = LoopInst->getOperand(2).getImm();
- SmallVector<MachineInstr *, 4> UseList;
- MachineInstr *PostIncCandidate = nullptr;
-
- // Find the probable candidates for Post-increment instruction.
- SmallVector<MachineInstr *, 4> CandList;
- for (auto &MO : make_range(MRI->use_begin(AddReg), MRI->use_end())) {
- MachineInstr *UseMI = MO.getParent();
-
- if (UseMI == LoopInst)
- continue;
-
- if (!dominates(UseMI, LoopInst)) {
- isOK = false;
- break;
- }
- const MachineOperand *BaseOp = nullptr;
- int64_t Offset;
- bool OffsetIsScalable;
- if (!HII->isBaseImmOffset(*UseMI) ||
- !HII->getMemOperandWithOffset(*UseMI, BaseOp, Offset,
- OffsetIsScalable, TRI)) {
- isOK = false;
- break;
- }
- int64_t NewOffset = Offset - AddImm;
- if (!isValidOffset(*UseMI, NewOffset) || !BaseOp->isReg() ||
- BaseOp->getReg() != AddReg) {
- isOK = false;
- break;
- }
- if (OffsetIsScalable) {
- isOK = false;
- break;
- }
- if (Offset == 0) {
- // If you have stores in the chain, make sure they are in the beginning
- // of the list. Eg: LD, LD, ST, ST will end up as LD, LD, PostInc_ST,
- // ST.
- if (UseMI->mayStore() && PreferPostIncStore)
- CandList.insert(CandList.begin(), UseMI);
- else
- CandList.push_back(UseMI);
- continue;
- }
- UseList.push_back(UseMI);
- }
-
- if (!isOK)
- continue;
-
- for (auto MI : CandList) {
- if (!PostIncCandidate)
- PostIncCandidate = MI;
- // Push the rest of the list for updation.
- else
- UseList.push_back(MI);
- }
-
- // If a candidate is found, replace it with the post-inc instruction.
- // Also, adjust offset for other uses as needed.
- if (!PostIncCandidate || !canReplaceWithPostInc(PostIncCandidate, LoopInst))
- continue;
-
- // Logic to determine what the base register to be.
- // There are two choices:
- // 1. New address register after we updated the post-increment candidate.
- // v2,v3 = post_load v1, 4
- // v3 is the choice here.
- // 2. The base register we used in post-increment candidate.
- // v2,v3 = post_load v1, 4
- // v1 is the choice here.
- // Use v3 if there is a memory dependence between post-inc instruction and
- // any other instruction in the chain.
- // FIXME: We can do some complex DAG analysis based off height and depth and
- // selectively update other instructions in the chain. Use v3 if there are
- // more instructions in the chain, otherwise we will end up increasing the
- // height of the DAG resulting in more spills. By default we have a
- // threshold controlled by the option "post-inc-chain-threshold" which is
- // set to 4. v1 is preferred as we can packetize two memory operations in a
- // single packet in scalar core. But it heavily depends on the structure of
- // DAG.
- bool UpdateBaseToNew = false;
-
- // Do not bother to build a DAG and analyze if the Use list is empty.
- if (!UseList.empty()) {
- MachineFunction *MF = MBB.getParent();
- // Setup the Post-inc schedule DAG.
- HexagonPostIncOptSchedDAG PIDAG(*this, *MF, MLI);
- initPISchedDAG(PIDAG, MBB);
- SUnit *SU = PIDAG.getSUnit(PostIncCandidate);
- if (hasMemoryDependency(SU, UseList) ||
- UseList.size() >= PostIncChainThreshold)
- UpdateBaseToNew = true;
- }
-
- if (UpdateBaseToNew) {
- LLVM_DEBUG(dbgs() << "The heuristic determines to update the uses of the "
- "base register of post-increment\n");
- for (auto UseMI : UseList) {
- if (!dominates(PostIncCandidate, UseMI))
- continue;
- unsigned BasePos, OffsetPos;
- if (HII->getBaseAndOffsetPosition(*UseMI, BasePos, OffsetPos)) {
- // New offset has already been validated; no need to do it again.
- LLVM_DEBUG({
- UseMI->dump();
- dbgs() << "\t is transformed to \n";
- });
- int64_t NewOffset = UseMI->getOperand(OffsetPos).getImm() - AddImm;
- UseMI->getOperand(OffsetPos).setImm(NewOffset);
- UseMI->getOperand(BasePos).setReg(LoopVal);
- LLVM_DEBUG(UseMI->dump());
- }
- }
- }
- replaceWithPostInc(PostIncCandidate, LoopInst);
- }
- LLVM_DEBUG(dbgs() << "# End of generation of Post-inc.\n");
-}
-
-bool HexagonPostIncOpt::canReplaceWithPostInc(MachineInstr *MI,
- MachineInstr *AddMI) const {
- if (HII->changeAddrMode_io_pi(MI->getOpcode()) < 0)
- return false;
- assert(AddMI->getOpcode() == Hexagon::A2_addi);
- return isValidPostIncValue(*MI, AddMI->getOperand(2).getImm());
-}
-
-void HexagonPostIncOpt::replaceWithPostInc(MachineInstr *MI,
- MachineInstr *AddMI) const {
- short NewOpcode = HII->changeAddrMode_io_pi(MI->getOpcode());
- assert(NewOpcode >= 0 &&
- "Couldn't change base offset to post-increment form");
-
- MachineBasicBlock &MBB = *MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
- const MachineOperand &IncDest = AddMI->getOperand(0);
- const MachineOperand &IncBase = AddMI->getOperand(1);
- const MachineOperand &IncValue = AddMI->getOperand(2);
- MachineInstrBuilder MIB;
- LLVM_DEBUG({
- dbgs() << "\n\n";
- MI->dump();
- dbgs() << "\t is tranformed to post-inc form of \n";
- });
-
- if (MI->mayLoad()) {
- const MachineOperand &LDValue = MI->getOperand(0);
- MIB = BuildMI(MBB, *MI, DL, HII->get(NewOpcode));
- MIB.add(LDValue).add(IncDest).add(IncBase).add(IncValue);
- } else {
- const MachineOperand &STValue = MI->getOperand(2);
- MIB = BuildMI(MBB, *MI, DL, HII->get(NewOpcode));
- MIB.add(IncDest).add(IncBase).add(IncValue).add(STValue);
- }
-
- // Transfer memoperands.
- MIB->cloneMemRefs(*MBB.getParent(), *MI);
-
- LLVM_DEBUG({
- MIB->dump();
- dbgs() << "As a result this add instruction is erased.\n";
- AddMI->dump();
- });
-
- MI->eraseFromParent();
- AddMI->eraseFromParent();
-}
-
-bool HexagonPostIncOpt::translatePostIncsInLoop(MachineBasicBlock &MBB) {
- // Algorithm:
- // 1. Replace all the post-inc instructions with Base+Offset instruction and
- // an add instruction in this block.
- // 2. Fold all the adds in to respective uses.
- // 3. Generate post-increment instructions and update the uses of the base
- // register if needed based on constraints.
-
- replacePostIncWithBaseOffset(MBB);
- foldAdds(MBB);
- generatePostInc(MBB);
- return true;
-}
-
-bool HexagonPostIncOpt::runOnMachineFunction(MachineFunction &MF) {
-
- // Skip pass if requested.
- if (skipFunction(MF.getFunction()))
- return false;
-
- // Get Target Information.
- MLI = &getAnalysis<MachineLoopInfo>();
- HST = &MF.getSubtarget<HexagonSubtarget>();
- TRI = HST->getRegisterInfo();
- MRI = &MF.getRegInfo();
- HII = HST->getInstrInfo();
-
- // Skip this pass for TinyCore.
- // Tiny core allwos partial post increment operations - This constraint can
- // be imposed inside the pass. In a chain of post-increments, the first can
- // be post-increment, rest can be adjusted to base+offset (these are
- // inexpensive in most of the cases);
- if (HST->isTinyCore())
- return false;
-
- LLVM_DEBUG({
- dbgs() << "Begin: Hexagon Post-Inc-Opt Pass.\n";
- dbgs() << "Function: " << MF.getName() << "\n";
- });
- bool Change = false;
- std::vector<MachineBasicBlock *> MLBB;
- for (auto &BB : MF) {
- // Check if this Basic Block belongs to any loop.
- auto *LI = MLI->getLoopFor(&BB);
- // We only deal with inner-most loops that has one block.
- if (LI && LI->getBlocks().size() == 1) {
- MachineBasicBlock *MBB = LI->getHeader();
- // Do not traverse blocks that are already visited.
- if (std::find(MLBB.begin(), MLBB.end(), MBB) != MLBB.end())
- continue;
-
- MLBB.push_back(MBB);
-
- LLVM_DEBUG(dbgs() << "\n\t Basic Block: " << MBB->getName() << "\n");
- Change |= translatePostIncsInLoop(*MBB);
- }
- }
- LLVM_DEBUG(dbgs() << "End: Hexagon Post-Inc-Opt Pass\n");
- return Change;
-}
-
-FunctionPass *llvm::createHexagonPostIncOpt() {
- return new HexagonPostIncOpt();
-}
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 170684276ac3..7d7728633939 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -129,10 +129,6 @@ static cl::opt<bool> EnableInstSimplify("hexagon-instsimplify", cl::Hidden,
cl::init(true),
cl::desc("Enable instsimplify"));
-static cl::opt<bool> DisableHexagonPostIncOpt(
- "hexagon-postinc-opt", cl::Hidden,
- cl::desc("Disable Hexagon post-increment optimization"));
-
/// HexagonTargetMachineModule - Note that this is used on hosts that
/// cannot link in a library unless there are references into the
/// library. In particular, it seems that it is not possible to get
@@ -171,7 +167,6 @@ namespace llvm {
void initializeHexagonNewValueJumpPass(PassRegistry&);
void initializeHexagonOptAddrModePass(PassRegistry&);
void initializeHexagonPacketizerPass(PassRegistry&);
- void initializeHexagonPostIncOptPass(PassRegistry &);
void initializeHexagonRDFOptPass(PassRegistry&);
void initializeHexagonSplitDoubleRegsPass(PassRegistry&);
void initializeHexagonTfrCleanupPass(PassRegistry &);
@@ -205,7 +200,6 @@ namespace llvm {
FunctionPass *createHexagonOptimizeSZextends();
FunctionPass *createHexagonPacketizer(bool Minimal);
FunctionPass *createHexagonPeephole();
- FunctionPass *createHexagonPostIncOpt();
FunctionPass *createHexagonRDFOpt();
FunctionPass *createHexagonSplitConst32AndConst64();
FunctionPass *createHexagonSplitDoubleRegs();
@@ -237,7 +231,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonTarget() {
initializeHexagonNewValueJumpPass(PR);
initializeHexagonOptAddrModePass(PR);
initializeHexagonPacketizerPass(PR);
- initializeHexagonPostIncOptPass(PR);
initializeHexagonRDFOptPass(PR);
initializeHexagonSplitDoubleRegsPass(PR);
initializeHexagonVectorCombineLegacyPass(PR);
@@ -266,7 +259,6 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
TLOF(std::make_unique<HexagonTargetObjectFile>()) {
initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry());
initializeHexagonTfrCleanupPass(*PassRegistry::getPassRegistry());
- initializeHexagonPostIncOptPass(*PassRegistry::getPassRegistry());
initAsmInfo();
}
@@ -443,11 +435,6 @@ void HexagonPassConfig::addPreRegAlloc() {
if (!DisableHardwareLoops)
addPass(createHexagonHardwareLoops());
}
-
- if (TM->getOptLevel() >= CodeGenOptLevel::Aggressive)
- if (!DisableHexagonPostIncOpt)
- addPass(createHexagonPostIncOpt());
-
if (TM->getOptLevel() >= CodeGenOptLevel::Default)
addPass(&MachinePipelinerID);
}
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index 98404121bda0..ca982696b060 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -18,7 +18,6 @@
#include "HexagonDepITypes.h"
#include "MCTargetDesc/HexagonMCTargetDesc.h"
-#include "llvm/MC/MCInstrDesc.h"
namespace llvm {
@@ -49,7 +48,7 @@ namespace HexagonII {
// MCInstrDesc TSFlags
// *** Must match HexagonInstrFormat*.td ***
- enum HexagonTSFlagsVal {
+ enum {
// This 7-bit field describes the insn type.
TypePos = 0,
TypeMask = 0x7f,
@@ -174,11 +173,6 @@ namespace HexagonII {
hasUnaryRestrictionMask = 0x1,
};
- inline unsigned getTSFlags(const MCInstrDesc &MID, HexagonTSFlagsVal Pos,
- unsigned Mask) {
- return (MID.TSFlags >> Pos) & Mask;
- }
-
// *** The code above must match HexagonInstrFormat*.td *** //
// Hexagon specific MO operand flag mask.
@@ -281,10 +275,6 @@ namespace HexagonII {
INST_ICLASS_ALU32_3 = 0xf0000000
};
- inline bool isCVI(const MCInstrDesc &MID) {
- return getTSFlags(MID, isCVIPos, isCVIMask) != 0;
- }
-
LLVM_ATTRIBUTE_UNUSED
static unsigned getMemAccessSizeInBytes(MemAccessSize S) {
switch (S) {
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 76c1a14fe015..3324dd2e8fc2 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -968,6 +968,28 @@ static SDValue checkIntrinsicImmArg(SDValue Op, unsigned ImmOp,
return SDValue();
}
+static SDValue checkAndModifyXVPERMI_QIntrinsicImmArg(SDValue Op,
+ SelectionDAG &DAG) {
+ SDValue Op3 = Op->getOperand(3);
+ uint64_t Imm = Op3->getAsZExtVal();
+ // Check the range of ImmArg.
+ if (!isUInt<8>(Imm)) {
+ DAG.getContext()->emitError(Op->getOperationName(0) +
+ ": argument out of range.");
+ return DAG.getNode(ISD::UNDEF, SDLoc(Op), Op.getValueType());
+ }
+
+ // For instruction xvpermi.q, only [1:0] and [5:4] bits of operands[3]
+ // are used. The unused bits in operands[3] need to be set to 0 to avoid
+ // causing undefined behavior on LA464.
+ if ((Imm & 0x33) != Imm) {
+ Op3 = DAG.getTargetConstant(Imm & 0x33, SDLoc(Op), Op3.getValueType());
+ DAG.UpdateNodeOperands(Op.getNode(), Op->getOperand(0), Op->getOperand(1),
+ Op->getOperand(2), Op3);
+ }
+ return SDValue();
+}
+
SDValue
LoongArchTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
@@ -1225,13 +1247,14 @@ LoongArchTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::loongarch_lsx_vextrins_d:
case Intrinsic::loongarch_lasx_xvshuf4i_d:
case Intrinsic::loongarch_lasx_xvpermi_w:
- case Intrinsic::loongarch_lasx_xvpermi_q:
case Intrinsic::loongarch_lasx_xvbitseli_b:
case Intrinsic::loongarch_lasx_xvextrins_b:
case Intrinsic::loongarch_lasx_xvextrins_h:
case Intrinsic::loongarch_lasx_xvextrins_w:
case Intrinsic::loongarch_lasx_xvextrins_d:
return checkIntrinsicImmArg<8>(Op, 3, DAG);
+ case Intrinsic::loongarch_lasx_xvpermi_q:
+ return checkAndModifyXVPERMI_QIntrinsicImmArg(Op, DAG);
case Intrinsic::loongarch_lsx_vrepli_b:
case Intrinsic::loongarch_lsx_vrepli_h:
case Intrinsic::loongarch_lsx_vrepli_w:
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index 27d7f0f261d1..adfcea736158 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -1255,7 +1255,9 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
emitRRI(Mips::SD, GPReg, Mips::SP, RegOrOffset, SMLoc(), &STI);
}
- if (getABI().IsN32()) {
+#if 0
+ // We haven't support -mabicalls -mno-shared yet.
+ if (-mno-shared) {
MCSymbol *GPSym = MCA.getContext().getOrCreateSymbol("__gnu_local_gp");
const MipsMCExpr *HiExpr = MipsMCExpr::create(
MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(GPSym, MCA.getContext()),
@@ -1273,6 +1275,7 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
return;
}
+#endif
const MipsMCExpr *HiExpr = MipsMCExpr::createGpOff(
MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(&Sym, MCA.getContext()),
@@ -1288,8 +1291,11 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
emitRRX(Mips::ADDiu, GPReg, GPReg, MCOperand::createExpr(LoExpr), SMLoc(),
&STI);
- // daddu $gp, $gp, $funcreg
- emitRRR(Mips::DADDu, GPReg, GPReg, RegNo, SMLoc(), &STI);
+ // (d)addu $gp, $gp, $funcreg
+ if (getABI().IsN32())
+ emitRRR(Mips::ADDu, GPReg, GPReg, RegNo, SMLoc(), &STI);
+ else
+ emitRRR(Mips::DADDu, GPReg, GPReg, RegNo, SMLoc(), &STI);
}
void MipsTargetELFStreamer::emitDirectiveCpreturn(unsigned SaveLocation,
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index b2812f87914d..97e830cec27c 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -4128,14 +4128,18 @@ MipsTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
case 'd': // Address register. Same as 'r' unless generating MIPS16 code.
case 'y': // Same as 'r'. Exists for compatibility.
case 'r':
- if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8 || VT == MVT::i1) {
+ if ((VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8 ||
+ VT == MVT::i1) ||
+ (VT == MVT::f32 && Subtarget.useSoftFloat())) {
if (Subtarget.inMips16Mode())
return std::make_pair(0U, &Mips::CPU16RegsRegClass);
return std::make_pair(0U, &Mips::GPR32RegClass);
}
- if (VT == MVT::i64 && !Subtarget.isGP64bit())
+ if ((VT == MVT::i64 || (VT == MVT::f64 && Subtarget.useSoftFloat())) &&
+ !Subtarget.isGP64bit())
return std::make_pair(0U, &Mips::GPR32RegClass);
- if (VT == MVT::i64 && Subtarget.isGP64bit())
+ if ((VT == MVT::i64 || (VT == MVT::f64 && Subtarget.useSoftFloat())) &&
+ Subtarget.isGP64bit())
return std::make_pair(0U, &Mips::GPR64RegClass);
// This will generate an error message
return std::make_pair(0U, nullptr);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 0c98642748d4..dde1882f5eea 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2610,7 +2610,7 @@ static SDValue convertToScalableVector(EVT VT, SDValue V, SelectionDAG &DAG,
assert(V.getValueType().isFixedLengthVector() &&
"Expected a fixed length vector operand!");
SDLoc DL(V);
- SDValue Zero = DAG.getConstant(0, DL, Subtarget.getXLenVT());
+ SDValue Zero = DAG.getVectorIdxConstant(0, DL);
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
}
@@ -3472,7 +3472,7 @@ static SDValue lowerBuildVectorViaDominantValues(SDValue Op, SelectionDAG &DAG,
continue;
if (ValueCounts[V] == 1) {
Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V,
- DAG.getConstant(OpIdx.index(), DL, XLenVT));
+ DAG.getVectorIdxConstant(OpIdx.index(), DL));
} else {
// Blend in all instances of this value using a VSELECT, using a
// mask where each bit signals whether that element is the one
@@ -3688,7 +3688,7 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,
SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ViaVecVT,
DAG.getUNDEF(ViaVecVT),
DAG.getConstant(SplatValue, DL, XLenVT),
- DAG.getConstant(0, DL, XLenVT));
+ DAG.getVectorIdxConstant(0, DL));
if (ViaVecLen != 1)
Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
MVT::getVectorVT(ViaIntVT, 1), Vec,
@@ -4141,9 +4141,9 @@ static SDValue lowerScalarInsert(SDValue Scalar, SDValue VL, MVT VT,
}
if (ExtractedContainerVT.bitsLE(VT))
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Passthru,
- ExtractedVal, DAG.getConstant(0, DL, XLenVT));
+ ExtractedVal, DAG.getVectorIdxConstant(0, DL));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtractedVal,
- DAG.getConstant(0, DL, XLenVT));
+ DAG.getVectorIdxConstant(0, DL));
}
}
@@ -5020,12 +5020,12 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
assert(EvenSrc >= 0 && "Undef source?");
EvenV = (EvenSrc / Size) == 0 ? V1 : V2;
EvenV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, EvenV,
- DAG.getConstant(EvenSrc % Size, DL, XLenVT));
+ DAG.getVectorIdxConstant(EvenSrc % Size, DL));
assert(OddSrc >= 0 && "Undef source?");
OddV = (OddSrc / Size) == 0 ? V1 : V2;
OddV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, OddV,
- DAG.getConstant(OddSrc % Size, DL, XLenVT));
+ DAG.getVectorIdxConstant(OddSrc % Size, DL));
return getWideningInterleave(EvenV, OddV, DL, DAG, Subtarget);
}
@@ -6088,7 +6088,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return SDValue();
return DAG.getBitcast(VT, DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, BVT,
DAG.getUNDEF(BVT), Op0,
- DAG.getConstant(0, DL, XLenVT)));
+ DAG.getVectorIdxConstant(0, DL)));
}
return SDValue();
}
@@ -6101,7 +6101,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return SDValue();
SDValue BVec = DAG.getBitcast(BVT, Op0);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, BVec,
- DAG.getConstant(0, DL, XLenVT));
+ DAG.getVectorIdxConstant(0, DL));
}
return SDValue();
}
@@ -6600,8 +6600,9 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
// Don't insert undef subvectors.
if (SubVec.isUndef())
continue;
- Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Vec, SubVec,
- DAG.getIntPtrConstant(OpIdx.index() * NumOpElts, DL));
+ Vec =
+ DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Vec, SubVec,
+ DAG.getVectorIdxConstant(OpIdx.index() * NumOpElts, DL));
}
return Vec;
}
@@ -8404,7 +8405,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
if (!EltVT.isInteger()) {
// Floating-point extracts are handled in TableGen.
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vec,
- DAG.getConstant(0, DL, XLenVT));
+ DAG.getVectorIdxConstant(0, DL));
}
SDValue Elt0 = DAG.getNode(RISCVISD::VMV_X_S, DL, XLenVT, Vec);
@@ -8837,7 +8838,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
case Intrinsic::riscv_vfmv_f_s:
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
- Op.getOperand(1), DAG.getConstant(0, DL, XLenVT));
+ Op.getOperand(1), DAG.getVectorIdxConstant(0, DL));
case Intrinsic::riscv_vmv_v_x:
return lowerScalarSplat(Op.getOperand(1), Op.getOperand(2),
Op.getOperand(3), Op.getSimpleValueType(), DL, DAG,
@@ -9437,15 +9438,15 @@ static SDValue lowerReductionSeq(unsigned RVVOpcode, MVT ResVT,
SDValue InitialValue = lowerScalarInsert(StartValue, InnerVL, InnerVT, DL,
DAG, Subtarget);
if (M1VT != InnerVT)
- InitialValue = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, M1VT,
- DAG.getUNDEF(M1VT),
- InitialValue, DAG.getConstant(0, DL, XLenVT));
+ InitialValue =
+ DAG.getNode(ISD::INSERT_SUBVECTOR, DL, M1VT, DAG.getUNDEF(M1VT),
+ InitialValue, DAG.getVectorIdxConstant(0, DL));
SDValue PassThru = NonZeroAVL ? DAG.getUNDEF(M1VT) : InitialValue;
SDValue Policy = DAG.getTargetConstant(RISCVII::TAIL_AGNOSTIC, DL, XLenVT);
SDValue Ops[] = {PassThru, Vec, InitialValue, Mask, VL, Policy};
SDValue Reduction = DAG.getNode(RVVOpcode, DL, M1VT, Ops);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Reduction,
- DAG.getConstant(0, DL, XLenVT));
+ DAG.getVectorIdxConstant(0, DL));
}
SDValue RISCVTargetLowering::lowerVECREDUCE(SDValue Op,
@@ -9490,9 +9491,8 @@ SDValue RISCVTargetLowering::lowerVECREDUCE(SDValue Op,
case ISD::UMIN:
case ISD::SMAX:
case ISD::SMIN:
- MVT XLenVT = Subtarget.getXLenVT();
StartV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VecEltVT, Vec,
- DAG.getConstant(0, DL, XLenVT));
+ DAG.getVectorIdxConstant(0, DL));
}
return lowerReductionSeq(RVVOpcode, Op.getSimpleValueType(), StartV, Vec,
Mask, VL, DL, DAG, Subtarget);
@@ -9521,10 +9521,9 @@ getRVVFPReductionOpAndOperands(SDValue Op, SelectionDAG &DAG, EVT EltVT,
Op.getOperand(0));
case ISD::VECREDUCE_FMIN:
case ISD::VECREDUCE_FMAX: {
- MVT XLenVT = Subtarget.getXLenVT();
SDValue Front =
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Op.getOperand(0),
- DAG.getConstant(0, DL, XLenVT));
+ DAG.getVectorIdxConstant(0, DL));
unsigned RVVOpc = (Opcode == ISD::VECREDUCE_FMIN)
? RISCVISD::VECREDUCE_FMIN_VL
: RISCVISD::VECREDUCE_FMAX_VL;
@@ -9646,14 +9645,14 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
if (OrigIdx == 0 && Vec.isUndef() && VecVT.isFixedLengthVector()) {
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT,
DAG.getUNDEF(ContainerVT), SubVec,
- DAG.getConstant(0, DL, XLenVT));
+ DAG.getVectorIdxConstant(0, DL));
SubVec = convertFromScalableVector(VecVT, SubVec, DAG, Subtarget);
return DAG.getBitcast(Op.getValueType(), SubVec);
}
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT,
DAG.getUNDEF(ContainerVT), SubVec,
- DAG.getConstant(0, DL, XLenVT));
+ DAG.getVectorIdxConstant(0, DL));
SDValue Mask =
getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget).first;
// Set the vector length to only the number of elements we care about. Note
@@ -9720,17 +9719,24 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
// Extract a subvector equal to the nearest full vector register type. This
// should resolve to a EXTRACT_SUBREG instruction.
AlignedExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InterSubVT, Vec,
- DAG.getConstant(AlignedIdx, DL, XLenVT));
+ DAG.getVectorIdxConstant(AlignedIdx, DL));
}
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InterSubVT,
DAG.getUNDEF(InterSubVT), SubVec,
- DAG.getConstant(0, DL, XLenVT));
+ DAG.getVectorIdxConstant(0, DL));
auto [Mask, VL] = getDefaultScalableVLOps(VecVT, DL, DAG, Subtarget);
+ ElementCount EndIndex =
+ ElementCount::getScalable(RemIdx) + SubVecVT.getVectorElementCount();
VL = computeVLMax(SubVecVT, DL, DAG);
+ // Use tail agnostic policy if we're inserting over InterSubVT's tail.
+ unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED;
+ if (EndIndex == InterSubVT.getVectorElementCount())
+ Policy = RISCVII::TAIL_AGNOSTIC;
+
// If we're inserting into the lowest elements, use a tail undisturbed
// vmv.v.v.
if (RemIdx == 0) {
@@ -9744,14 +9750,14 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
VL = DAG.getNode(ISD::ADD, DL, XLenVT, SlideupAmt, VL);
SubVec = getVSlideup(DAG, Subtarget, DL, InterSubVT, AlignedExtract, SubVec,
- SlideupAmt, Mask, VL);
+ SlideupAmt, Mask, VL, Policy);
}
// If required, insert this subvector back into the correct vector register.
// This should resolve to an INSERT_SUBREG instruction.
if (VecVT.bitsGT(InterSubVT))
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT, Vec, SubVec,
- DAG.getConstant(AlignedIdx, DL, XLenVT));
+ DAG.getVectorIdxConstant(AlignedIdx, DL));
// We might have bitcast from a mask type: cast back to the original type if
// required.
@@ -9846,7 +9852,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
DAG.getUNDEF(ContainerVT), Vec, SlidedownAmt, Mask, VL);
// Now we can use a cast-like subvector extract to get the result.
Slidedown = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, Slidedown,
- DAG.getConstant(0, DL, XLenVT));
+ DAG.getVectorIdxConstant(0, DL));
return DAG.getBitcast(Op.getValueType(), Slidedown);
}
@@ -9923,7 +9929,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
// Now the vector is in the right position, extract our final subvector. This
// should resolve to a COPY.
Slidedown = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, Slidedown,
- DAG.getConstant(0, DL, XLenVT));
+ DAG.getVectorIdxConstant(0, DL));
// We might have bitcast from a mask type: cast back to the original type if
// required.
@@ -9964,7 +9970,6 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
MVT VecVT = Op.getSimpleValueType();
- MVT XLenVT = Subtarget.getXLenVT();
assert(VecVT.isScalableVector() &&
"vector_interleave on non-scalable vector!");
@@ -10030,9 +10035,9 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op,
// Extract the result half of the gather for even and odd
SDValue Even = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, EvenWide,
- DAG.getConstant(0, DL, XLenVT));
+ DAG.getVectorIdxConstant(0, DL));
SDValue Odd = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, OddWide,
- DAG.getConstant(0, DL, XLenVT));
+ DAG.getVectorIdxConstant(0, DL));
return DAG.getMergeValues({Even, Odd}, DL);
}
@@ -10195,10 +10200,10 @@ SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op,
// FIXME: This is a CONCAT_VECTORS.
SDValue Res =
DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT, DAG.getUNDEF(VecVT), Hi,
- DAG.getIntPtrConstant(0, DL));
+ DAG.getVectorIdxConstant(0, DL));
return DAG.getNode(
ISD::INSERT_SUBVECTOR, DL, VecVT, Res, Lo,
- DAG.getIntPtrConstant(LoVT.getVectorMinNumElements(), DL));
+ DAG.getVectorIdxConstant(LoVT.getVectorMinNumElements(), DL));
}
// Just promote the int type to i16 which will double the LMUL.
@@ -10331,9 +10336,9 @@ RISCVTargetLowering::lowerFixedLengthVectorStoreToRVV(SDValue Op,
// If the size less than a byte, we need to pad with zeros to make a byte.
if (VT.getVectorElementType() == MVT::i1 && VT.getVectorNumElements() < 8) {
VT = MVT::v8i1;
- StoreVal = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
- DAG.getConstant(0, DL, VT), StoreVal,
- DAG.getIntPtrConstant(0, DL));
+ StoreVal =
+ DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getConstant(0, DL, VT),
+ StoreVal, DAG.getVectorIdxConstant(0, DL));
}
MVT ContainerVT = getContainerForFixedLengthVector(VT);
@@ -12109,7 +12114,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
if (isTypeLegal(BVT)) {
SDValue BVec = DAG.getBitcast(BVT, Op0);
Results.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, BVec,
- DAG.getConstant(0, DL, XLenVT)));
+ DAG.getVectorIdxConstant(0, DL)));
}
}
break;
@@ -12598,7 +12603,7 @@ static SDValue combineBinOpToReduce(SDNode *N, SelectionDAG &DAG,
if (ScalarVT != ScalarV.getValueType())
NewScalarV =
DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalarVT, DAG.getUNDEF(ScalarVT),
- NewScalarV, DAG.getConstant(0, DL, Subtarget.getXLenVT()));
+ NewScalarV, DAG.getVectorIdxConstant(0, DL));
SDValue Ops[] = {Reduce.getOperand(0), Reduce.getOperand(1),
NewScalarV, Reduce.getOperand(3),
@@ -15248,8 +15253,7 @@ static SDValue performINSERT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
if (ConcatVT.getVectorElementType() != InVal.getValueType())
return SDValue();
unsigned ConcatNumElts = ConcatVT.getVectorNumElements();
- SDValue NewIdx = DAG.getConstant(Elt % ConcatNumElts, DL,
- EltNo.getValueType());
+ SDValue NewIdx = DAG.getVectorIdxConstant(Elt % ConcatNumElts, DL);
unsigned ConcatOpIdx = Elt / ConcatNumElts;
SDValue ConcatOp = InVec.getOperand(ConcatOpIdx);
@@ -16449,7 +16453,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
SDValue Result =
DAG.getNode(N->getOpcode(), DL, M1VT, M1Passthru, Scalar, VL);
Result = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Passthru, Result,
- DAG.getConstant(0, DL, XLenVT));
+ DAG.getVectorIdxConstant(0, DL));
return Result;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index e32cd50be56e..afb24bfb3223 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -500,9 +500,25 @@ Instruction *SPIRVEmitIntrinsics::visitStoreInst(StoreInst &I) {
}
Instruction *SPIRVEmitIntrinsics::visitAllocaInst(AllocaInst &I) {
+ Value *ArraySize = nullptr;
+ if (I.isArrayAllocation()) {
+ const SPIRVSubtarget *STI = TM->getSubtargetImpl(*I.getFunction());
+ if (!STI->canUseExtension(
+ SPIRV::Extension::SPV_INTEL_variable_length_array))
+ report_fatal_error(
+ "array allocation: this instruction requires the following "
+ "SPIR-V extension: SPV_INTEL_variable_length_array",
+ false);
+ ArraySize = I.getArraySize();
+ }
+
TrackConstants = false;
Type *PtrTy = I.getType();
- auto *NewI = IRB->CreateIntrinsic(Intrinsic::spv_alloca, {PtrTy}, {});
+ auto *NewI =
+ ArraySize
+ ? IRB->CreateIntrinsic(Intrinsic::spv_alloca_array,
+ {PtrTy, ArraySize->getType()}, {ArraySize})
+ : IRB->CreateIntrinsic(Intrinsic::spv_alloca, {PtrTy}, {});
std::string InstName = I.hasName() ? I.getName().str() : "";
I.replaceAllUsesWith(NewI);
I.eraseFromParent();
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
index 7c5252e8cb37..fe8c909236cd 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
@@ -287,6 +287,15 @@ def OpPtrNotEqual: Op<402, (outs ID:$res), (ins TYPE:$resType, ID:$a, ID:$b),
def OpPtrDiff: Op<403, (outs ID:$res), (ins TYPE:$resType, ID:$a, ID:$b),
"$res = OpPtrDiff $resType $a $b">;
+// - SPV_INTEL_variable_length_array
+
+def OpVariableLengthArrayINTEL: Op<5818, (outs ID:$res), (ins TYPE:$type, ID:$length),
+ "$res = OpVariableLengthArrayINTEL $type $length">;
+def OpSaveMemoryINTEL: Op<5819, (outs ID:$res), (ins TYPE:$type),
+ "$res = OpSaveMemoryINTEL $type">;
+def OpRestoreMemoryINTEL: Op<5820, (outs), (ins ID:$ptr),
+ "OpRestoreMemoryINTEL $ptr">;
+
// 3.42.9 Function Instructions
def OpFunction: Op<54, (outs ID:$func),
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 7258d3b4d88e..9b38073ec3bc 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -99,6 +99,10 @@ private:
MachineInstr &I) const;
bool selectStore(MachineInstr &I) const;
+ bool selectStackSave(Register ResVReg, const SPIRVType *ResType,
+ MachineInstr &I) const;
+ bool selectStackRestore(MachineInstr &I) const;
+
bool selectMemOperation(Register ResVReg, MachineInstr &I) const;
bool selectAtomicRMW(Register ResVReg, const SPIRVType *ResType,
@@ -150,6 +154,8 @@ private:
bool selectOpUndef(Register ResVReg, const SPIRVType *ResType,
MachineInstr &I) const;
+ bool selectFreeze(Register ResVReg, const SPIRVType *ResType,
+ MachineInstr &I) const;
bool selectIntrinsic(Register ResVReg, const SPIRVType *ResType,
MachineInstr &I) const;
bool selectExtractVal(Register ResVReg, const SPIRVType *ResType,
@@ -165,6 +171,8 @@ private:
bool selectFrameIndex(Register ResVReg, const SPIRVType *ResType,
MachineInstr &I) const;
+ bool selectAllocaArray(Register ResVReg, const SPIRVType *ResType,
+ MachineInstr &I) const;
bool selectBranch(MachineInstr &I) const;
bool selectBranchCond(MachineInstr &I) const;
@@ -284,6 +292,8 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg,
return selectGlobalValue(ResVReg, I);
case TargetOpcode::G_IMPLICIT_DEF:
return selectOpUndef(ResVReg, ResType, I);
+ case TargetOpcode::G_FREEZE:
+ return selectFreeze(ResVReg, ResType, I);
case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
@@ -504,6 +514,11 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg,
case TargetOpcode::G_FENCE:
return selectFence(I);
+ case TargetOpcode::G_STACKSAVE:
+ return selectStackSave(ResVReg, ResType, I);
+ case TargetOpcode::G_STACKRESTORE:
+ return selectStackRestore(I);
+
default:
return false;
}
@@ -649,6 +664,35 @@ bool SPIRVInstructionSelector::selectStore(MachineInstr &I) const {
return MIB.constrainAllUses(TII, TRI, RBI);
}
+bool SPIRVInstructionSelector::selectStackSave(Register ResVReg,
+ const SPIRVType *ResType,
+ MachineInstr &I) const {
+ if (!STI.canUseExtension(SPIRV::Extension::SPV_INTEL_variable_length_array))
+ report_fatal_error(
+ "llvm.stacksave intrinsic: this instruction requires the following "
+ "SPIR-V extension: SPV_INTEL_variable_length_array",
+ false);
+ MachineBasicBlock &BB = *I.getParent();
+ return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpSaveMemoryINTEL))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .constrainAllUses(TII, TRI, RBI);
+}
+
+bool SPIRVInstructionSelector::selectStackRestore(MachineInstr &I) const {
+ if (!STI.canUseExtension(SPIRV::Extension::SPV_INTEL_variable_length_array))
+ report_fatal_error(
+ "llvm.stackrestore intrinsic: this instruction requires the following "
+ "SPIR-V extension: SPV_INTEL_variable_length_array",
+ false);
+ if (!I.getOperand(0).isReg())
+ return false;
+ MachineBasicBlock &BB = *I.getParent();
+ return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpRestoreMemoryINTEL))
+ .addUse(I.getOperand(0).getReg())
+ .constrainAllUses(TII, TRI, RBI);
+}
+
bool SPIRVInstructionSelector::selectMemOperation(Register ResVReg,
MachineInstr &I) const {
MachineBasicBlock &BB = *I.getParent();
@@ -1014,6 +1058,46 @@ bool SPIRVInstructionSelector::selectBitreverse(Register ResVReg,
.constrainAllUses(TII, TRI, RBI);
}
+bool SPIRVInstructionSelector::selectFreeze(Register ResVReg,
+ const SPIRVType *ResType,
+ MachineInstr &I) const {
+ // There is no way to implement `freeze` correctly without support on SPIR-V
+ // standard side, but we may at least address a simple (static) case when
+ // undef/poison value presence is obvious. The main benefit of even
+ // incomplete `freeze` support is preventing of translation from crashing due
+ // to lack of support on legalization and instruction selection steps.
+ if (!I.getOperand(0).isReg() || !I.getOperand(1).isReg())
+ return false;
+ Register OpReg = I.getOperand(1).getReg();
+ if (MachineInstr *Def = MRI->getVRegDef(OpReg)) {
+ Register Reg;
+ switch (Def->getOpcode()) {
+ case SPIRV::ASSIGN_TYPE:
+ if (MachineInstr *AssignToDef =
+ MRI->getVRegDef(Def->getOperand(1).getReg())) {
+ if (AssignToDef->getOpcode() == TargetOpcode::G_IMPLICIT_DEF)
+ Reg = Def->getOperand(2).getReg();
+ }
+ break;
+ case SPIRV::OpUndef:
+ Reg = Def->getOperand(1).getReg();
+ break;
+ }
+ unsigned DestOpCode;
+ if (Reg.isValid()) {
+ DestOpCode = SPIRV::OpConstantNull;
+ } else {
+ DestOpCode = TargetOpcode::COPY;
+ Reg = OpReg;
+ }
+ return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(DestOpCode))
+ .addDef(I.getOperand(0).getReg())
+ .addUse(Reg)
+ .constrainAllUses(TII, TRI, RBI);
+ }
+ return false;
+}
+
bool SPIRVInstructionSelector::selectConstVector(Register ResVReg,
const SPIRVType *ResType,
MachineInstr &I) const {
@@ -1461,6 +1545,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
break;
case Intrinsic::spv_alloca:
return selectFrameIndex(ResVReg, ResType, I);
+ case Intrinsic::spv_alloca_array:
+ return selectAllocaArray(ResVReg, ResType, I);
case Intrinsic::spv_assume:
if (STI.canUseExtension(SPIRV::Extension::SPV_KHR_expect_assume))
BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpAssumeTrueKHR))
@@ -1480,6 +1566,20 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
return true;
}
+bool SPIRVInstructionSelector::selectAllocaArray(Register ResVReg,
+ const SPIRVType *ResType,
+ MachineInstr &I) const {
+ // there was an allocation size parameter to the allocation instruction
+ // that is not 1
+ MachineBasicBlock &BB = *I.getParent();
+ return BuildMI(BB, I, I.getDebugLoc(),
+ TII.get(SPIRV::OpVariableLengthArrayINTEL))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(I.getOperand(2).getReg())
+ .constrainAllUses(TII, TRI, RBI);
+}
+
bool SPIRVInstructionSelector::selectFrameIndex(Register ResVReg,
const SPIRVType *ResType,
MachineInstr &I) const {
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
index 4f2e7a240fc2..049ca4ac818c 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -184,7 +184,9 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
return Query.Types[0].getSizeInBits() == Query.Types[1].getSizeInBits();
}))));
- getActionDefinitionsBuilder(G_IMPLICIT_DEF).alwaysLegal();
+ getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}).alwaysLegal();
+
+ getActionDefinitionsBuilder({G_STACKSAVE, G_STACKRESTORE}).alwaysLegal();
getActionDefinitionsBuilder(G_INTTOPTR)
.legalForCartesianProduct(allPtrs, allIntScalars);
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index 3be28c97d953..ac3d6b362d35 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -1110,6 +1110,14 @@ void addInstrRequirements(const MachineInstr &MI,
case SPIRV::OpAtomicFMaxEXT:
AddAtomicFloatRequirements(MI, Reqs, ST);
break;
+ case SPIRV::OpVariableLengthArrayINTEL:
+ case SPIRV::OpSaveMemoryINTEL:
+ case SPIRV::OpRestoreMemoryINTEL:
+ if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_variable_length_array)) {
+ Reqs.addExtension(SPIRV::Extension::SPV_INTEL_variable_length_array);
+ Reqs.addCapability(SPIRV::Capability::VariableLengthArrayINTEL);
+ }
+ break;
default:
break;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
index 79f16146ccd9..0e8952dc6a9c 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
@@ -85,6 +85,10 @@ cl::list<SPIRV::Extension::Extension> Extensions(
"SPV_KHR_subgroup_rotate",
"Adds a new instruction that enables rotating values across "
"invocations within a subgroup."),
+ clEnumValN(SPIRV::Extension::SPV_INTEL_variable_length_array,
+ "SPV_INTEL_variable_length_array",
+ "Allows to allocate local arrays whose number of elements "
+ "is unknown at compile time."),
clEnumValN(SPIRV::Extension::SPV_INTEL_function_pointers,
"SPV_INTEL_function_pointers",
"Allows translation of function pointers.")));
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index b022b97408d7..211c22340eb8 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -296,6 +296,7 @@ defm SPV_INTEL_fpga_latency_control : ExtensionOperand<101>;
defm SPV_INTEL_fpga_argument_interfaces : ExtensionOperand<102>;
defm SPV_INTEL_optnone : ExtensionOperand<103>;
defm SPV_INTEL_function_pointers : ExtensionOperand<104>;
+defm SPV_INTEL_variable_length_array : ExtensionOperand<105>;
//===----------------------------------------------------------------------===//
// Multiclass used to define Capabilities enum values and at the same time
@@ -462,6 +463,7 @@ defm AtomicFloat16AddEXT : CapabilityOperand<6095, 0, 0, [SPV_EXT_shader_atomic_
defm AtomicFloat16MinMaxEXT : CapabilityOperand<5616, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>;
defm AtomicFloat32MinMaxEXT : CapabilityOperand<5612, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>;
defm AtomicFloat64MinMaxEXT : CapabilityOperand<5613, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>;
+defm VariableLengthArrayINTEL : CapabilityOperand<5817, 0, 0, [SPV_INTEL_variable_length_array], []>;
defm GroupUniformArithmeticKHR : CapabilityOperand<6400, 0, 0, [SPV_KHR_uniform_group_instructions], []>;
defm USMStorageClassesINTEL : CapabilityOperand<5935, 0, 0, [SPV_INTEL_usm_storage_classes], [Kernel]>;
diff --git a/llvm/lib/Target/WebAssembly/CMakeLists.txt b/llvm/lib/Target/WebAssembly/CMakeLists.txt
index bb2ccea5c145..f430be2653b4 100644
--- a/llvm/lib/Target/WebAssembly/CMakeLists.txt
+++ b/llvm/lib/Target/WebAssembly/CMakeLists.txt
@@ -43,6 +43,7 @@ add_llvm_target(WebAssemblyCodeGen
WebAssemblyOptimizeLiveIntervals.cpp
WebAssemblyOptimizeReturned.cpp
WebAssemblyPeephole.cpp
+ WebAssemblyRefTypeMem2Local.cpp
WebAssemblyRegisterInfo.cpp
WebAssemblyRegColoring.cpp
WebAssemblyRegNumbering.cpp
diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.h b/llvm/lib/Target/WebAssembly/WebAssembly.h
index 91765ad117bd..1c40addb6d6f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssembly.h
+++ b/llvm/lib/Target/WebAssembly/WebAssembly.h
@@ -30,6 +30,7 @@ ModulePass *createWebAssemblyAddMissingPrototypes();
ModulePass *createWebAssemblyFixFunctionBitcasts();
FunctionPass *createWebAssemblyOptimizeReturned();
FunctionPass *createWebAssemblyLowerRefTypesIntPtrConv();
+FunctionPass *createWebAssemblyRefTypeMem2Local();
// ISel and immediate followup passes.
FunctionPass *createWebAssemblyISelDag(WebAssemblyTargetMachine &TM,
@@ -59,6 +60,7 @@ ModulePass *createWebAssemblyMCLowerPrePass();
// PassRegistry initialization declarations.
void initializeFixFunctionBitcastsPass(PassRegistry &);
void initializeOptimizeReturnedPass(PassRegistry &);
+void initializeWebAssemblyRefTypeMem2LocalPass(PassRegistry &);
void initializeWebAssemblyAddMissingPrototypesPass(PassRegistry &);
void initializeWebAssemblyArgumentMovePass(PassRegistry &);
void initializeWebAssemblyCFGSortPass(PassRegistry &);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 36f067956e63..7c47790d1e35 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -43,8 +43,6 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-lower"
-extern cl::opt<bool> WasmEmitMultiValue;
-
WebAssemblyTargetLowering::WebAssemblyTargetLowering(
const TargetMachine &TM, const WebAssemblySubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
@@ -1290,7 +1288,7 @@ bool WebAssemblyTargetLowering::CanLowerReturn(
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext & /*Context*/) const {
// WebAssembly can only handle returning tuples with multivalue enabled
- return (Subtarget->hasMultivalue() && WasmEmitMultiValue) || Outs.size() <= 1;
+ return Subtarget->hasMultivalue() || Outs.size() <= 1;
}
SDValue WebAssemblyTargetLowering::LowerReturn(
@@ -1298,8 +1296,7 @@ SDValue WebAssemblyTargetLowering::LowerReturn(
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const {
- assert(((Subtarget->hasMultivalue() && WasmEmitMultiValue) ||
- Outs.size() <= 1) &&
+ assert((Subtarget->hasMultivalue() || Outs.size() <= 1) &&
"MVP WebAssembly can only return up to one value");
if (!callingConvSupported(CallConv))
fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions");
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
index b969b8370a3e..1e959111a4db 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
@@ -22,8 +22,6 @@
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
-extern cl::opt<bool> WasmEmitMultiValue;
-
WebAssemblyFunctionInfo::~WebAssemblyFunctionInfo() = default; // anchor.
MachineFunctionInfo *WebAssemblyFunctionInfo::clone(
@@ -73,8 +71,7 @@ void llvm::computeSignatureVTs(const FunctionType *Ty,
MVT PtrVT = MVT::getIntegerVT(TM.createDataLayout().getPointerSizeInBits());
if (Results.size() > 1 &&
- (!TM.getSubtarget<WebAssemblySubtarget>(ContextFunc).hasMultivalue() ||
- !WasmEmitMultiValue)) {
+ !TM.getSubtarget<WebAssemblySubtarget>(ContextFunc).hasMultivalue()) {
// WebAssembly can't lower returns of multiple values without demoting to
// sret unless multivalue is enabled (see
// WebAssemblyTargetLowering::CanLowerReturn). So replace multiple return
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRefTypeMem2Local.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRefTypeMem2Local.cpp
new file mode 100644
index 000000000000..d3c60ee289df
--- /dev/null
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRefTypeMem2Local.cpp
@@ -0,0 +1,91 @@
+//=== WebAssemblyRefTypeMem2Local.cpp - WebAssembly RefType Mem2Local -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Assign reference type allocas to local addrspace (addrspace(1)) so that
+/// their loads and stores can be lowered to local.gets/local.sets.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Utils/WasmAddressSpaces.h"
+#include "Utils/WebAssemblyTypeUtilities.h"
+#include "WebAssembly.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-ref-type-mem2local"
+
+namespace {
+class WebAssemblyRefTypeMem2Local final
+ : public FunctionPass,
+ public InstVisitor<WebAssemblyRefTypeMem2Local> {
+ StringRef getPassName() const override {
+ return "WebAssembly Reference Types Memory to Local";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnFunction(Function &F) override;
+ bool Changed = false;
+
+public:
+ static char ID;
+ WebAssemblyRefTypeMem2Local() : FunctionPass(ID) {}
+
+ void visitAllocaInst(AllocaInst &AI);
+};
+} // End anonymous namespace
+
+char WebAssemblyRefTypeMem2Local::ID = 0;
+INITIALIZE_PASS(WebAssemblyRefTypeMem2Local, DEBUG_TYPE,
+ "Assign reference type allocas to local address space", true,
+ false)
+
+FunctionPass *llvm::createWebAssemblyRefTypeMem2Local() {
+ return new WebAssemblyRefTypeMem2Local();
+}
+
+void WebAssemblyRefTypeMem2Local::visitAllocaInst(AllocaInst &AI) {
+ if (WebAssembly::isWebAssemblyReferenceType(AI.getAllocatedType())) {
+ Changed = true;
+ IRBuilder<> IRB(AI.getContext());
+ IRB.SetInsertPoint(&AI);
+ auto *NewAI = IRB.CreateAlloca(AI.getAllocatedType(),
+ WebAssembly::WASM_ADDRESS_SPACE_VAR, nullptr,
+ AI.getName() + ".var");
+
+ // The below is basically equivalent to AI.replaceAllUsesWith(NewAI), but we
+ // cannot use it because it requires the old and new types be the same,
+ // which is not true here because the address spaces are different.
+ if (AI.hasValueHandle())
+ ValueHandleBase::ValueIsRAUWd(&AI, NewAI);
+ if (AI.isUsedByMetadata())
+ ValueAsMetadata::handleRAUW(&AI, NewAI);
+ while (!AI.materialized_use_empty()) {
+ Use &U = *AI.materialized_use_begin();
+ U.set(NewAI);
+ }
+
+ AI.eraseFromParent();
+ }
+}
+
+bool WebAssemblyRefTypeMem2Local::runOnFunction(Function &F) {
+ LLVM_DEBUG(dbgs() << "********** WebAssembly RefType Mem2Local **********\n"
+ "********** Function: "
+ << F.getName() << '\n');
+
+ visit(F);
+ return Changed;
+}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index 2a84c90c8960..3e2e029695ab 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -24,8 +24,6 @@
using namespace llvm;
-extern cl::opt<bool> WasmEmitMultiValue;
-
namespace {
enum RuntimeLibcallSignature {
@@ -696,7 +694,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(PtrTy);
break;
case i64_i64_func_f32:
- if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+ if (Subtarget.hasMultivalue()) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -705,7 +703,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::F32);
break;
case i64_i64_func_f64:
- if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+ if (Subtarget.hasMultivalue()) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -714,7 +712,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::F64);
break;
case i16_i16_func_i16_i16:
- if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+ if (Subtarget.hasMultivalue()) {
Rets.push_back(wasm::ValType::I32);
Rets.push_back(wasm::ValType::I32);
} else {
@@ -724,7 +722,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I32);
break;
case i32_i32_func_i32_i32:
- if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+ if (Subtarget.hasMultivalue()) {
Rets.push_back(wasm::ValType::I32);
Rets.push_back(wasm::ValType::I32);
} else {
@@ -734,7 +732,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I32);
break;
case i64_i64_func_i64_i64:
- if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+ if (Subtarget.hasMultivalue()) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -744,7 +742,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I64);
break;
case i64_i64_func_i64_i64_i64_i64:
- if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+ if (Subtarget.hasMultivalue()) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -756,7 +754,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I64);
break;
case i64_i64_func_i64_i64_i64_i64_iPTR:
- if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+ if (Subtarget.hasMultivalue()) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -769,7 +767,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(PtrTy);
break;
case i64_i64_i64_i64_func_i64_i64_i64_i64:
- if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+ if (Subtarget.hasMultivalue()) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
@@ -783,7 +781,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I64);
break;
case i64_i64_func_i64_i64_i32:
- if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+ if (Subtarget.hasMultivalue()) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -853,7 +851,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I64);
break;
case i64_i64_func_i64_i64_i64_i64_i64_i64:
- if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+ if (Subtarget.hasMultivalue()) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -867,7 +865,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I64);
break;
case i64_i64_func_i32:
- if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+ if (Subtarget.hasMultivalue()) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
@@ -876,7 +874,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I32);
break;
case i64_i64_func_i64:
- if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+ if (Subtarget.hasMultivalue()) {
Rets.push_back(wasm::ValType::I64);
Rets.push_back(wasm::ValType::I64);
} else {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index b2f7ee970a73..4d4cae110148 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -54,15 +54,6 @@ static cl::opt<bool> WasmDisableFixIrreducibleControlFlowPass(
" irreducible control flow optimization pass"),
cl::init(false));
-// A temporary option to control emission of multivalue until multivalue
-// implementation is stable enough. We currently don't emit multivalue by
-// default even if the feature section allows it.
-// TODO Stabilize multivalue and delete this option
-cl::opt<bool>
- WasmEmitMultiValue("wasm-emit-multivalue", cl::Hidden,
- cl::desc("WebAssembly: Emit multivalue in the backend"),
- cl::init(false));
-
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTarget() {
// Register the target.
RegisterTargetMachine<WebAssemblyTargetMachine> X(
@@ -77,6 +68,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTarget() {
initializeLowerGlobalDtorsLegacyPassPass(PR);
initializeFixFunctionBitcastsPass(PR);
initializeOptimizeReturnedPass(PR);
+ initializeWebAssemblyRefTypeMem2LocalPass(PR);
initializeWebAssemblyArgumentMovePass(PR);
initializeWebAssemblySetP2AlignOperandsPass(PR);
initializeWebAssemblyReplacePhysRegsPass(PR);
diff --git a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
index a620ba911ec6..6ef28b373cc9 100644
--- a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
+++ b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
@@ -22,7 +22,7 @@ namespace {
class X86CodeGenPassBuilder : public CodeGenPassBuilder<X86CodeGenPassBuilder> {
public:
explicit X86CodeGenPassBuilder(LLVMTargetMachine &TM,
- CGPassBuilderOption Opts,
+ const CGPassBuilderOption &Opts,
PassInstrumentationCallbacks *PIC)
: CodeGenPassBuilder(TM, Opts, PIC) {}
void addPreISel(AddIRPass &addPass) const;
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index be416fb0db06..d914e1b61ab0 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -1418,6 +1418,34 @@ bool X86FrameLowering::needsDwarfCFI(const MachineFunction &MF) const {
return !isWin64Prologue(MF) && MF.needsFrameMoves();
}
+/// Return true if an opcode is part of the REP group of instructions
+static bool isOpcodeRep(unsigned Opcode) {
+ switch (Opcode) {
+ case X86::REPNE_PREFIX:
+ case X86::REP_MOVSB_32:
+ case X86::REP_MOVSB_64:
+ case X86::REP_MOVSD_32:
+ case X86::REP_MOVSD_64:
+ case X86::REP_MOVSQ_32:
+ case X86::REP_MOVSQ_64:
+ case X86::REP_MOVSW_32:
+ case X86::REP_MOVSW_64:
+ case X86::REP_PREFIX:
+ case X86::REP_STOSB_32:
+ case X86::REP_STOSB_64:
+ case X86::REP_STOSD_32:
+ case X86::REP_STOSD_64:
+ case X86::REP_STOSQ_32:
+ case X86::REP_STOSQ_64:
+ case X86::REP_STOSW_32:
+ case X86::REP_STOSW_64:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
/// emitPrologue - Push callee-saved registers onto the stack, which
/// automatically adjust the stack pointer. Adjust the stack pointer to allocate
/// space for local variables. Also emit labels used by the exception handler to
@@ -2194,13 +2222,44 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// flag (DF in EFLAGS register). Clear this flag by creating "cld" instruction
// in each prologue of interrupt handler function.
//
- // FIXME: Create "cld" instruction only in these cases:
+ // Create "cld" instruction only in these cases:
// 1. The interrupt handling function uses any of the "rep" instructions.
// 2. Interrupt handling function calls another function.
+ // 3. If there are any inline asm blocks, as we do not know what they do
//
- if (Fn.getCallingConv() == CallingConv::X86_INTR)
- BuildMI(MBB, MBBI, DL, TII.get(X86::CLD))
- .setMIFlag(MachineInstr::FrameSetup);
+ // TODO: We should also emit cld if we detect the use of std, but as of now,
+ // the compiler does not even emit that instruction or even define it, so in
+ // practice, this would only happen with inline asm, which we cover anyway.
+ if (Fn.getCallingConv() == CallingConv::X86_INTR) {
+ bool NeedsCLD = false;
+
+ for (const MachineBasicBlock &B : MF) {
+ for (const MachineInstr &MI : B) {
+ if (MI.isCall()) {
+ NeedsCLD = true;
+ break;
+ }
+
+ if (isOpcodeRep(MI.getOpcode())) {
+ NeedsCLD = true;
+ break;
+ }
+
+ if (MI.isInlineAsm()) {
+ // TODO: Parse asm for rep instructions or call sites?
+ // For now, let's play it safe and emit a cld instruction
+ // just in case.
+ NeedsCLD = true;
+ break;
+ }
+ }
+ }
+
+ if (NeedsCLD) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::CLD))
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ }
// At this point we know if the function has WinCFI or not.
MF.setHasWinCFI(HasWinCFI);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a86f13135173..bec13d1c00ef 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1241,11 +1241,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ABS, MVT::v16i8, Legal);
setOperationAction(ISD::ABS, MVT::v8i16, Legal);
setOperationAction(ISD::ABS, MVT::v4i32, Legal);
- setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
- setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
- setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
- setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
- setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
+
+ for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
+ setOperationAction(ISD::BITREVERSE, VT, Custom);
+ setOperationAction(ISD::CTLZ, VT, Custom);
+ }
// These might be better off as horizontal vector ops.
setOperationAction(ISD::ADD, MVT::i16, Custom);
@@ -1341,10 +1341,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// XOP can efficiently perform BITREVERSE with VPPERM.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
setOperationAction(ISD::BITREVERSE, VT, Custom);
-
- for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
- MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
- setOperationAction(ISD::BITREVERSE, VT, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
@@ -1461,12 +1457,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::TRUNCATE, MVT::v32i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v32i64, Custom);
- setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
-
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
+ setOperationAction(ISD::BITREVERSE, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
@@ -1841,8 +1836,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
- setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
-
for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
@@ -1852,6 +1845,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::ABDS, VT, Custom);
setOperationAction(ISD::ABDU, VT, Custom);
+ setOperationAction(ISD::BITREVERSE, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
@@ -31180,17 +31174,25 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
SDValue In = Op.getOperand(0);
SDLoc DL(Op);
- assert(VT.getScalarType() == MVT::i8 &&
- "Only byte vector BITREVERSE supported");
-
- // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
- if (VT == MVT::v64i8 && !Subtarget.hasBWI())
+ // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
+ if (VT.is512BitVector() && !Subtarget.hasBWI())
return splitVectorIntUnary(Op, DAG);
// Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
- if (VT == MVT::v32i8 && !Subtarget.hasInt256())
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
return splitVectorIntUnary(Op, DAG);
+ // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
+ if (VT.getScalarType() != MVT::i8) {
+ MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
+ SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
+ Res = DAG.getBitcast(ByteVT, Res);
+ Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
+ return DAG.getBitcast(VT, Res);
+ }
+ assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
+ "Only byte vector BITREVERSE supported");
+
unsigned NumElts = VT.getVectorNumElements();
// If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
@@ -32341,20 +32343,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
}
- if (128 % InBits == 0) {
+ if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
// 128 bit and smaller inputs should avoid truncate all together and
- // just use a build_vector that will become a shuffle.
- // TODO: Widen and use a shuffle directly?
- SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
- // Use the original element count so we don't do more scalar opts than
- // necessary.
- for (unsigned i=0; i < MinElts; ++i) {
- SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
- DAG.getIntPtrConstant(i, dl));
- Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
- }
- Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
- return;
+ // use a shuffle.
+ if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
+ int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
+ SmallVector<int, 16> TruncMask(WidenNumElts, -1);
+ for (unsigned I = 0; I < MinElts; ++I)
+ TruncMask[I] = Scale * I;
+ SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
+ assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
+ "Illegal vector type in truncation");
+ WidenIn = DAG.getBitcast(WidenVT, WidenIn);
+ Results.push_back(
+ DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
+ return;
+ }
}
// With AVX512 there are some cases that can use a target specific
@@ -55785,6 +55789,15 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
}
}
+ auto IsExtractFree = [](SDValue V) {
+ V = peekThroughBitcasts(V);
+ if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
+ return true;
+ if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()))
+ return true;
+ return V.isUndef();
+ };
+
// If we're extracting the lowest subvector and we're the only user,
// we may be able to perform this with a smaller vector width.
unsigned InOpcode = InVec.getOpcode();
@@ -55826,14 +55839,27 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
}
if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
- (VT.is128BitVector() || VT.is256BitVector())) {
+ (SizeInBits == 128 || SizeInBits == 256)) {
SDValue InVecSrc = InVec.getOperand(0);
unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
return DAG.getNode(InOpcode, DL, VT, Ext);
}
+ if ((InOpcode == X86ISD::CMPP || InOpcode == X86ISD::PCMPEQ ||
+ InOpcode == X86ISD::PCMPGT) &&
+ (IsExtractFree(InVec.getOperand(0)) ||
+ IsExtractFree(InVec.getOperand(1))) &&
+ SizeInBits == 128) {
+ SDValue Ext0 =
+ extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
+ SDValue Ext1 =
+ extractSubVector(InVec.getOperand(1), IdxVal, DAG, DL, SizeInBits);
+ if (InOpcode == X86ISD::CMPP)
+ return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, InVec.getOperand(2));
+ return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1);
+ }
if (InOpcode == X86ISD::MOVDDUP &&
- (VT.is128BitVector() || VT.is256BitVector())) {
+ (SizeInBits == 128 || SizeInBits == 256)) {
SDValue Ext0 =
extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
return DAG.getNode(InOpcode, DL, VT, Ext0);
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 323f33fcc862..271d3ed40030 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -1381,7 +1381,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
// not fully matching stack contexts. To do this, subtract any context ids
// found in caller nodes of the last node found above.
if (Ids.back() != getLastStackId(Call)) {
- for (const auto &PE : CurNode->CallerEdges) {
+ for (const auto &PE : LastNode->CallerEdges) {
set_subtract(StackSequenceContextIds, PE->getContextIds());
if (StackSequenceContextIds.empty())
break;
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 8a2864a07873..5d5c4ea57ed5 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1957,6 +1957,10 @@ bool ModuleAddressSanitizer::shouldInstrumentGlobal(GlobalVariable *G) const {
// On COFF, don't instrument non-ODR linkages.
if (G->isInterposable())
return false;
+ // If the global has AvailableExternally linkage, then it is not in this
+ // module, which means it does not need to be instrumented.
+ if (G->hasAvailableExternallyLinkage())
+ return false;
}
// If a comdat is present, it must have a selection kind that implies ODR
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 9b6a39e98f5c..7e48c28176bd 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -461,7 +461,7 @@ static Decomposition decomposeGEP(GEPOperator &GEP,
// If Op0 is signed non-negative, the GEP is increasing monotonically and
// can be de-composed.
- if (!isKnownNonNegative(Index, DL, /*Depth=*/MaxAnalysisRecursionDepth - 1))
+ if (!isKnownNonNegative(Index, DL))
Preconditions.emplace_back(CmpInst::ICMP_SGE, Index,
ConstantInt::get(Index->getType(), 0));
}
@@ -560,10 +560,10 @@ static Decomposition decompose(Value *V,
return MergeResults(Op0, Op1, IsSigned);
}
if (match(V, m_NSWAdd(m_Value(Op0), m_Value(Op1)))) {
- if (!isKnownNonNegative(Op0, DL, /*Depth=*/MaxAnalysisRecursionDepth - 1))
+ if (!isKnownNonNegative(Op0, DL))
Preconditions.emplace_back(CmpInst::ICMP_SGE, Op0,
ConstantInt::get(Op0->getType(), 0));
- if (!isKnownNonNegative(Op1, DL, /*Depth=*/MaxAnalysisRecursionDepth - 1))
+ if (!isKnownNonNegative(Op1, DL))
Preconditions.emplace_back(CmpInst::ICMP_SGE, Op1,
ConstantInt::get(Op1->getType(), 0));
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 6c8785d52c4e..c7b9ce2e9312 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -293,7 +293,7 @@ calculateFragment(DILocalVariable *Variable,
if (!CurrentFragment) {
if (auto Size = Variable->getSizeInBits()) {
// Treat the current fragment as covering the whole variable.
- CurrentFragment = DIExpression::FragmentInfo(*Size, 0);
+ CurrentFragment = DIExpression::FragmentInfo(*Size, 0);
if (Target == CurrentFragment)
return UseNoFrag;
}
@@ -1213,8 +1213,9 @@ private:
if (!IsOffsetKnown)
return PI.setAborted(&II);
- insertUse(II, Offset, Length ? Length->getLimitedValue()
- : AllocSize - Offset.getLimitedValue(),
+ insertUse(II, Offset,
+ Length ? Length->getLimitedValue()
+ : AllocSize - Offset.getLimitedValue(),
(bool)Length);
}
@@ -1669,7 +1670,7 @@ static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
}
// Inject loads into all of the pred blocks.
- DenseMap<BasicBlock*, Value*> InjectedLoads;
+ DenseMap<BasicBlock *, Value *> InjectedLoads;
for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
BasicBlock *Pred = PN.getIncomingBlock(Idx);
Value *InVal = PN.getIncomingValue(Idx);
@@ -1678,7 +1679,7 @@ static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
// basic block, as long as the value is the same. So if we already injected
// a load in the predecessor, then we should reuse the same load for all
// duplicated entries.
- if (Value* V = InjectedLoads.lookup(Pred)) {
+ if (Value *V = InjectedLoads.lookup(Pred)) {
NewPN->addIncoming(V, Pred);
continue;
}
@@ -2077,8 +2078,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
if (BeginIndex * ElementSize != BeginOffset ||
BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
return false;
- uint64_t EndOffset =
- std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
+ uint64_t EndOffset = std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
uint64_t EndIndex = EndOffset / ElementSize;
if (EndIndex * ElementSize != EndOffset ||
EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
@@ -2754,8 +2754,8 @@ public:
Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
IRB.SetInsertPoint(OldUserI);
IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
- IRB.getInserter().SetNamePrefix(
- Twine(NewAI.getName()) + "." + Twine(BeginOffset) + ".");
+ IRB.getInserter().SetNamePrefix(Twine(NewAI.getName()) + "." +
+ Twine(BeginOffset) + ".");
CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
if (VecTy || IntTy)
@@ -2808,7 +2808,7 @@ private:
#else
Twine()
#endif
- );
+ );
}
/// Compute suitable alignment to access this slice of the *new*
@@ -3189,8 +3189,7 @@ private:
const bool CanContinue = [&]() {
if (VecTy || IntTy)
return true;
- if (BeginOffset > NewAllocaBeginOffset ||
- EndOffset < NewAllocaEndOffset)
+ if (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset)
return false;
// Length must be in range for FixedVectorType.
auto *C = cast<ConstantInt>(II.getLength());
@@ -3957,11 +3956,11 @@ private:
return false;
}
- // Fold gep (select cond, ptr1, ptr2), idx
+ // Unfold gep (select cond, ptr1, ptr2), idx
// => select cond, gep(ptr1, idx), gep(ptr2, idx)
// and gep ptr, (select cond, idx1, idx2)
// => select cond, gep(ptr, idx1), gep(ptr, idx2)
- bool foldGEPSelect(GetElementPtrInst &GEPI) {
+ bool unfoldGEPSelect(GetElementPtrInst &GEPI) {
// Check whether the GEP has exactly one select operand and all indices
// will become constant after the transform.
SelectInst *Sel = dyn_cast<SelectInst>(GEPI.getPointerOperand());
@@ -3984,9 +3983,9 @@ private:
if (!Sel)
return false;
- LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):"
- << "\n original: " << *Sel
- << "\n " << GEPI);
+ LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):\n";
+ dbgs() << " original: " << *Sel << "\n";
+ dbgs() << " " << GEPI << "\n";);
auto GetNewOps = [&](Value *SelOp) {
SmallVector<Value *> NewOps;
@@ -4023,74 +4022,107 @@ private:
Visited.insert(NSelI);
enqueueUsers(*NSelI);
- LLVM_DEBUG(dbgs() << "\n to: " << *NTrue
- << "\n " << *NFalse
- << "\n " << *NSel << '\n');
+ LLVM_DEBUG(dbgs() << " to: " << *NTrue << "\n";
+ dbgs() << " " << *NFalse << "\n";
+ dbgs() << " " << *NSel << "\n";);
return true;
}
- // Fold gep (phi ptr1, ptr2) => phi gep(ptr1), gep(ptr2)
- bool foldGEPPhi(GetElementPtrInst &GEPI) {
- if (!GEPI.hasAllConstantIndices())
- return false;
+ // Unfold gep (phi ptr1, ptr2), idx
+ // => phi ((gep ptr1, idx), (gep ptr2, idx))
+ // and gep ptr, (phi idx1, idx2)
+ // => phi ((gep ptr, idx1), (gep ptr, idx2))
+ bool unfoldGEPPhi(GetElementPtrInst &GEPI) {
+ // To prevent infinitely expanding recursive phis, bail if the GEP pointer
+ // operand (looking through the phi if it is the phi we want to unfold) is
+ // an instruction besides an alloca.
+ PHINode *Phi = dyn_cast<PHINode>(GEPI.getPointerOperand());
+ auto IsInvalidPointerOperand = [](Value *V) {
+ return isa<Instruction>(V) && !isa<AllocaInst>(V);
+ };
+ if (Phi) {
+ if (any_of(Phi->operands(), IsInvalidPointerOperand))
+ return false;
+ } else {
+ if (IsInvalidPointerOperand(GEPI.getPointerOperand()))
+ return false;
+ }
+ // Check whether the GEP has exactly one phi operand (including the pointer
+ // operand) and all indices will become constant after the transform.
+ for (Value *Op : GEPI.indices()) {
+ if (auto *SI = dyn_cast<PHINode>(Op)) {
+ if (Phi)
+ return false;
+
+ Phi = SI;
+ if (!all_of(Phi->incoming_values(),
+ [](Value *V) { return isa<ConstantInt>(V); }))
+ return false;
+ continue;
+ }
+
+ if (!isa<ConstantInt>(Op))
+ return false;
+ }
- PHINode *PHI = cast<PHINode>(GEPI.getPointerOperand());
- if (GEPI.getParent() != PHI->getParent() ||
- llvm::any_of(PHI->incoming_values(), [](Value *In)
- { Instruction *I = dyn_cast<Instruction>(In);
- return !I || isa<GetElementPtrInst>(I) || isa<PHINode>(I) ||
- succ_empty(I->getParent()) ||
- !I->getParent()->isLegalToHoistInto();
- }))
+ if (!Phi)
return false;
- LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):"
- << "\n original: " << *PHI
- << "\n " << GEPI
- << "\n to: ");
+ LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):\n";
+ dbgs() << " original: " << *Phi << "\n";
+ dbgs() << " " << GEPI << "\n";);
- SmallVector<Value *, 4> Index(GEPI.indices());
- bool IsInBounds = GEPI.isInBounds();
- IRB.SetInsertPoint(GEPI.getParent(), GEPI.getParent()->getFirstNonPHIIt());
- PHINode *NewPN = IRB.CreatePHI(GEPI.getType(), PHI->getNumIncomingValues(),
- PHI->getName() + ".sroa.phi");
- for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
- BasicBlock *B = PHI->getIncomingBlock(I);
- Value *NewVal = nullptr;
- int Idx = NewPN->getBasicBlockIndex(B);
- if (Idx >= 0) {
- NewVal = NewPN->getIncomingValue(Idx);
- } else {
- Instruction *In = cast<Instruction>(PHI->getIncomingValue(I));
+ auto GetNewOps = [&](Value *PhiOp) {
+ SmallVector<Value *> NewOps;
+ for (Value *Op : GEPI.operands())
+ if (Op == Phi)
+ NewOps.push_back(PhiOp);
+ else
+ NewOps.push_back(Op);
+ return NewOps;
+ };
- IRB.SetInsertPoint(In->getParent(), std::next(In->getIterator()));
- Type *Ty = GEPI.getSourceElementType();
- NewVal = IRB.CreateGEP(Ty, In, Index, In->getName() + ".sroa.gep",
- IsInBounds);
- }
- NewPN->addIncoming(NewVal, B);
+ IRB.SetInsertPoint(Phi);
+ PHINode *NewPhi = IRB.CreatePHI(GEPI.getType(), Phi->getNumIncomingValues(),
+ Phi->getName() + ".sroa.phi");
+
+ bool IsInBounds = GEPI.isInBounds();
+ Type *SourceTy = GEPI.getSourceElementType();
+ // We only handle arguments, constants, and static allocas here, so we can
+ // insert GEPs at the beginning of the function after static allocas.
+ IRB.SetInsertPointPastAllocas(GEPI.getFunction());
+ for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
+ Value *Op = Phi->getIncomingValue(I);
+ BasicBlock *BB = Phi->getIncomingBlock(I);
+ SmallVector<Value *> NewOps = GetNewOps(Op);
+
+ Value *NewGEP =
+ IRB.CreateGEP(SourceTy, NewOps[0], ArrayRef(NewOps).drop_front(),
+ Phi->getName() + ".sroa.gep", IsInBounds);
+ NewPhi->addIncoming(NewGEP, BB);
}
Visited.erase(&GEPI);
- GEPI.replaceAllUsesWith(NewPN);
+ GEPI.replaceAllUsesWith(NewPhi);
GEPI.eraseFromParent();
- Visited.insert(NewPN);
- enqueueUsers(*NewPN);
+ Visited.insert(NewPhi);
+ enqueueUsers(*NewPhi);
- LLVM_DEBUG(for (Value *In : NewPN->incoming_values())
- dbgs() << "\n " << *In;
- dbgs() << "\n " << *NewPN << '\n');
+ LLVM_DEBUG(dbgs() << " to: ";
+ for (Value *In
+ : NewPhi->incoming_values()) dbgs()
+ << "\n " << *In;
+ dbgs() << "\n " << *NewPhi << '\n');
return true;
}
bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
- if (foldGEPSelect(GEPI))
+ if (unfoldGEPSelect(GEPI))
return true;
- if (isa<PHINode>(GEPI.getPointerOperand()) &&
- foldGEPPhi(GEPI))
+ if (unfoldGEPPhi(GEPI))
return true;
enqueueUsers(GEPI);
@@ -4162,17 +4194,17 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
return nullptr;
if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
- Type *ElementTy;
- uint64_t TyNumElements;
- if (auto *AT = dyn_cast<ArrayType>(Ty)) {
- ElementTy = AT->getElementType();
- TyNumElements = AT->getNumElements();
- } else {
- // FIXME: This isn't right for vectors with non-byte-sized or
- // non-power-of-two sized elements.
- auto *VT = cast<FixedVectorType>(Ty);
- ElementTy = VT->getElementType();
- TyNumElements = VT->getNumElements();
+ Type *ElementTy;
+ uint64_t TyNumElements;
+ if (auto *AT = dyn_cast<ArrayType>(Ty)) {
+ ElementTy = AT->getElementType();
+ TyNumElements = AT->getNumElements();
+ } else {
+ // FIXME: This isn't right for vectors with non-byte-sized or
+ // non-power-of-two sized elements.
+ auto *VT = cast<FixedVectorType>(Ty);
+ ElementTy = VT->getElementType();
+ TyNumElements = VT->getNumElements();
}
uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
uint64_t NumSkippedElements = Offset / ElementSize;
@@ -4853,9 +4885,8 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
++NumNewAllocas;
}
- LLVM_DEBUG(dbgs() << "Rewriting alloca partition "
- << "[" << P.beginOffset() << "," << P.endOffset()
- << ") to: " << *NewAI << "\n");
+ LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset()
+ << "," << P.endOffset() << ") to: " << *NewAI << "\n");
// Track the high watermark on the worklist as it is only relevant for
// promoted allocas. We will reset it to this point if the alloca is not in
@@ -5040,8 +5071,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
IsSorted = false;
}
}
- }
- else {
+ } else {
// We only allow whole-alloca splittable loads and stores
// for a large alloca to avoid creating too large BitVector.
for (Slice &S : AS) {
@@ -5069,7 +5099,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
uint64_t Offset;
uint64_t Size;
Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
- : Alloca(AI), Offset(O), Size(S) {}
+ : Alloca(AI), Offset(O), Size(S) {}
};
SmallVector<Fragment, 4> Fragments;
@@ -5083,7 +5113,8 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedValue();
// Don't include any padding.
uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
- Fragments.push_back(Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
+ Fragments.push_back(
+ Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
}
}
++NumPartitions;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 373ea751d568..b81cd508663c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -315,12 +315,6 @@ static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
cl::desc(
"Enable runtime interleaving until load/store ports are saturated"));
-/// Interleave small loops with scalar reductions.
-static cl::opt<bool> InterleaveSmallLoopScalarReduction(
- "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
- cl::desc("Enable interleaving for loops with small iteration counts that "
- "contain scalar reductions to expose ILP."));
-
/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
"vectorize-num-stores-pred", cl::init(1), cl::Hidden,
@@ -1510,19 +1504,36 @@ public:
}
/// Returns the TailFoldingStyle that is best for the current loop.
- TailFoldingStyle
- getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
- if (!CanFoldTailByMasking)
- return TailFoldingStyle::None;
+ TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
+ return IVUpdateMayOverflow ? ChosenTailFoldingStyle.first
+ : ChosenTailFoldingStyle.second;
+ }
+
+ /// Selects and saves TailFoldingStyle for 2 options - if IV update may
+ /// overflow or not.
+ void setTailFoldingStyles() {
+ assert(ChosenTailFoldingStyle.first == TailFoldingStyle::None &&
+ ChosenTailFoldingStyle.second == TailFoldingStyle::None &&
+ "Tail folding must not be selected yet.");
+ if (!Legal->prepareToFoldTailByMasking())
+ return;
- if (ForceTailFoldingStyle.getNumOccurrences())
- return ForceTailFoldingStyle;
+ if (ForceTailFoldingStyle.getNumOccurrences()) {
+ ChosenTailFoldingStyle.first = ChosenTailFoldingStyle.second =
+ ForceTailFoldingStyle;
+ return;
+ }
- return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow);
+ ChosenTailFoldingStyle.first =
+ TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true);
+ ChosenTailFoldingStyle.second =
+ TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false);
}
/// Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailByMasking() const {
+ // TODO: check if it is possible to check for None style independent of
+ // IVUpdateMayOverflow flag in getTailFoldingStyle.
return getTailFoldingStyle() != TailFoldingStyle::None;
}
@@ -1675,8 +1686,10 @@ private:
/// iterations to execute in the scalar loop.
ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
- /// All blocks of loop are to be masked to fold tail of scalar iterations.
- bool CanFoldTailByMasking = false;
+ /// Control finally chosen tail folding style. The first element is used if
+ /// the IV update may overflow, the second element - if it does not.
+ std::pair<TailFoldingStyle, TailFoldingStyle> ChosenTailFoldingStyle =
+ std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
@@ -4633,10 +4646,9 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// found modulo the vectorization factor is not zero, try to fold the tail
// by masking.
// FIXME: look for a smaller MaxVF that does divide TC rather than masking.
- if (Legal->prepareToFoldTailByMasking()) {
- CanFoldTailByMasking = true;
+ setTailFoldingStyles();
+ if (foldTailByMasking())
return MaxFactors;
- }
// If there was a tail-folding hint/switch, but we can't fold the tail by
// masking, fallback to a vectorization with a scalar epilogue.
@@ -5477,8 +5489,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// If there are scalar reductions and TTI has enabled aggressive
// interleaving for reductions, we will interleave to expose ILP.
- if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
- AggressivelyInterleaveReductions) {
+ if (VF.isScalar() && AggressivelyInterleaveReductions) {
LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
// Interleave no less than SmallIC but not as aggressive as the normal IC
// to satisfy the rare situation when resources are too limited.
@@ -7450,12 +7461,14 @@ LoopVectorizationPlanner::executePlan(
(IsEpilogueVectorization || !ExpandedSCEVs) &&
"expanded SCEVs to reuse can only be used during epilogue vectorization");
- LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
- << '\n');
-
if (!IsEpilogueVectorization)
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
+ LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
+ << ", UF=" << BestUF << '\n');
+ BestVPlan.setName("Final VPlan");
+ LLVM_DEBUG(BestVPlan.dump());
+
// Perform the actual loop transformation.
VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
OrigLoop->getHeader()->getContext());
@@ -9127,7 +9140,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
"Unexpected type.");
auto *IVR = getParent()->getPlan()->getCanonicalIV();
- PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
+ PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
if (onlyScalarsGenerated(State.VF.isScalable())) {
// This is the normalized GEP that starts counting at zero.
@@ -9243,7 +9256,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
void VPReductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Reduction being replicated.");
- Value *PrevInChain = State.get(getChainOp(), 0);
+ Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
RecurKind Kind = RdxDesc.getRecurrenceKind();
bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc);
// Propagate the fast-math flags carried by the underlying instruction.
@@ -9252,8 +9265,7 @@ void VPReductionRecipe::execute(VPTransformState &State) {
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *NewVecOp = State.get(getVecOp(), Part);
if (VPValue *Cond = getCondOp()) {
- Value *NewCond = State.VF.isVector() ? State.get(Cond, Part)
- : State.get(Cond, {Part, 0});
+ Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
@@ -9278,7 +9290,7 @@ void VPReductionRecipe::execute(VPTransformState &State) {
NewVecOp);
PrevInChain = NewRed;
} else {
- PrevInChain = State.get(getChainOp(), Part);
+ PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
}
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
@@ -9289,7 +9301,7 @@ void VPReductionRecipe::execute(VPTransformState &State) {
else
NextInChain = State.Builder.CreateBinOp(
(Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
- State.set(this, NextInChain, Part);
+ State.set(this, NextInChain, Part, /*IsScalar*/ true);
}
}
@@ -9404,7 +9416,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
// We don't want to update the value in the map as it might be used in
// another expression. So don't call resetVectorValue(StoredVal).
}
- auto *VecPtr = State.get(getAddr(), Part);
+ auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true);
if (isMaskRequired)
NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
BlockInMaskParts[Part]);
@@ -9428,7 +9440,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
nullptr, "wide.masked.gather");
State.addMetadata(NewLI, LI);
} else {
- auto *VecPtr = State.get(getAddr(), Part);
+ auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true);
if (isMaskRequired)
NewLI = Builder.CreateMaskedLoad(
DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
@@ -10074,6 +10086,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn(
ExpandedSCEVs.find(ExpandR->getSCEV())->second);
ExpandR->replaceAllUsesWith(ExpandedVal);
+ if (BestEpiPlan.getTripCount() == ExpandR)
+ BestEpiPlan.resetTripCount(ExpandedVal);
ExpandR->eraseFromParent();
}
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index de4e56ff8065..2b7d518c1c1a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9364,7 +9364,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
VectorCasts
.insert(std::make_pair(ScalarTE, FTy->getElementType()))
.second) {
- unsigned BWSz = It->second.second;
+ unsigned BWSz = It->second.first;
unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
unsigned VecOpcode;
if (DstBWSz < BWSz)
@@ -9376,7 +9376,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost C = TTI->getCastInstrCost(
VecOpcode, FTy,
FixedVectorType::get(
- IntegerType::get(FTy->getContext(), It->second.first),
+ IntegerType::get(FTy->getContext(), BWSz),
FTy->getNumElements()),
TTI::CastContextHint::None, CostKind);
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 4ffbed4b705c..4aeab6fc6199 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -242,7 +242,16 @@ Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
return Extract;
}
-Value *VPTransformState::get(VPValue *Def, unsigned Part) {
+Value *VPTransformState::get(VPValue *Def, unsigned Part, bool NeedsScalar) {
+ if (NeedsScalar) {
+ assert((VF.isScalar() || Def->isLiveIn() ||
+ (hasScalarValue(Def, VPIteration(Part, 0)) &&
+ Data.PerPartScalars[Def][Part].size() == 1)) &&
+ "Trying to access a single scalar per part but has multiple scalars "
+ "per part.");
+ return get(Def, VPIteration(Part, 0));
+ }
+
// If Values have been set for this Def return the one relevant for \p Part.
if (hasVectorValue(Def, Part))
return Data.PerPartOutput[Def][Part];
@@ -789,21 +798,15 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
auto *TCMO = Builder.CreateSub(TripCountV,
ConstantInt::get(TripCountV->getType(), 1),
"trip.count.minus.1");
- auto VF = State.VF;
- Value *VTCMO =
- VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast");
- for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
- State.set(BackedgeTakenCount, VTCMO, Part);
+ BackedgeTakenCount->setUnderlyingValue(TCMO);
}
- for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
- State.set(&VectorTripCount, VectorTripCountV, Part);
+ VectorTripCount.setUnderlyingValue(VectorTripCountV);
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
// FIXME: Model VF * UF computation completely in VPlan.
- State.set(&VFxUF,
- createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF),
- 0);
+ VFxUF.setUnderlyingValue(
+ createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF));
// When vectorizing the epilogue loop, the canonical induction start value
// needs to be changed from zero to the value after the main vector loop.
@@ -884,12 +887,16 @@ void VPlan::execute(VPTransformState *State) {
isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) ||
(isa<VPReductionPHIRecipe>(PhiR) &&
cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
+ bool NeedsScalar = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
+ (isa<VPReductionPHIRecipe>(PhiR) &&
+ cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
- Value *Phi = State->get(PhiR, Part);
- Value *Val = State->get(PhiR->getBackedgeValue(),
- SinglePartNeeded ? State->UF - 1 : Part);
+ Value *Phi = State->get(PhiR, Part, NeedsScalar);
+ Value *Val =
+ State->get(PhiR->getBackedgeValue(),
+ SinglePartNeeded ? State->UF - 1 : Part, NeedsScalar);
cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
}
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index aaddaaff2aba..16c09a83e777 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -259,9 +259,10 @@ struct VPTransformState {
DenseMap<VPValue *, ScalarsPerPartValuesTy> PerPartScalars;
} Data;
- /// Get the generated Value for the given VPValue \p Def and the given \p Part.
- /// \see set.
- Value *get(VPValue *Def, unsigned Part);
+ /// Get the generated vector Value for a given VPValue \p Def and a given \p
+ /// Part if \p IsScalar is false, otherwise return the generated scalar
+ /// for \p Part. \See set.
+ Value *get(VPValue *Def, unsigned Part, bool IsScalar = false);
/// Get the generated Value for a given VPValue and given Part and Lane.
Value *get(VPValue *Def, const VPIteration &Instance);
@@ -282,14 +283,22 @@ struct VPTransformState {
I->second[Instance.Part][CacheIdx];
}
- /// Set the generated Value for a given VPValue and a given Part.
- void set(VPValue *Def, Value *V, unsigned Part) {
+ /// Set the generated vector Value for a given VPValue and a given Part, if \p
+ /// IsScalar is false. If \p IsScalar is true, set the scalar in (Part, 0).
+ void set(VPValue *Def, Value *V, unsigned Part, bool IsScalar = false) {
+ if (IsScalar) {
+ set(Def, V, VPIteration(Part, 0));
+ return;
+ }
+ assert((VF.isScalar() || V->getType()->isVectorTy()) &&
+ "scalar values must be stored as (Part, 0)");
if (!Data.PerPartOutput.count(Def)) {
DataState::PerPartValuesTy Entry(UF);
Data.PerPartOutput[Def] = Entry;
}
Data.PerPartOutput[Def][Part] = V;
}
+
/// Reset an existing vector value for \p Def and a given \p Part.
void reset(VPValue *Def, Value *V, unsigned Part) {
auto Iter = Data.PerPartOutput.find(Def);
@@ -1376,6 +1385,13 @@ public:
/// Returns the result type of the cast.
Type *getResultType() const { return ResultTy; }
+
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ // At the moment, only uniform codegen is implemented.
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return true;
+ }
};
/// A recipe for widening Call instructions.
@@ -2915,6 +2931,14 @@ public:
return TripCount;
}
+ /// Resets the trip count for the VPlan. The caller must make sure all uses of
+ /// the original trip count have been replaced.
+ void resetTripCount(VPValue *NewTripCount) {
+ assert(TripCount && NewTripCount && TripCount->getNumUsers() == 0 &&
+ "TripCount always must be set");
+ TripCount = NewTripCount;
+ }
+
/// The backedge taken count of the original loop.
VPValue *getOrCreateBackedgeTakenCount() {
if (!BackedgeTakenCount)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2d2f6acf913f..27b72575ddd5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -279,11 +279,12 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
Builder.SetCurrentDebugLocation(getDebugLoc());
if (Instruction::isBinaryOp(getOpcode())) {
+ bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
if (Part != 0 && vputils::onlyFirstPartUsed(this))
- return State.get(this, 0);
+ return State.get(this, 0, OnlyFirstLaneUsed);
- Value *A = State.get(getOperand(0), Part);
- Value *B = State.get(getOperand(1), Part);
+ Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
+ Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
auto *Res =
Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
if (auto *I = dyn_cast<Instruction>(Res))
@@ -385,8 +386,8 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
if (Part != 0)
return nullptr;
// First create the compare.
- Value *IV = State.get(getOperand(0), Part);
- Value *TC = State.get(getOperand(1), Part);
+ Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
+ Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
Value *Cond = Builder.CreateICmpEQ(IV, TC);
// Now create the branch.
@@ -407,7 +408,7 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
}
case VPInstruction::ComputeReductionResult: {
if (Part != 0)
- return State.get(this, 0);
+ return State.get(this, 0, /*IsScalar*/ true);
// FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
// and will be removed by breaking up the recipe further.
@@ -424,7 +425,7 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
Type *PhiTy = OrigPhi->getType();
VectorParts RdxParts(State.UF);
for (unsigned Part = 0; Part < State.UF; ++Part)
- RdxParts[Part] = State.get(LoopExitingDef, Part);
+ RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());
// If the vector reduction can be performed in a smaller type, we truncate
// then extend the loop exit value to enable InstCombine to evaluate the
@@ -512,9 +513,15 @@ void VPInstruction::execute(VPTransformState &State) {
if (!hasResult())
continue;
assert(GeneratedValue && "generateInstruction must produce a value");
- State.set(this, GeneratedValue, Part);
+
+ bool IsVector = GeneratedValue->getType()->isVectorTy();
+ State.set(this, GeneratedValue, Part, !IsVector);
+ assert((IsVector || getOpcode() == VPInstruction::ComputeReductionResult ||
+ State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) &&
+ "scalar value but not only first lane used");
}
}
+
bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
if (Instruction::isBinaryOp(getOpcode()))
@@ -530,8 +537,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::BranchOnCount:
- // TODO: Cover additional operands.
- return getOperand(0) == Op;
+ return true;
};
llvm_unreachable("switch should return");
}
@@ -1344,7 +1350,7 @@ void VPVectorPointerRecipe ::execute(VPTransformState &State) {
PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
}
- State.set(this, PartPtr, Part);
+ State.set(this, PartPtr, Part, /*IsScalar*/ true);
}
}
@@ -1640,7 +1646,7 @@ void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
EntryPart->addIncoming(Start, VectorPH);
EntryPart->setDebugLoc(getDebugLoc());
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
- State.set(this, EntryPart, Part);
+ State.set(this, EntryPart, Part, /*IsScalar*/ true);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1711,7 +1717,7 @@ void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
#endif
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
- Value *CanonicalIV = State.get(getOperand(0), 0);
+ Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true);
Type *STy = CanonicalIV->getType();
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
ElementCount VF = State.VF;
@@ -1801,7 +1807,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
- State.set(this, EntryPart, Part);
+ State.set(this, EntryPart, Part, IsInLoop);
}
BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
@@ -1833,7 +1839,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
}
for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
- Value *EntryPart = State.get(this, Part);
+ Value *EntryPart = State.get(this, Part, IsInLoop);
// Make sure to add the reduction start value only to the
// first unroll part.
Value *StartVal = (Part == 0) ? StartV : Iden;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index b114716b7c13..1d2c17e91b7a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -73,12 +73,6 @@ protected:
// for multiple underlying IRs (Polly?) by providing a new VPlan front-end,
// back-end and analysis information for the new IR.
- // Set \p Val as the underlying Value of this VPValue.
- void setUnderlyingValue(Value *Val) {
- assert(!UnderlyingVal && "Underlying Value is already set.");
- UnderlyingVal = Val;
- }
-
public:
/// Return the underlying Value attached to this VPValue.
Value *getUnderlyingValue() { return UnderlyingVal; }
@@ -192,6 +186,12 @@ public:
/// is a live-in value.
/// TODO: Also handle recipes defined in pre-header blocks.
bool isDefinedOutsideVectorRegions() const { return !hasDefiningRecipe(); }
+
+ // Set \p Val as the underlying Value of this VPValue.
+ void setUnderlyingValue(Value *Val) {
+ assert(!UnderlyingVal && "Underlying Value is already set.");
+ UnderlyingVal = Val;
+ }
};
typedef DenseMap<Value *, VPValue *> Value2VPValueTy;
diff --git a/llvm/test/Analysis/CostModel/AArch64/ext-rhadd.ll b/llvm/test/Analysis/CostModel/AArch64/ext-rhadd.ll
index 94a37d944fc2..34791e876f11 100644
--- a/llvm/test/Analysis/CostModel/AArch64/ext-rhadd.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/ext-rhadd.ll
@@ -37,9 +37,9 @@ define void @srhadd_i8_sext_i16_scalable(ptr %a, ptr %b, ptr %dst) {
%ld2 = load <vscale x 16 x i8>, ptr %b
%ext1 = sext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
%ext2 = sext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
- %add1 = add nuw nsw <vscale x 16 x i16> %ext1, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+ %add1 = add nuw nsw <vscale x 16 x i16> %ext1, splat (i16 1)
%add2 = add nuw nsw <vscale x 16 x i16> %add1, %ext2
- %shr = lshr <vscale x 16 x i16> %add2, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+ %shr = lshr <vscale x 16 x i16> %add2, splat (i16 1)
%trunc = trunc <vscale x 16 x i16> %shr to <vscale x 16 x i8>
store <vscale x 16 x i8> %trunc, ptr %a
ret void
@@ -58,9 +58,9 @@ define void @srhadd_i16_sext_i64_scalable(ptr %a, ptr %b, ptr %dst) {
%ld2 = load <vscale x 8 x i16>, ptr %b
%ext1 = sext <vscale x 8 x i16> %ld1 to <vscale x 8 x i64>
%ext2 = sext <vscale x 8 x i16> %ld2 to <vscale x 8 x i64>
- %add1 = add nuw nsw <vscale x 8 x i64> %ext1, shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
+ %add1 = add nuw nsw <vscale x 8 x i64> %ext1, splat (i64 1)
%add2 = add nuw nsw <vscale x 8 x i64> %add1, %ext2
- %shr = lshr <vscale x 8 x i64> %add2, shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
+ %shr = lshr <vscale x 8 x i64> %add2, splat (i64 1)
%trunc = trunc <vscale x 8 x i64> %shr to <vscale x 8 x i16>
store <vscale x 8 x i16> %trunc, ptr %a
ret void
@@ -102,9 +102,9 @@ define void @urhadd_i8_zext_i64(ptr %a, ptr %b, ptr %dst) {
%ld2 = load <vscale x 16 x i8>, ptr %b
%ext1 = zext <vscale x 16 x i8> %ld1 to <vscale x 16 x i64>
%ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i64>
- %add1 = add nuw nsw <vscale x 16 x i64> %ext1, shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+ %add1 = add nuw nsw <vscale x 16 x i64> %ext1, splat (i64 1)
%add2 = add nuw nsw <vscale x 16 x i64> %add1, %ext2
- %shr = lshr <vscale x 16 x i64> %add2, shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+ %shr = lshr <vscale x 16 x i64> %add2, splat (i64 1)
%trunc = trunc <vscale x 16 x i64> %shr to <vscale x 16 x i8>
store <vscale x 16 x i8> %trunc, ptr %a
ret void
@@ -123,9 +123,9 @@ define void @urhadd_i16_zext_i32(ptr %a, ptr %b, ptr %dst) {
%ld2 = load <vscale x 8 x i16>, ptr %b
%ext1 = zext <vscale x 8 x i16> %ld1 to <vscale x 8 x i32>
%ext2 = zext <vscale x 8 x i16> %ld2 to <vscale x 8 x i32>
- %add1 = add nuw nsw <vscale x 8 x i32> %ext1, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ %add1 = add nuw nsw <vscale x 8 x i32> %ext1, splat (i32 1)
%add2 = add nuw nsw <vscale x 8 x i32> %add1, %ext2
- %shr = lshr <vscale x 8 x i32> %add2, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ %shr = lshr <vscale x 8 x i32> %add2, splat (i32 1)
%trunc = trunc <vscale x 8 x i32> %shr to <vscale x 8 x i16>
store <vscale x 8 x i16> %trunc, ptr %a
ret void
@@ -146,9 +146,9 @@ define void @ext_operand_mismatch(ptr %a, ptr %b, ptr %dst) {
%ld2 = load <vscale x 16 x i8>, ptr %b
%ext1 = sext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
%ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
- %add1 = add nuw nsw <vscale x 16 x i16> %ext1, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+ %add1 = add nuw nsw <vscale x 16 x i16> %ext1, splat (i16 1)
%add2 = add nuw nsw <vscale x 16 x i16> %add1, %ext2
- %shr = lshr <vscale x 16 x i16> %add2, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+ %shr = lshr <vscale x 16 x i16> %add2, splat (i16 1)
%trunc = trunc <vscale x 16 x i16> %shr to <vscale x 16 x i8>
store <vscale x 16 x i8> %trunc, ptr %a
ret void
@@ -167,9 +167,9 @@ define void @add_multiple_uses(ptr %a, ptr %b, ptr %dst) {
%ld2 = load <vscale x 8 x i16>, ptr %b
%ext1 = sext <vscale x 8 x i16> %ld1 to <vscale x 8 x i32>
%ext2 = sext <vscale x 8 x i16> %ld2 to <vscale x 8 x i32>
- %add1 = add nuw nsw <vscale x 8 x i32> %ext1, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ %add1 = add nuw nsw <vscale x 8 x i32> %ext1, splat (i32 1)
%add2 = add nuw nsw <vscale x 8 x i32> %add1, %ext2
- %shr = lshr <vscale x 8 x i32> %add2, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ %shr = lshr <vscale x 8 x i32> %add2, splat (i32 1)
%trunc = trunc <vscale x 8 x i32> %shr to <vscale x 8 x i16>
%add.res = add nuw nsw <vscale x 8 x i32> %add1, %add2
%res = trunc <vscale x 8 x i32> %add.res to <vscale x 8 x i16>
@@ -190,9 +190,9 @@ define void @shift_multiple_uses(ptr %a, ptr %b, ptr %dst) {
%ld2 = load <vscale x 16 x i8>, ptr %b
%ext1 = zext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
%ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
- %add1 = add nuw nsw <vscale x 16 x i16> %ext1, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+ %add1 = add nuw nsw <vscale x 16 x i16> %ext1, splat (i16 1)
%add2 = add nuw nsw <vscale x 16 x i16> %add1, %ext2
- %shr = lshr <vscale x 16 x i16> %add2, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+ %shr = lshr <vscale x 16 x i16> %add2, splat (i16 1)
%trunc = trunc <vscale x 16 x i16> %shr to <vscale x 16 x i8>
%add3 = add nuw nsw <vscale x 16 x i16> %shr, %add2
%res = trunc <vscale x 16 x i16> %add3 to <vscale x 16 x i8>
diff --git a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll
index 194fe5be40c2..76790d128d06 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll
@@ -595,30 +595,14 @@ entry:
}
define i16 @sminv_v3i16(<3 x i16> %a) {
-; CHECK-SD-LABEL: sminv_v3i16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: mov w8, #32767 // =0x7fff
-; CHECK-SD-NEXT: mov v0.h[3], w8
-; CHECK-SD-NEXT: sminv h0, v0.4h
-; CHECK-SD-NEXT: fmov w0, s0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: sminv_v3i16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: smov w8, v0.h[0]
-; CHECK-GI-NEXT: umov w9, v0.h[0]
-; CHECK-GI-NEXT: umov w10, v0.h[1]
-; CHECK-GI-NEXT: smov w11, v0.h[2]
-; CHECK-GI-NEXT: umov w13, v0.h[2]
-; CHECK-GI-NEXT: fmov w12, s1
-; CHECK-GI-NEXT: cmp w8, w12, sxth
-; CHECK-GI-NEXT: csel w8, w9, w10, lt
-; CHECK-GI-NEXT: cmp w11, w8, sxth
-; CHECK-GI-NEXT: csel w0, w8, w13, gt
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: sminv_v3i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-NEXT: mov v0.h[3], w8
+; CHECK-NEXT: sminv h0, v0.4h
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
entry:
%arg1 = call i16 @llvm.vector.reduce.smin.v3i16(<3 x i16> %a)
ret i16 %arg1
@@ -670,28 +654,13 @@ entry:
}
define i32 @sminv_v3i32(<3 x i32> %a) {
-; CHECK-SD-LABEL: sminv_v3i32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov w8, #2147483647 // =0x7fffffff
-; CHECK-SD-NEXT: mov v0.s[3], w8
-; CHECK-SD-NEXT: sminv s0, v0.4s
-; CHECK-SD-NEXT: fmov w0, s0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: sminv_v3i32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: mov s2, v0.s[2]
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: cmp w8, w9
-; CHECK-GI-NEXT: fmov w9, s2
-; CHECK-GI-NEXT: fcsel s0, s0, s1, lt
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: cmp w8, w9
-; CHECK-GI-NEXT: fcsel s0, s0, s2, lt
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: sminv_v3i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #2147483647 // =0x7fffffff
+; CHECK-NEXT: mov v0.s[3], w8
+; CHECK-NEXT: sminv s0, v0.4s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
entry:
%arg1 = call i32 @llvm.vector.reduce.smin.v3i32(<3 x i32> %a)
ret i32 %arg1
@@ -972,17 +941,10 @@ define i16 @smaxv_v3i16(<3 x i16> %a) {
; CHECK-GI-LABEL: smaxv_v3i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: smov w8, v0.h[0]
-; CHECK-GI-NEXT: umov w9, v0.h[0]
-; CHECK-GI-NEXT: umov w10, v0.h[1]
-; CHECK-GI-NEXT: smov w11, v0.h[2]
-; CHECK-GI-NEXT: umov w13, v0.h[2]
-; CHECK-GI-NEXT: fmov w12, s1
-; CHECK-GI-NEXT: cmp w8, w12, sxth
-; CHECK-GI-NEXT: csel w8, w9, w10, gt
-; CHECK-GI-NEXT: cmp w11, w8, sxth
-; CHECK-GI-NEXT: csel w0, w8, w13, lt
+; CHECK-GI-NEXT: mov w8, #32768 // =0x8000
+; CHECK-GI-NEXT: mov v0.h[3], w8
+; CHECK-GI-NEXT: smaxv h0, v0.4h
+; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
entry:
%arg1 = call i16 @llvm.vector.reduce.smax.v3i16(<3 x i16> %a)
@@ -1035,28 +997,13 @@ entry:
}
define i32 @smaxv_v3i32(<3 x i32> %a) {
-; CHECK-SD-LABEL: smaxv_v3i32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov w8, #-2147483648 // =0x80000000
-; CHECK-SD-NEXT: mov v0.s[3], w8
-; CHECK-SD-NEXT: smaxv s0, v0.4s
-; CHECK-SD-NEXT: fmov w0, s0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: smaxv_v3i32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: mov s2, v0.s[2]
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: cmp w8, w9
-; CHECK-GI-NEXT: fmov w9, s2
-; CHECK-GI-NEXT: fcsel s0, s0, s1, gt
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: cmp w8, w9
-; CHECK-GI-NEXT: fcsel s0, s0, s2, gt
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: smaxv_v3i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #-2147483648 // =0x80000000
+; CHECK-NEXT: mov v0.s[3], w8
+; CHECK-NEXT: smaxv s0, v0.4s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
entry:
%arg1 = call i32 @llvm.vector.reduce.smax.v3i32(<3 x i32> %a)
ret i32 %arg1
@@ -1335,17 +1282,10 @@ define i16 @uminv_v3i16(<3 x i16> %a) {
; CHECK-GI-LABEL: uminv_v3i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: umov w8, v0.h[0]
-; CHECK-GI-NEXT: umov w9, v0.h[0]
-; CHECK-GI-NEXT: umov w10, v0.h[1]
-; CHECK-GI-NEXT: umov w11, v0.h[2]
-; CHECK-GI-NEXT: umov w13, v0.h[2]
-; CHECK-GI-NEXT: fmov w12, s1
-; CHECK-GI-NEXT: cmp w8, w12, uxth
-; CHECK-GI-NEXT: csel w8, w9, w10, lo
-; CHECK-GI-NEXT: cmp w11, w8, uxth
-; CHECK-GI-NEXT: csel w0, w8, w13, hi
+; CHECK-GI-NEXT: mov w8, #65535 // =0xffff
+; CHECK-GI-NEXT: mov v0.h[3], w8
+; CHECK-GI-NEXT: uminv h0, v0.4h
+; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
entry:
%arg1 = call i16 @llvm.vector.reduce.umin.v3i16(<3 x i16> %a)
@@ -1398,28 +1338,13 @@ entry:
}
define i32 @uminv_v3i32(<3 x i32> %a) {
-; CHECK-SD-LABEL: uminv_v3i32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov w8, #-1 // =0xffffffff
-; CHECK-SD-NEXT: mov v0.s[3], w8
-; CHECK-SD-NEXT: uminv s0, v0.4s
-; CHECK-SD-NEXT: fmov w0, s0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: uminv_v3i32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: mov s2, v0.s[2]
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: cmp w8, w9
-; CHECK-GI-NEXT: fmov w9, s2
-; CHECK-GI-NEXT: fcsel s0, s0, s1, lo
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: cmp w8, w9
-; CHECK-GI-NEXT: fcsel s0, s0, s2, lo
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: uminv_v3i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-NEXT: mov v0.s[3], w8
+; CHECK-NEXT: uminv s0, v0.4s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
entry:
%arg1 = call i32 @llvm.vector.reduce.umin.v3i32(<3 x i32> %a)
ret i32 %arg1
@@ -1697,17 +1622,10 @@ define i16 @umaxv_v3i16(<3 x i16> %a) {
; CHECK-GI-LABEL: umaxv_v3i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: umov w8, v0.h[0]
-; CHECK-GI-NEXT: umov w9, v0.h[0]
-; CHECK-GI-NEXT: umov w10, v0.h[1]
-; CHECK-GI-NEXT: umov w11, v0.h[2]
-; CHECK-GI-NEXT: umov w13, v0.h[2]
-; CHECK-GI-NEXT: fmov w12, s1
-; CHECK-GI-NEXT: cmp w8, w12, uxth
-; CHECK-GI-NEXT: csel w8, w9, w10, hi
-; CHECK-GI-NEXT: cmp w11, w8, uxth
-; CHECK-GI-NEXT: csel w0, w8, w13, lo
+; CHECK-GI-NEXT: mov w8, #0 // =0x0
+; CHECK-GI-NEXT: mov v0.h[3], w8
+; CHECK-GI-NEXT: umaxv h0, v0.4h
+; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
entry:
%arg1 = call i16 @llvm.vector.reduce.umax.v3i16(<3 x i16> %a)
@@ -1760,27 +1678,12 @@ entry:
}
define i32 @umaxv_v3i32(<3 x i32> %a) {
-; CHECK-SD-LABEL: umaxv_v3i32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov v0.s[3], wzr
-; CHECK-SD-NEXT: umaxv s0, v0.4s
-; CHECK-SD-NEXT: fmov w0, s0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: umaxv_v3i32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: mov s2, v0.s[2]
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: cmp w8, w9
-; CHECK-GI-NEXT: fmov w9, s2
-; CHECK-GI-NEXT: fcsel s0, s0, s1, hi
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: cmp w8, w9
-; CHECK-GI-NEXT: fcsel s0, s0, s2, hi
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: umaxv_v3i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v0.s[3], wzr
+; CHECK-NEXT: umaxv s0, v0.4s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
entry:
%arg1 = call i32 @llvm.vector.reduce.umax.v3i32(<3 x i32> %a)
ret i32 %arg1
diff --git a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll
index 7244ac949ab8..9a4e01a29ecb 100644
--- a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll
+++ b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll
@@ -14,12 +14,12 @@ define void @array_1D(ptr %addr) #0 {
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: st1d { z2.d }, p0, [sp]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -81,18 +81,18 @@ define void @array_2D(ptr %addr) #0 {
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 48 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #5, mul vl]
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0, #4, mul vl]
-; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0, #3, mul vl]
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: st1d { z1.d }, p0, [sp, #5, mul vl]
-; CHECK-NEXT: st1d { z3.d }, p0, [sp, #4, mul vl]
-; CHECK-NEXT: st1d { z5.d }, p0, [sp, #3, mul vl]
-; CHECK-NEXT: st1d { z4.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #5, mul vl]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0]
+; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: st1d { z0.d }, p0, [sp, #5, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [sp, #4, mul vl]
+; CHECK-NEXT: st1d { z3.d }, p0, [sp, #3, mul vl]
+; CHECK-NEXT: st1d { z5.d }, p0, [sp, #2, mul vl]
+; CHECK-NEXT: st1d { z4.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: st1d { z2.d }, p0, [sp]
; CHECK-NEXT: addvl sp, sp, #6
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll
index f03a6f018d34..7292d52aaf47 100644
--- a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll
+++ b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll
@@ -13,12 +13,12 @@ define void @test(ptr %addr) #0 {
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: st1d { z2.d }, p0, [sp]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
index 0083818def15..bb9ba05f7a27 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
@@ -147,7 +147,7 @@ define void @has_varargs(...) nounwind {
; CHECK-NEXT: add x29, sp, #160
; CHECK-NEXT: .seh_add_fp 160
; CHECK-NEXT: .seh_endprologue
-; CHECK-NEXT: mov x4, sp
+; CHECK-NEXT: add x4, x4, #32
; CHECK-NEXT: mov x5, xzr
; CHECK-NEXT: blr x9
; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_ret
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
index 2627f2a4fb5e..742a7099559f 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
@@ -40,11 +40,11 @@ entry:
%7 = fmul fast <vscale x 2 x double> %2, %0
%8 = fmul fast <vscale x 2 x double> %3, %1
%9 = fsub fast <vscale x 2 x double> %7, %8
- %10 = fmul fast <vscale x 2 x double> %9, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
- %11 = fmul fast <vscale x 2 x double> %6, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.100000e+01, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+ %10 = fmul fast <vscale x 2 x double> %9, splat (double 3.000000e+00)
+ %11 = fmul fast <vscale x 2 x double> %6, splat (double 1.100000e+01)
%12 = fadd fast <vscale x 2 x double> %10, %11
- %13 = fmul fast <vscale x 2 x double> %9, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.100000e+01, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
- %14 = fmul fast <vscale x 2 x double> %6, shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 3.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+ %13 = fmul fast <vscale x 2 x double> %9, splat (double 1.100000e+01)
+ %14 = fmul fast <vscale x 2 x double> %6, splat (double 3.000000e+00)
%15 = fsub fast <vscale x 2 x double> %13, %14
%interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %15, <vscale x 2 x double> %12)
ret <vscale x 4 x double> %interleaved.vec
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
index 4e50cb11be71..4a2e85c715f7 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
@@ -63,7 +63,7 @@ define fastcc i8 @allocno_reload_assign() {
br label %1
1: ; preds = %1, %0
- call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x ptr> zeroinitializer, i32 0, <vscale x 16 x i1> xor (<vscale x 16 x i1> shufflevector (<vscale x 16 x i1> icmp eq (<vscale x 16 x ptr> insertelement (<vscale x 16 x ptr> poison, ptr null, i64 0), <vscale x 16 x ptr> zeroinitializer), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i32 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)))
+ call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x ptr> zeroinitializer, i32 0, <vscale x 16 x i1> xor (<vscale x 16 x i1> shufflevector (<vscale x 16 x i1> icmp eq (<vscale x 16 x ptr> insertelement (<vscale x 16 x ptr> poison, ptr null, i64 0), <vscale x 16 x ptr> zeroinitializer), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> splat (i1 true)))
br label %1
}
diff --git a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll
index 8f3100c82772..03e64f8b785b 100644
--- a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -525,12 +525,10 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind {
define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
; CHECK-LABEL: fdiv_pow_shl_cnt:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8 // =0x8
-; CHECK-NEXT: and x9, x0, #0x1f
-; CHECK-NEXT: fmov s1, #-0.50000000
-; CHECK-NEXT: lsl x8, x8, x9
-; CHECK-NEXT: scvtf s0, x8
-; CHECK-NEXT: fdiv s0, s1, s0
+; CHECK-NEXT: mov w8, #-1115684864 // =0xbd800000
+; CHECK-NEXT: and w9, w0, #0x1f
+; CHECK-NEXT: sub w8, w8, w9, lsl #23
+; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: ret
%cnt = and i64 %cnt_in, 31
%shl = shl i64 8, %cnt
@@ -613,9 +611,9 @@ define <vscale x 4 x float> @fdiv_pow2_nx4xfloat(<vscale x 4 x i32> %i) "target-
; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
; CHECK-NEXT: fdivr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
- %p2 = shl <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %i
+ %p2 = shl <vscale x 4 x i32> splat (i32 1), %i
%p2_f = uitofp <vscale x 4 x i32> %p2 to <vscale x 4 x float>
- %r = fdiv <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 9.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), %p2_f
+ %r = fdiv <vscale x 4 x float> splat (float 9.000000e+00), %p2_f
ret <vscale x 4 x float> %r
}
@@ -628,6 +626,6 @@ define <vscale x 2 x double> @scalable2(<vscale x 2 x i64> %0) "target-features"
; CHECK-NEXT: fdivr z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: ret
%2 = uitofp <vscale x 2 x i64> %0 to <vscale x 2 x double>
- %3 = fdiv <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer), %2
+ %3 = fdiv <vscale x 2 x double> splat (double 1.000000e+00), %2
ret <vscale x 2 x double> %3
}
diff --git a/llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll b/llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll
index ddf98b2c971e..a949eaac5cfa 100644
--- a/llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll
+++ b/llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll
@@ -134,5 +134,90 @@ entry:
}
+define <16 x i8> @insertzero_v8i8(<8 x i8> %a) {
+; CHECK-LABEL: insertzero_v8i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov d0, d0
+; CHECK-NEXT: ret
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %shuffle.i
+}
+
+define <8 x i16> @insertzero_v4i16(<4 x i16> %a) {
+; CHECK-LABEL: insertzero_v4i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov d0, d0
+; CHECK-NEXT: ret
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @insertzero_v2i32(<2 x i32> %a) {
+; CHECK-LABEL: insertzero_v2i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov d0, d0
+; CHECK-NEXT: ret
+entry:
+ %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @insertzero_v1i64(<1 x i64> %a) {
+; CHECK-LABEL: insertzero_v1i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov d0, d0
+; CHECK-NEXT: ret
+entry:
+ %shuffle.i = shufflevector <1 x i64> %a, <1 x i64> zeroinitializer, <2 x i32> <i32 0, i32 1>
+ ret <2 x i64> %shuffle.i
+}
+
+define <8 x half> @insertzero_v4f16(<4 x half> %a) {
+; CHECK-LABEL: insertzero_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov d0, d0
+; CHECK-NEXT: ret
+entry:
+ %shuffle.i = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x half> %shuffle.i
+}
+
+define <8 x bfloat> @insertzero_v4bf16(<4 x bfloat> %a) {
+; CHECK-LABEL: insertzero_v4bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi d4, #0000000000000000
+; CHECK-NEXT: movi d5, #0000000000000000
+; CHECK-NEXT: movi d6, #0000000000000000
+; CHECK-NEXT: movi d7, #0000000000000000
+; CHECK-NEXT: ret
+entry:
+ %shuffle.i = shufflevector <4 x bfloat> %a, <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x bfloat> %shuffle.i
+}
+
+define <4 x float> @insertzero_v2f32(<2 x float> %a) {
+; CHECK-LABEL: insertzero_v2f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov d0, d0
+; CHECK-NEXT: ret
+entry:
+ %shuffle.i = shufflevector <2 x float> %a, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x float> %shuffle.i
+}
+
+define <2 x double> @insertzero_v1f64(<1 x double> %a) {
+; CHECK-LABEL: insertzero_v1f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov d0, d0
+; CHECK-NEXT: ret
+entry:
+ %shuffle.i = shufflevector <1 x double> %a, <1 x double> zeroinitializer, <2 x i32> <i32 0, i32 1>
+ ret <2 x double> %shuffle.i
+}
+
+
+
declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>)
diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll
new file mode 100644
index 000000000000..7f4540d915ab
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/load.ll
@@ -0,0 +1,318 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; ===== Legal Scalars =====
+
+define i8 @load_i8(ptr %ptr){
+; CHECK-LABEL: load_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrb w0, [x0]
+; CHECK-NEXT: ret
+ %a = load i8 , ptr %ptr
+ ret i8 %a
+}
+
+define i16 @load_i16(ptr %ptr){
+; CHECK-LABEL: load_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w0, [x0]
+; CHECK-NEXT: ret
+ %a = load i16 , ptr %ptr
+ ret i16 %a
+}
+
+define i32 @load_i32(ptr %ptr){
+; CHECK-LABEL: load_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w0, [x0]
+; CHECK-NEXT: ret
+ %a = load i32 , ptr %ptr
+ ret i32 %a
+}
+
+define i64 @load_i64(ptr %ptr){
+; CHECK-LABEL: load_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr x0, [x0]
+; CHECK-NEXT: ret
+ %a = load i64 , ptr %ptr
+ ret i64 %a
+}
+
+; ===== Legal Vector Types =====
+
+define <8 x i8> @load_v8i8(ptr %ptr){
+; CHECK-LABEL: load_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ret
+ %a = load <8 x i8>, ptr %ptr
+ ret <8 x i8> %a
+}
+
+define <16 x i8> @load_v16i8(ptr %ptr){
+; CHECK-LABEL: load_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ret
+ %a = load <16 x i8>, ptr %ptr
+ ret <16 x i8> %a
+}
+
+define <4 x i16> @load_v4i16(ptr %ptr){
+; CHECK-LABEL: load_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ret
+ %a = load <4 x i16>, ptr %ptr
+ ret <4 x i16> %a
+}
+
+define <8 x i16> @load_v8i16(ptr %ptr){
+; CHECK-LABEL: load_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ret
+ %a = load <8 x i16>, ptr %ptr
+ ret <8 x i16> %a
+}
+
+define <2 x i32> @load_v2i32(ptr %ptr){
+; CHECK-LABEL: load_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ret
+ %a = load <2 x i32>, ptr %ptr
+ ret <2 x i32> %a
+}
+
+define <4 x i32> @load_v4i32(ptr %ptr){
+; CHECK-LABEL: load_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ret
+ %a = load <4 x i32>, ptr %ptr
+ ret <4 x i32> %a
+}
+
+define <2 x i64> @load_v2i64(ptr %ptr){
+; CHECK-LABEL: load_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ret
+ %a = load <2 x i64>, ptr %ptr
+ ret <2 x i64> %a
+}
+
+; ===== Smaller/Larger Width Vectors with Legal Element Sizes =====
+
+define <2 x i8> @load_v2i8(ptr %ptr, <2 x i8> %b){
+; CHECK-SD-LABEL: load_v2i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0]
+; CHECK-SD-NEXT: add x8, x0, #1
+; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: load_v2i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr b0, [x0]
+; CHECK-GI-NEXT: ldr b1, [x0, #1]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
+ %a = load <2 x i8>, ptr %ptr
+ ret <2 x i8> %a
+}
+
+define i32 @load_v4i8(ptr %ptr, <4 x i8> %b){
+; CHECK-LABEL: load_v4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w0, [x0]
+; CHECK-NEXT: ret
+ %a = load <4 x i8>, ptr %ptr
+ %c = bitcast <4 x i8> %a to i32
+ ret i32 %c
+}
+
+define <32 x i8> @load_v32i8(ptr %ptr){
+; CHECK-LABEL: load_v32i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ret
+ %a = load <32 x i8>, ptr %ptr
+ ret <32 x i8> %a
+}
+
+define <2 x i16> @load_v2i16(ptr %ptr){
+; CHECK-SD-LABEL: load_v2i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
+; CHECK-SD-NEXT: add x8, x0, #2
+; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: load_v2i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr h0, [x0]
+; CHECK-GI-NEXT: ldr h1, [x0, #2]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
+ %a = load <2 x i16>, ptr %ptr
+ ret <2 x i16> %a
+}
+
+define <16 x i16> @load_v16i16(ptr %ptr){
+; CHECK-LABEL: load_v16i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ret
+ %a = load <16 x i16>, ptr %ptr
+ ret <16 x i16> %a
+}
+
+define <1 x i32> @load_v1i32(ptr %ptr){
+; CHECK-LABEL: load_v1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: ret
+ %a = load <1 x i32>, ptr %ptr
+ ret <1 x i32> %a
+}
+
+define <8 x i32> @load_v8i32(ptr %ptr){
+; CHECK-LABEL: load_v8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ret
+ %a = load <8 x i32>, ptr %ptr
+ ret <8 x i32> %a
+}
+
+define <4 x i64> @load_v4i64(ptr %ptr){
+; CHECK-LABEL: load_v4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ret
+ %a = load <4 x i64>, ptr %ptr
+ ret <4 x i64> %a
+}
+
+; ===== Vectors with Non-Pow 2 Widths =====
+
+define <3 x i8> @load_v3i8(ptr %ptr){
+; CHECK-SD-LABEL: load_v3i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: umov w0, v0.b[0]
+; CHECK-SD-NEXT: umov w1, v0.b[1]
+; CHECK-SD-NEXT: umov w2, v0.b[2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: load_v3i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldrb w8, [x0]
+; CHECK-GI-NEXT: ldrb w1, [x0, #1]
+; CHECK-GI-NEXT: ldrb w2, [x0, #2]
+; CHECK-GI-NEXT: mov w0, w8
+; CHECK-GI-NEXT: ret
+ %a = load <3 x i8>, ptr %ptr
+ ret <3 x i8> %a
+}
+
+define <7 x i8> @load_v7i8(ptr %ptr){
+; CHECK-SD-LABEL: load_v7i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: load_v7i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr b0, [x0]
+; CHECK-GI-NEXT: ldr b1, [x0, #1]
+; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
+; CHECK-GI-NEXT: ldr b1, [x0, #2]
+; CHECK-GI-NEXT: mov v0.b[2], v1.b[0]
+; CHECK-GI-NEXT: ldr b1, [x0, #3]
+; CHECK-GI-NEXT: mov v0.b[3], v1.b[0]
+; CHECK-GI-NEXT: ldr b1, [x0, #4]
+; CHECK-GI-NEXT: mov v0.b[4], v1.b[0]
+; CHECK-GI-NEXT: ldr b1, [x0, #5]
+; CHECK-GI-NEXT: mov v0.b[5], v1.b[0]
+; CHECK-GI-NEXT: ldr b1, [x0, #6]
+; CHECK-GI-NEXT: mov v0.b[6], v1.b[0]
+; CHECK-GI-NEXT: mov v0.b[7], v0.b[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
+ %a = load <7 x i8>, ptr %ptr
+ ret <7 x i8> %a
+}
+
+define <3 x i16> @load_v3i16(ptr %ptr){
+; CHECK-SD-LABEL: load_v3i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: load_v3i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr h0, [x0]
+; CHECK-GI-NEXT: ldr h1, [x0, #2]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: ldr h1, [x0, #4]
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[3], v0.h[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: ret
+ %a = load <3 x i16>, ptr %ptr
+ ret <3 x i16> %a
+}
+
+define <7 x i16> @load_v7i16(ptr %ptr){
+; CHECK-SD-LABEL: load_v7i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: load_v7i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr h0, [x0]
+; CHECK-GI-NEXT: ldr h1, [x0, #2]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: ldr h1, [x0, #4]
+; CHECK-GI-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT: ldr h1, [x0, #6]
+; CHECK-GI-NEXT: mov v0.h[3], v1.h[0]
+; CHECK-GI-NEXT: ldr h1, [x0, #8]
+; CHECK-GI-NEXT: mov v0.h[4], v1.h[0]
+; CHECK-GI-NEXT: ldr h1, [x0, #10]
+; CHECK-GI-NEXT: mov v0.h[5], v1.h[0]
+; CHECK-GI-NEXT: ldr h1, [x0, #12]
+; CHECK-GI-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-GI-NEXT: mov v0.h[7], v0.h[0]
+; CHECK-GI-NEXT: ret
+ %a = load <7 x i16>, ptr %ptr
+ ret <7 x i16> %a
+}
+
+define <3 x i32> @load_v3i32(ptr %ptr){
+; CHECK-SD-LABEL: load_v3i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr q0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: load_v3i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldp s0, s1, [x0]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT: ldr s1, [x0, #8]
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[3], v0.s[0]
+; CHECK-GI-NEXT: ret
+ %a = load <3 x i32>, ptr %ptr
+ ret <3 x i32> %a
+}
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index 2a78012045ff..cd348be5d771 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -fast-isel=true -global-isel=false -fast-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sme < %s \
+; RUN: llc -fast-isel=true -global-isel=false -fast-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s \
; RUN: | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-FISEL
-; RUN: llc -fast-isel=false -global-isel=true -global-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sme < %s \
+; RUN: llc -fast-isel=false -global-isel=true -global-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s \
; RUN: | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-GISEL
@@ -447,3 +447,64 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati
%res = frem float %a, %b
ret float %res
}
+
+;
+; Check ZT0 State
+;
+
+declare double @zt0_shared_callee(double) "aarch64_inout_zt0"
+
+define double @zt0_new_caller_to_zt0_shared_callee(double %x) nounwind noinline optnone "aarch64_new_zt0" {
+; CHECK-COMMON-LABEL: zt0_new_caller_to_zt0_shared_callee:
+; CHECK-COMMON: // %bb.0: // %prelude
+; CHECK-COMMON-NEXT: sub sp, sp, #80
+; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-COMMON-NEXT: cbz x8, .LBB13_2
+; CHECK-COMMON-NEXT: b .LBB13_1
+; CHECK-COMMON-NEXT: .LBB13_1: // %save.za
+; CHECK-COMMON-NEXT: mov x8, sp
+; CHECK-COMMON-NEXT: str zt0, [x8]
+; CHECK-COMMON-NEXT: bl __arm_tpidr2_save
+; CHECK-COMMON-NEXT: ldr zt0, [x8]
+; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-COMMON-NEXT: b .LBB13_2
+; CHECK-COMMON-NEXT: .LBB13_2: // %entry
+; CHECK-COMMON-NEXT: smstart za
+; CHECK-COMMON-NEXT: zero { zt0 }
+; CHECK-COMMON-NEXT: bl zt0_shared_callee
+; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000
+; CHECK-COMMON-NEXT: fmov d1, x8
+; CHECK-COMMON-NEXT: fadd d0, d0, d1
+; CHECK-COMMON-NEXT: smstop za
+; CHECK-COMMON-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: add sp, sp, #80
+; CHECK-COMMON-NEXT: ret
+entry:
+ %call = call double @zt0_shared_callee(double %x)
+ %add = fadd double %call, 4.200000e+01
+ ret double %add;
+}
+
+define double @zt0_shared_caller_to_normal_callee(double %x) nounwind noinline optnone "aarch64_inout_zt0" {
+; CHECK-COMMON-LABEL: zt0_shared_caller_to_normal_callee:
+; CHECK-COMMON: // %bb.0: // %entry
+; CHECK-COMMON-NEXT: sub sp, sp, #80
+; CHECK-COMMON-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: mov x19, sp
+; CHECK-COMMON-NEXT: str zt0, [x19]
+; CHECK-COMMON-NEXT: smstop za
+; CHECK-COMMON-NEXT: bl normal_callee
+; CHECK-COMMON-NEXT: smstart za
+; CHECK-COMMON-NEXT: ldr zt0, [x19]
+; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000
+; CHECK-COMMON-NEXT: fmov d1, x8
+; CHECK-COMMON-NEXT: fadd d0, d0, d1
+; CHECK-COMMON-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: add sp, sp, #80
+; CHECK-COMMON-NEXT: ret
+entry:
+ %call = call double @normal_callee(double %x)
+ %add = fadd double %call, 4.200000e+01
+ ret double %add;
+}
diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll
new file mode 100644
index 000000000000..bf22d79a4df9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/store.ll
@@ -0,0 +1,342 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; ===== Legal Scalars =====
+define void @store_i8(i8 %a, ptr %ptr){
+; CHECK-LABEL: store_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: strb w0, [x1]
+; CHECK-NEXT: ret
+ store i8 %a, ptr %ptr
+ ret void
+}
+
+define void @store_i16(i16 %a, ptr %ptr){
+; CHECK-LABEL: store_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: strh w0, [x1]
+; CHECK-NEXT: ret
+ store i16 %a, ptr %ptr
+ ret void
+}
+
+define void @store_i32(i32 %a, ptr %ptr){
+; CHECK-LABEL: store_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str w0, [x1]
+; CHECK-NEXT: ret
+ store i32 %a, ptr %ptr
+ ret void
+}
+
+define void @store_i64(i64 %a, ptr %ptr){
+; CHECK-LABEL: store_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: ret
+ store i64 %a, ptr %ptr
+ ret void
+}
+
+; ===== Legal Vector Types =====
+
+define void @store_v8i8(<8 x i8> %a, ptr %ptr){
+; CHECK-LABEL: store_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+ store <8 x i8> %a, ptr %ptr
+ ret void
+}
+
+define void @store_v16i8(<16 x i8> %a, ptr %ptr){
+; CHECK-LABEL: store_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ store <16 x i8> %a, ptr %ptr
+ ret void
+}
+
+define void @store_v4i16(<4 x i16> %a, ptr %ptr){
+; CHECK-LABEL: store_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+ store <4 x i16> %a, ptr %ptr
+ ret void
+}
+
+define void @store_v8i16(<8 x i16> %a, ptr %ptr){
+; CHECK-LABEL: store_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ store <8 x i16> %a, ptr %ptr
+ ret void
+}
+
+define void @store_v2i32(<2 x i32> %a, ptr %ptr){
+; CHECK-LABEL: store_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+ store <2 x i32> %a, ptr %ptr
+ ret void
+}
+
+define void @store_v4i32(<4 x i32> %a, ptr %ptr){
+; CHECK-LABEL: store_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ store <4 x i32> %a, ptr %ptr
+ ret void
+}
+
+define void @store_v2i64(<2 x i64> %a, ptr %ptr){
+; CHECK-LABEL: store_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ store <2 x i64> %a, ptr %ptr
+ ret void
+}
+
+; ===== Smaller/Larger Width Vectors with Legal Element Sizes =====
+
+define void @store_v2i8(<2 x i8> %a, ptr %ptr){
+; CHECK-SD-LABEL: store_v2i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: fmov w9, s0
+; CHECK-SD-NEXT: strb w9, [x0]
+; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: store_v2i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NEXT: str b0, [x0]
+; CHECK-GI-NEXT: str b1, [x0, #1]
+; CHECK-GI-NEXT: ret
+ store <2 x i8> %a, ptr %ptr
+ ret void
+}
+
+define void @store_v4i8(i32 %a, ptr %ptr) {
+; CHECK-LABEL: store_v4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str w0, [x1]
+; CHECK-NEXT: ret
+ %c = bitcast i32 %a to <4 x i8>
+ store <4 x i8> %c, ptr %ptr
+ ret void
+}
+
+define void @store_v32i8(<32 x i8> %a, ptr %ptr){
+; CHECK-LABEL: store_v32i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: ret
+ store <32 x i8> %a, ptr %ptr
+ ret void
+}
+
+define void @store_v2i16(<2 x i16> %a, ptr %ptr){
+; CHECK-SD-LABEL: store_v2i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: fmov w9, s0
+; CHECK-SD-NEXT: strh w9, [x0]
+; CHECK-SD-NEXT: strh w8, [x0, #2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: store_v2i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NEXT: str h0, [x0]
+; CHECK-GI-NEXT: str h1, [x0, #2]
+; CHECK-GI-NEXT: ret
+ store <2 x i16> %a, ptr %ptr
+ ret void
+}
+
+define void @store_v16i16(<16 x i16> %a, ptr %ptr){
+; CHECK-LABEL: store_v16i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: ret
+ store <16 x i16> %a, ptr %ptr
+ ret void
+}
+
+define void @store_v1i32(<1 x i32> %a, ptr %ptr){
+; CHECK-SD-LABEL: store_v1i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: str s0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: store_v1i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: str s0, [x0]
+; CHECK-GI-NEXT: ret
+ store <1 x i32> %a, ptr %ptr
+ ret void
+}
+
+define void @store_v8i32(<8 x i32> %a, ptr %ptr){
+; CHECK-LABEL: store_v8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: ret
+ store <8 x i32> %a, ptr %ptr
+ ret void
+}
+
+define void @store_v4i64(<4 x i64> %a, ptr %ptr){
+; CHECK-LABEL: store_v4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: ret
+ store <4 x i64> %a, ptr %ptr
+ ret void
+}
+
+; ===== Vectors with Non-Pow 2 Widths =====
+
+define void @store_v3i8(<3 x i8> %a, ptr %ptr){
+; CHECK-SD-LABEL: store_v3i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sub sp, sp, #16
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: fmov s0, w0
+; CHECK-SD-NEXT: mov v0.h[1], w1
+; CHECK-SD-NEXT: mov v0.h[2], w2
+; CHECK-SD-NEXT: xtn v0.8b, v0.8h
+; CHECK-SD-NEXT: str s0, [sp, #12]
+; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: strb w2, [x3, #2]
+; CHECK-SD-NEXT: strh w8, [x3]
+; CHECK-SD-NEXT: add sp, sp, #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: store_v3i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: strb w0, [x3]
+; CHECK-GI-NEXT: strb w1, [x3, #1]
+; CHECK-GI-NEXT: strb w2, [x3, #2]
+; CHECK-GI-NEXT: ret
+ store <3 x i8> %a, ptr %ptr
+ ret void
+}
+
+define void @store_v7i8(<7 x i8> %a, ptr %ptr){
+; CHECK-SD-LABEL: store_v7i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: add x8, x0, #6
+; CHECK-SD-NEXT: add x9, x0, #4
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: str s0, [x0]
+; CHECK-SD-NEXT: st1 { v0.b }[6], [x8]
+; CHECK-SD-NEXT: st1 { v0.h }[2], [x9]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: store_v7i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: add x8, x0, #1
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: add x9, x0, #2
+; CHECK-GI-NEXT: st1 { v0.b }[0], [x0]
+; CHECK-GI-NEXT: st1 { v0.b }[1], [x8]
+; CHECK-GI-NEXT: add x8, x0, #3
+; CHECK-GI-NEXT: st1 { v0.b }[3], [x8]
+; CHECK-GI-NEXT: add x8, x0, #4
+; CHECK-GI-NEXT: st1 { v0.b }[4], [x8]
+; CHECK-GI-NEXT: add x8, x0, #5
+; CHECK-GI-NEXT: st1 { v0.b }[5], [x8]
+; CHECK-GI-NEXT: add x8, x0, #6
+; CHECK-GI-NEXT: st1 { v0.b }[2], [x9]
+; CHECK-GI-NEXT: st1 { v0.b }[6], [x8]
+; CHECK-GI-NEXT: ret
+ store <7 x i8> %a, ptr %ptr
+ ret void
+}
+
+define void @store_v3i16(<3 x i16> %a, ptr %ptr){
+; CHECK-SD-LABEL: store_v3i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: add x8, x0, #4
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: str s0, [x0]
+; CHECK-SD-NEXT: st1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: store_v3i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: add x8, x0, #2
+; CHECK-GI-NEXT: add x9, x0, #4
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str h0, [x0]
+; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT: st1 { v0.h }[2], [x9]
+; CHECK-GI-NEXT: ret
+ store <3 x i16> %a, ptr %ptr
+ ret void
+}
+
+define void @store_v7i16(<7 x i16> %a, ptr %ptr){
+; CHECK-SD-LABEL: store_v7i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: add x8, x0, #12
+; CHECK-SD-NEXT: add x9, x0, #8
+; CHECK-SD-NEXT: str d0, [x0]
+; CHECK-SD-NEXT: st1 { v0.h }[6], [x8]
+; CHECK-SD-NEXT: st1 { v0.s }[2], [x9]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: store_v7i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: add x8, x0, #2
+; CHECK-GI-NEXT: add x9, x0, #4
+; CHECK-GI-NEXT: str h0, [x0]
+; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT: add x8, x0, #6
+; CHECK-GI-NEXT: st1 { v0.h }[3], [x8]
+; CHECK-GI-NEXT: add x8, x0, #8
+; CHECK-GI-NEXT: st1 { v0.h }[4], [x8]
+; CHECK-GI-NEXT: add x8, x0, #10
+; CHECK-GI-NEXT: st1 { v0.h }[5], [x8]
+; CHECK-GI-NEXT: add x8, x0, #12
+; CHECK-GI-NEXT: st1 { v0.h }[2], [x9]
+; CHECK-GI-NEXT: st1 { v0.h }[6], [x8]
+; CHECK-GI-NEXT: ret
+ store <7 x i16> %a, ptr %ptr
+ ret void
+}
+
+define void @store_v3i32(<3 x i32> %a, ptr %ptr){
+; CHECK-SD-LABEL: store_v3i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: add x8, x0, #8
+; CHECK-SD-NEXT: str d0, [x0]
+; CHECK-SD-NEXT: st1 { v0.s }[2], [x8]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: store_v3i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: add x8, x0, #4
+; CHECK-GI-NEXT: add x9, x0, #8
+; CHECK-GI-NEXT: str s0, [x0]
+; CHECK-GI-NEXT: st1 { v0.s }[1], [x8]
+; CHECK-GI-NEXT: st1 { v0.s }[2], [x9]
+; CHECK-GI-NEXT: ret
+ store <3 x i32> %a, ptr %ptr
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
index 3965af6a9066..56b023086ea2 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
@@ -724,7 +724,7 @@ define void @verify_all_operands_are_initialised() {
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
- call void @func_f8_and_v0_passed_via_memory(float 0.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 9.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer))
+ call void @func_f8_and_v0_passed_via_memory(float 0.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, <vscale x 4 x float> splat (float 9.000000e+00))
ret void
}
diff --git a/llvm/test/CodeGen/AArch64/sve-expand-div.ll b/llvm/test/CodeGen/AArch64/sve-expand-div.ll
index 5469c29f1aa7..fe5cdc938772 100644
--- a/llvm/test/CodeGen/AArch64/sve-expand-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-expand-div.ll
@@ -16,7 +16,7 @@ define <vscale x 16 x i8> @sdiv_i8(<vscale x 16 x i8> %a) #0 {
; CHECK-NEXT: lsr z1.b, z0.b, #7
; CHECK-NEXT: add z0.b, z0.b, z1.b
; CHECK-NEXT: ret
- %div = sdiv <vscale x 16 x i8> %a, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> undef, i8 3, i32 0), <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer)
+ %div = sdiv <vscale x 16 x i8> %a, splat (i8 3)
ret <vscale x 16 x i8> %div
}
@@ -30,7 +30,7 @@ define <vscale x 8 x i16> @sdiv_i16(<vscale x 8 x i16> %a) #0 {
; CHECK-NEXT: lsr z1.h, z0.h, #15
; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: ret
- %div = sdiv <vscale x 8 x i16> %a, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> undef, i16 3, i32 0), <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer)
+ %div = sdiv <vscale x 8 x i16> %a, splat (i16 3)
ret <vscale x 8 x i16> %div
}
@@ -45,7 +45,7 @@ define <vscale x 4 x i32> @sdiv_i32(<vscale x 4 x i32> %a) #0 {
; CHECK-NEXT: lsr z1.s, z0.s, #31
; CHECK-NEXT: add z0.s, z0.s, z1.s
; CHECK-NEXT: ret
- %div = sdiv <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 3, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer)
+ %div = sdiv <vscale x 4 x i32> %a, splat (i32 3)
ret <vscale x 4 x i32> %div
}
@@ -60,7 +60,7 @@ define <vscale x 2 x i64> @sdiv_i64(<vscale x 2 x i64> %a) #0 {
; CHECK-NEXT: lsr z1.d, z0.d, #63
; CHECK-NEXT: add z0.d, z0.d, z1.d
; CHECK-NEXT: ret
- %div = sdiv <vscale x 2 x i64> %a, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> undef, i64 3, i32 0), <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer)
+ %div = sdiv <vscale x 2 x i64> %a, splat (i64 3)
ret <vscale x 2 x i64> %div
}
@@ -76,7 +76,7 @@ define <vscale x 16 x i8> @udiv_i8(<vscale x 16 x i8> %a) #0 {
; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: lsr z0.b, z0.b, #1
; CHECK-NEXT: ret
- %div = udiv <vscale x 16 x i8> %a, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> undef, i8 3, i32 0), <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer)
+ %div = udiv <vscale x 16 x i8> %a, splat (i8 3)
ret <vscale x 16 x i8> %div
}
@@ -89,7 +89,7 @@ define <vscale x 8 x i16> @udiv_i16(<vscale x 8 x i16> %a) #0 {
; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: lsr z0.h, z0.h, #1
; CHECK-NEXT: ret
- %div = udiv <vscale x 8 x i16> %a, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> undef, i16 3, i32 0), <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer)
+ %div = udiv <vscale x 8 x i16> %a, splat (i16 3)
ret <vscale x 8 x i16> %div
}
@@ -103,7 +103,7 @@ define <vscale x 4 x i32> @udiv_i32(<vscale x 4 x i32> %a) #0 {
; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: lsr z0.s, z0.s, #1
; CHECK-NEXT: ret
- %div = udiv <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 3, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer)
+ %div = udiv <vscale x 4 x i32> %a, splat (i32 3)
ret <vscale x 4 x i32> %div
}
@@ -117,7 +117,7 @@ define <vscale x 2 x i64> @udiv_i64(<vscale x 2 x i64> %a) #0 {
; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: lsr z0.d, z0.d, #1
; CHECK-NEXT: ret
- %div = udiv <vscale x 2 x i64> %a, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> undef, i64 3, i32 0), <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer)
+ %div = udiv <vscale x 2 x i64> %a, splat (i64 3)
ret <vscale x 2 x i64> %div
}
diff --git a/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll
index f6059d715a05..bdaea0ecf144 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll
@@ -33,11 +33,11 @@ define i64 @scalable_int_min_max(ptr %arg, ptr %arg1, <vscale x 2 x ptr> %i37, <
entry:
%i56 = getelementptr inbounds float, ptr %arg, i64 0
%i57 = load <vscale x 2 x float>, ptr %i56, align 4
- %i58 = fmul <vscale x 2 x float> %i57, shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 0x401D41D420000000, i64 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer)
- %i59 = fadd <vscale x 2 x float> %i58, shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 1.023500e+03, i64 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer)
+ %i58 = fmul <vscale x 2 x float> %i57, splat (float 0x401D41D420000000)
+ %i59 = fadd <vscale x 2 x float> %i58, splat (float 1.023500e+03)
%i60 = fptosi <vscale x 2 x float> %i59 to <vscale x 2 x i32>
%i61 = tail call <vscale x 2 x i32> @llvm.smax.nxv2i32(<vscale x 2 x i32> %i60, <vscale x 2 x i32> zeroinitializer)
- %i62 = tail call <vscale x 2 x i32> @llvm.smin.nxv2i32(<vscale x 2 x i32> %i61, <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1023, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer))
+ %i62 = tail call <vscale x 2 x i32> @llvm.smin.nxv2i32(<vscale x 2 x i32> %i61, <vscale x 2 x i32> splat (i32 1023))
%i63 = icmp ne <vscale x 2 x i32> %i62, zeroinitializer
%i64 = getelementptr float, ptr %arg1, i64 0
%i65 = tail call <vscale x 2 x float> @llvm.masked.load.nxv2f32.p0(ptr %i64, i32 4, <vscale x 2 x i1> %i63, <vscale x 2 x float> poison)
diff --git a/llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll b/llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll
index f7a963186c13..a40d55085279 100644
--- a/llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll
+++ b/llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll
@@ -95,7 +95,7 @@ define <vscale x 16 x i8> @narrow_i64_gather_index_i8_zext(ptr %out, ptr %in, <v
%wide.load = load <vscale x 16 x i8>, ptr %2, align 1
%3 = zext <vscale x 16 x i8> %wide.load to <vscale x 16 x i64>
%4 = getelementptr inbounds i8, ptr %in, <vscale x 16 x i64> %3
- %wide.masked.gather = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> %4, i32 1, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i32 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i8> undef)
+ %wide.masked.gather = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> %4, i32 1, <vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> undef)
ret <vscale x 16 x i8> %wide.masked.gather
}
@@ -121,7 +121,7 @@ define <vscale x 16 x i8> @narrow_i64_gather_index_i8_sext(ptr %out, ptr %in, <v
%wide.load = load <vscale x 16 x i8>, ptr %2, align 1
%3 = sext <vscale x 16 x i8> %wide.load to <vscale x 16 x i64>
%4 = getelementptr inbounds i8, ptr %in, <vscale x 16 x i64> %3
- %wide.masked.gather = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> %4, i32 1, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i32 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i8> undef)
+ %wide.masked.gather = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> %4, i32 1, <vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> undef)
ret <vscale x 16 x i8> %wide.masked.gather
}
@@ -141,7 +141,7 @@ define <vscale x 8 x i16> @narrow_i64_gather_index_i16_zext(ptr %out, ptr %in, <
%wide.load = load <vscale x 8 x i16>, ptr %2, align 1
%3 = zext <vscale x 8 x i16> %wide.load to <vscale x 8 x i64>
%4 = getelementptr inbounds i16, ptr %in, <vscale x 8 x i64> %3
- %wide.masked.gather = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0(<vscale x 8 x ptr> %4, i32 1, <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i32 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i16> undef)
+ %wide.masked.gather = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0(<vscale x 8 x ptr> %4, i32 1, <vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> undef)
ret <vscale x 8 x i16> %wide.masked.gather
}
@@ -161,7 +161,7 @@ define <vscale x 8 x i16> @narrow_i64_gather_index_i16_sext(ptr %out, ptr %in, <
%wide.load = load <vscale x 8 x i16>, ptr %2, align 1
%3 = sext <vscale x 8 x i16> %wide.load to <vscale x 8 x i64>
%4 = getelementptr inbounds i16, ptr %in, <vscale x 8 x i64> %3
- %wide.masked.gather = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0(<vscale x 8 x ptr> %4, i32 1, <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i32 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i16> undef)
+ %wide.masked.gather = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0(<vscale x 8 x ptr> %4, i32 1, <vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> undef)
ret <vscale x 8 x i16> %wide.masked.gather
}
@@ -177,7 +177,7 @@ define <vscale x 4 x i32> @no_narrow_i64_gather_index_i32(ptr %out, ptr %in, <vs
%wide.load = load <vscale x 4 x i32>, ptr %2, align 1
%3 = zext <vscale x 4 x i32> %wide.load to <vscale x 4 x i64>
%4 = getelementptr inbounds i32, ptr %in, <vscale x 4 x i64> %3
- %wide.masked.gather = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> %4, i32 1, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> undef)
+ %wide.masked.gather = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> %4, i32 1, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> undef)
ret <vscale x 4 x i32> %wide.masked.gather
}
@@ -192,7 +192,7 @@ define <vscale x 2 x i64> @no_narrow_i64_gather_index_i64(ptr %out, ptr %in, <vs
%2 = bitcast ptr %1 to ptr
%wide.load = load <vscale x 2 x i64>, ptr %2, align 1
%3 = getelementptr inbounds i64, ptr %in, <vscale x 2 x i64> %wide.load
- %wide.masked.gather = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> %3, i32 1, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> undef)
+ %wide.masked.gather = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> %3, i32 1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> undef)
ret <vscale x 2 x i64> %wide.masked.gather
}
diff --git a/llvm/test/CodeGen/AArch64/sve-hadd.ll b/llvm/test/CodeGen/AArch64/sve-hadd.ll
index 7936094af1c0..c73370d50287 100644
--- a/llvm/test/CodeGen/AArch64/sve-hadd.ll
+++ b/llvm/test/CodeGen/AArch64/sve-hadd.ll
@@ -22,7 +22,7 @@ entry:
%s0s = sext <vscale x 2 x i64> %s0 to <vscale x 2 x i128>
%s1s = sext <vscale x 2 x i64> %s1 to <vscale x 2 x i128>
%m = add nsw <vscale x 2 x i128> %s0s, %s1s
- %s = ashr <vscale x 2 x i128> %m, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer)
+ %s = ashr <vscale x 2 x i128> %m, splat (i128 1)
%s2 = trunc <vscale x 2 x i128> %s to <vscale x 2 x i64>
ret <vscale x 2 x i64> %s2
}
@@ -47,7 +47,7 @@ entry:
%s0s = sext <vscale x 2 x i64> %s0 to <vscale x 2 x i128>
%s1s = sext <vscale x 2 x i64> %s1 to <vscale x 2 x i128>
%m = add nsw <vscale x 2 x i128> %s0s, %s1s
- %s = lshr <vscale x 2 x i128> %m, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer)
+ %s = lshr <vscale x 2 x i128> %m, splat (i128 1)
%s2 = trunc <vscale x 2 x i128> %s to <vscale x 2 x i64>
ret <vscale x 2 x i64> %s2
}
@@ -72,7 +72,7 @@ entry:
%s0s = zext <vscale x 2 x i64> %s0 to <vscale x 2 x i128>
%s1s = zext <vscale x 2 x i64> %s1 to <vscale x 2 x i128>
%m = add nuw nsw <vscale x 2 x i128> %s0s, %s1s
- %s = lshr <vscale x 2 x i128> %m, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer)
+ %s = lshr <vscale x 2 x i128> %m, splat (i128 1)
%s2 = trunc <vscale x 2 x i128> %s to <vscale x 2 x i64>
ret <vscale x 2 x i64> %s2
}
@@ -97,7 +97,7 @@ entry:
%s0s = sext <vscale x 2 x i32> %s0 to <vscale x 2 x i64>
%s1s = sext <vscale x 2 x i32> %s1 to <vscale x 2 x i64>
%m = add nsw <vscale x 2 x i64> %s0s, %s1s
- %s = ashr <vscale x 2 x i64> %m, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %s = ashr <vscale x 2 x i64> %m, splat (i64 1)
%s2 = trunc <vscale x 2 x i64> %s to <vscale x 2 x i32>
ret <vscale x 2 x i32> %s2
}
@@ -114,7 +114,7 @@ entry:
%s0s = sext <vscale x 2 x i32> %s0 to <vscale x 2 x i64>
%s1s = sext <vscale x 2 x i32> %s1 to <vscale x 2 x i64>
%m = add nsw <vscale x 2 x i64> %s0s, %s1s
- %s = lshr <vscale x 2 x i64> %m, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %s = lshr <vscale x 2 x i64> %m, splat (i64 1)
%s2 = trunc <vscale x 2 x i64> %s to <vscale x 2 x i32>
ret <vscale x 2 x i32> %s2
}
@@ -138,7 +138,7 @@ entry:
%s0s = zext <vscale x 2 x i32> %s0 to <vscale x 2 x i64>
%s1s = zext <vscale x 2 x i32> %s1 to <vscale x 2 x i64>
%m = add nuw nsw <vscale x 2 x i64> %s0s, %s1s
- %s = lshr <vscale x 2 x i64> %m, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %s = lshr <vscale x 2 x i64> %m, splat (i64 1)
%s2 = trunc <vscale x 2 x i64> %s to <vscale x 2 x i32>
ret <vscale x 2 x i32> %s2
}
@@ -163,7 +163,7 @@ entry:
%s0s = sext <vscale x 4 x i32> %s0 to <vscale x 4 x i64>
%s1s = sext <vscale x 4 x i32> %s1 to <vscale x 4 x i64>
%m = add nsw <vscale x 4 x i64> %s0s, %s1s
- %s = ashr <vscale x 4 x i64> %m, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+ %s = ashr <vscale x 4 x i64> %m, splat (i64 1)
%s2 = trunc <vscale x 4 x i64> %s to <vscale x 4 x i32>
ret <vscale x 4 x i32> %s2
}
@@ -188,7 +188,7 @@ entry:
%s0s = sext <vscale x 4 x i32> %s0 to <vscale x 4 x i64>
%s1s = sext <vscale x 4 x i32> %s1 to <vscale x 4 x i64>
%m = add nsw <vscale x 4 x i64> %s0s, %s1s
- %s = lshr <vscale x 4 x i64> %m, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+ %s = lshr <vscale x 4 x i64> %m, splat (i64 1)
%s2 = trunc <vscale x 4 x i64> %s to <vscale x 4 x i32>
ret <vscale x 4 x i32> %s2
}
@@ -213,7 +213,7 @@ entry:
%s0s = zext <vscale x 4 x i32> %s0 to <vscale x 4 x i64>
%s1s = zext <vscale x 4 x i32> %s1 to <vscale x 4 x i64>
%m = add nuw nsw <vscale x 4 x i64> %s0s, %s1s
- %s = lshr <vscale x 4 x i64> %m, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+ %s = lshr <vscale x 4 x i64> %m, splat (i64 1)
%s2 = trunc <vscale x 4 x i64> %s to <vscale x 4 x i32>
ret <vscale x 4 x i32> %s2
}
@@ -239,7 +239,7 @@ entry:
%s0s = sext <vscale x 2 x i16> %s0 to <vscale x 2 x i32>
%s1s = sext <vscale x 2 x i16> %s1 to <vscale x 2 x i32>
%m = add nsw <vscale x 2 x i32> %s0s, %s1s
- %s = ashr <vscale x 2 x i32> %m, shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+ %s = ashr <vscale x 2 x i32> %m, splat (i32 1)
%s2 = trunc <vscale x 2 x i32> %s to <vscale x 2 x i16>
ret <vscale x 2 x i16> %s2
}
@@ -258,7 +258,7 @@ entry:
%s0s = sext <vscale x 2 x i16> %s0 to <vscale x 2 x i32>
%s1s = sext <vscale x 2 x i16> %s1 to <vscale x 2 x i32>
%m = add nsw <vscale x 2 x i32> %s0s, %s1s
- %s = lshr <vscale x 2 x i32> %m, shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+ %s = lshr <vscale x 2 x i32> %m, splat (i32 1)
%s2 = trunc <vscale x 2 x i32> %s to <vscale x 2 x i16>
ret <vscale x 2 x i16> %s2
}
@@ -283,7 +283,7 @@ entry:
%s0s = zext <vscale x 2 x i16> %s0 to <vscale x 2 x i32>
%s1s = zext <vscale x 2 x i16> %s1 to <vscale x 2 x i32>
%m = add nuw nsw <vscale x 2 x i32> %s0s, %s1s
- %s = lshr <vscale x 2 x i32> %m, shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+ %s = lshr <vscale x 2 x i32> %m, splat (i32 1)
%s2 = trunc <vscale x 2 x i32> %s to <vscale x 2 x i16>
ret <vscale x 2 x i16> %s2
}
@@ -309,7 +309,7 @@ entry:
%s0s = sext <vscale x 4 x i16> %s0 to <vscale x 4 x i32>
%s1s = sext <vscale x 4 x i16> %s1 to <vscale x 4 x i32>
%m = add nsw <vscale x 4 x i32> %s0s, %s1s
- %s = ashr <vscale x 4 x i32> %m, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %s = ashr <vscale x 4 x i32> %m, splat (i32 1)
%s2 = trunc <vscale x 4 x i32> %s to <vscale x 4 x i16>
ret <vscale x 4 x i16> %s2
}
@@ -327,7 +327,7 @@ entry:
%s0s = sext <vscale x 4 x i16> %s0 to <vscale x 4 x i32>
%s1s = sext <vscale x 4 x i16> %s1 to <vscale x 4 x i32>
%m = add nsw <vscale x 4 x i32> %s0s, %s1s
- %s = lshr <vscale x 4 x i32> %m, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %s = lshr <vscale x 4 x i32> %m, splat (i32 1)
%s2 = trunc <vscale x 4 x i32> %s to <vscale x 4 x i16>
ret <vscale x 4 x i16> %s2
}
@@ -352,7 +352,7 @@ entry:
%s0s = zext <vscale x 4 x i16> %s0 to <vscale x 4 x i32>
%s1s = zext <vscale x 4 x i16> %s1 to <vscale x 4 x i32>
%m = add nuw nsw <vscale x 4 x i32> %s0s, %s1s
- %s = lshr <vscale x 4 x i32> %m, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %s = lshr <vscale x 4 x i32> %m, splat (i32 1)
%s2 = trunc <vscale x 4 x i32> %s to <vscale x 4 x i16>
ret <vscale x 4 x i16> %s2
}
@@ -377,7 +377,7 @@ entry:
%s0s = sext <vscale x 8 x i16> %s0 to <vscale x 8 x i32>
%s1s = sext <vscale x 8 x i16> %s1 to <vscale x 8 x i32>
%m = add nsw <vscale x 8 x i32> %s0s, %s1s
- %s = ashr <vscale x 8 x i32> %m, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ %s = ashr <vscale x 8 x i32> %m, splat (i32 1)
%s2 = trunc <vscale x 8 x i32> %s to <vscale x 8 x i16>
ret <vscale x 8 x i16> %s2
}
@@ -402,7 +402,7 @@ entry:
%s0s = sext <vscale x 8 x i16> %s0 to <vscale x 8 x i32>
%s1s = sext <vscale x 8 x i16> %s1 to <vscale x 8 x i32>
%m = add nsw <vscale x 8 x i32> %s0s, %s1s
- %s = lshr <vscale x 8 x i32> %m, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ %s = lshr <vscale x 8 x i32> %m, splat (i32 1)
%s2 = trunc <vscale x 8 x i32> %s to <vscale x 8 x i16>
ret <vscale x 8 x i16> %s2
}
@@ -427,7 +427,7 @@ entry:
%s0s = zext <vscale x 8 x i16> %s0 to <vscale x 8 x i32>
%s1s = zext <vscale x 8 x i16> %s1 to <vscale x 8 x i32>
%m = add nuw nsw <vscale x 8 x i32> %s0s, %s1s
- %s = lshr <vscale x 8 x i32> %m, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ %s = lshr <vscale x 8 x i32> %m, splat (i32 1)
%s2 = trunc <vscale x 8 x i32> %s to <vscale x 8 x i16>
ret <vscale x 8 x i16> %s2
}
@@ -453,7 +453,7 @@ entry:
%s0s = sext <vscale x 4 x i8> %s0 to <vscale x 4 x i16>
%s1s = sext <vscale x 4 x i8> %s1 to <vscale x 4 x i16>
%m = add nsw <vscale x 4 x i16> %s0s, %s1s
- %s = ashr <vscale x 4 x i16> %m, shufflevector (<vscale x 4 x i16> insertelement (<vscale x 4 x i16> poison, i16 1, i32 0), <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer)
+ %s = ashr <vscale x 4 x i16> %m, splat (i16 1)
%s2 = trunc <vscale x 4 x i16> %s to <vscale x 4 x i8>
ret <vscale x 4 x i8> %s2
}
@@ -472,7 +472,7 @@ entry:
%s0s = sext <vscale x 4 x i8> %s0 to <vscale x 4 x i16>
%s1s = sext <vscale x 4 x i8> %s1 to <vscale x 4 x i16>
%m = add nsw <vscale x 4 x i16> %s0s, %s1s
- %s = lshr <vscale x 4 x i16> %m, shufflevector (<vscale x 4 x i16> insertelement (<vscale x 4 x i16> poison, i16 1, i32 0), <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer)
+ %s = lshr <vscale x 4 x i16> %m, splat (i16 1)
%s2 = trunc <vscale x 4 x i16> %s to <vscale x 4 x i8>
ret <vscale x 4 x i8> %s2
}
@@ -497,7 +497,7 @@ entry:
%s0s = zext <vscale x 4 x i8> %s0 to <vscale x 4 x i16>
%s1s = zext <vscale x 4 x i8> %s1 to <vscale x 4 x i16>
%m = add nuw nsw <vscale x 4 x i16> %s0s, %s1s
- %s = lshr <vscale x 4 x i16> %m, shufflevector (<vscale x 4 x i16> insertelement (<vscale x 4 x i16> poison, i16 1, i32 0), <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer)
+ %s = lshr <vscale x 4 x i16> %m, splat (i16 1)
%s2 = trunc <vscale x 4 x i16> %s to <vscale x 4 x i8>
ret <vscale x 4 x i8> %s2
}
@@ -523,7 +523,7 @@ entry:
%s0s = sext <vscale x 8 x i8> %s0 to <vscale x 8 x i16>
%s1s = sext <vscale x 8 x i8> %s1 to <vscale x 8 x i16>
%m = add nsw <vscale x 8 x i16> %s0s, %s1s
- %s = ashr <vscale x 8 x i16> %m, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %s = ashr <vscale x 8 x i16> %m, splat (i16 1)
%s2 = trunc <vscale x 8 x i16> %s to <vscale x 8 x i8>
ret <vscale x 8 x i8> %s2
}
@@ -541,7 +541,7 @@ entry:
%s0s = sext <vscale x 8 x i8> %s0 to <vscale x 8 x i16>
%s1s = sext <vscale x 8 x i8> %s1 to <vscale x 8 x i16>
%m = add nsw <vscale x 8 x i16> %s0s, %s1s
- %s = lshr <vscale x 8 x i16> %m, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %s = lshr <vscale x 8 x i16> %m, splat (i16 1)
%s2 = trunc <vscale x 8 x i16> %s to <vscale x 8 x i8>
ret <vscale x 8 x i8> %s2
}
@@ -566,7 +566,7 @@ entry:
%s0s = zext <vscale x 8 x i8> %s0 to <vscale x 8 x i16>
%s1s = zext <vscale x 8 x i8> %s1 to <vscale x 8 x i16>
%m = add nuw nsw <vscale x 8 x i16> %s0s, %s1s
- %s = lshr <vscale x 8 x i16> %m, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %s = lshr <vscale x 8 x i16> %m, splat (i16 1)
%s2 = trunc <vscale x 8 x i16> %s to <vscale x 8 x i8>
ret <vscale x 8 x i8> %s2
}
@@ -591,7 +591,7 @@ entry:
%s0s = sext <vscale x 16 x i8> %s0 to <vscale x 16 x i16>
%s1s = sext <vscale x 16 x i8> %s1 to <vscale x 16 x i16>
%m = add nsw <vscale x 16 x i16> %s0s, %s1s
- %s = ashr <vscale x 16 x i16> %m, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i32 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+ %s = ashr <vscale x 16 x i16> %m, splat (i16 1)
%s2 = trunc <vscale x 16 x i16> %s to <vscale x 16 x i8>
ret <vscale x 16 x i8> %s2
}
@@ -616,7 +616,7 @@ entry:
%s0s = sext <vscale x 16 x i8> %s0 to <vscale x 16 x i16>
%s1s = sext <vscale x 16 x i8> %s1 to <vscale x 16 x i16>
%m = add nsw <vscale x 16 x i16> %s0s, %s1s
- %s = lshr <vscale x 16 x i16> %m, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i32 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+ %s = lshr <vscale x 16 x i16> %m, splat (i16 1)
%s2 = trunc <vscale x 16 x i16> %s to <vscale x 16 x i8>
ret <vscale x 16 x i8> %s2
}
@@ -641,7 +641,7 @@ entry:
%s0s = zext <vscale x 16 x i8> %s0 to <vscale x 16 x i16>
%s1s = zext <vscale x 16 x i8> %s1 to <vscale x 16 x i16>
%m = add nuw nsw <vscale x 16 x i16> %s0s, %s1s
- %s = lshr <vscale x 16 x i16> %m, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i32 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+ %s = lshr <vscale x 16 x i16> %m, splat (i16 1)
%s2 = trunc <vscale x 16 x i16> %s to <vscale x 16 x i8>
ret <vscale x 16 x i8> %s2
}
@@ -665,9 +665,9 @@ define <vscale x 2 x i64> @rhadds_v2i64(<vscale x 2 x i64> %s0, <vscale x 2 x i6
entry:
%s0s = sext <vscale x 2 x i64> %s0 to <vscale x 2 x i128>
%s1s = sext <vscale x 2 x i64> %s1 to <vscale x 2 x i128>
- %add = add <vscale x 2 x i128> %s0s, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer)
+ %add = add <vscale x 2 x i128> %s0s, splat (i128 1)
%add2 = add <vscale x 2 x i128> %add, %s1s
- %s = ashr <vscale x 2 x i128> %add2, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer)
+ %s = ashr <vscale x 2 x i128> %add2, splat (i128 1)
%result = trunc <vscale x 2 x i128> %s to <vscale x 2 x i64>
ret <vscale x 2 x i64> %result
}
@@ -691,9 +691,9 @@ define <vscale x 2 x i64> @rhadds_v2i64_lsh(<vscale x 2 x i64> %s0, <vscale x 2
entry:
%s0s = sext <vscale x 2 x i64> %s0 to <vscale x 2 x i128>
%s1s = sext <vscale x 2 x i64> %s1 to <vscale x 2 x i128>
- %add = add <vscale x 2 x i128> %s0s, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer)
+ %add = add <vscale x 2 x i128> %s0s, splat (i128 1)
%add2 = add <vscale x 2 x i128> %add, %s1s
- %s = lshr <vscale x 2 x i128> %add2, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer)
+ %s = lshr <vscale x 2 x i128> %add2, splat (i128 1)
%result = trunc <vscale x 2 x i128> %s to <vscale x 2 x i64>
ret <vscale x 2 x i64> %result
}
@@ -717,9 +717,9 @@ define <vscale x 2 x i64> @rhaddu_v2i64(<vscale x 2 x i64> %s0, <vscale x 2 x i6
entry:
%s0s = zext <vscale x 2 x i64> %s0 to <vscale x 2 x i128>
%s1s = zext <vscale x 2 x i64> %s1 to <vscale x 2 x i128>
- %add = add nuw nsw <vscale x 2 x i128> %s0s, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer)
+ %add = add nuw nsw <vscale x 2 x i128> %s0s, splat (i128 1)
%add2 = add nuw nsw <vscale x 2 x i128> %add, %s1s
- %s = lshr <vscale x 2 x i128> %add2, shufflevector (<vscale x 2 x i128> insertelement (<vscale x 2 x i128> poison, i128 1, i32 0), <vscale x 2 x i128> poison, <vscale x 2 x i32> zeroinitializer)
+ %s = lshr <vscale x 2 x i128> %add2, splat (i128 1)
%result = trunc <vscale x 2 x i128> %s to <vscale x 2 x i64>
ret <vscale x 2 x i64> %result
}
@@ -746,9 +746,9 @@ define <vscale x 2 x i32> @rhadds_v2i32(<vscale x 2 x i32> %s0, <vscale x 2 x i3
entry:
%s0s = sext <vscale x 2 x i32> %s0 to <vscale x 2 x i64>
%s1s = sext <vscale x 2 x i32> %s1 to <vscale x 2 x i64>
- %add = add <vscale x 2 x i64> %s0s, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %add = add <vscale x 2 x i64> %s0s, splat (i64 1)
%add2 = add <vscale x 2 x i64> %add, %s1s
- %s = ashr <vscale x 2 x i64> %add2, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %s = ashr <vscale x 2 x i64> %add2, splat (i64 1)
%result = trunc <vscale x 2 x i64> %s to <vscale x 2 x i32>
ret <vscale x 2 x i32> %result
}
@@ -767,9 +767,9 @@ define <vscale x 2 x i32> @rhadds_v2i32_lsh(<vscale x 2 x i32> %s0, <vscale x 2
entry:
%s0s = sext <vscale x 2 x i32> %s0 to <vscale x 2 x i64>
%s1s = sext <vscale x 2 x i32> %s1 to <vscale x 2 x i64>
- %add = add <vscale x 2 x i64> %s0s, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %add = add <vscale x 2 x i64> %s0s, splat (i64 1)
%add2 = add <vscale x 2 x i64> %add, %s1s
- %s = lshr <vscale x 2 x i64> %add2, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %s = lshr <vscale x 2 x i64> %add2, splat (i64 1)
%result = trunc <vscale x 2 x i64> %s to <vscale x 2 x i32>
ret <vscale x 2 x i32> %result
}
@@ -795,9 +795,9 @@ define <vscale x 2 x i32> @rhaddu_v2i32(<vscale x 2 x i32> %s0, <vscale x 2 x i3
entry:
%s0s = zext <vscale x 2 x i32> %s0 to <vscale x 2 x i64>
%s1s = zext <vscale x 2 x i32> %s1 to <vscale x 2 x i64>
- %add = add nuw nsw <vscale x 2 x i64> %s0s, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %add = add nuw nsw <vscale x 2 x i64> %s0s, splat (i64 1)
%add2 = add nuw nsw <vscale x 2 x i64> %add, %s1s
- %s = lshr <vscale x 2 x i64> %add2, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %s = lshr <vscale x 2 x i64> %add2, splat (i64 1)
%result = trunc <vscale x 2 x i64> %s to <vscale x 2 x i32>
ret <vscale x 2 x i32> %result
}
@@ -821,9 +821,9 @@ define <vscale x 4 x i32> @rhadds_v4i32(<vscale x 4 x i32> %s0, <vscale x 4 x i3
entry:
%s0s = sext <vscale x 4 x i32> %s0 to <vscale x 4 x i64>
%s1s = sext <vscale x 4 x i32> %s1 to <vscale x 4 x i64>
- %add = add <vscale x 4 x i64> %s0s, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+ %add = add <vscale x 4 x i64> %s0s, splat (i64 1)
%add2 = add <vscale x 4 x i64> %add, %s1s
- %s = ashr <vscale x 4 x i64> %add2, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+ %s = ashr <vscale x 4 x i64> %add2, splat (i64 1)
%result = trunc <vscale x 4 x i64> %s to <vscale x 4 x i32>
ret <vscale x 4 x i32> %result
}
@@ -847,9 +847,9 @@ define <vscale x 4 x i32> @rhadds_v4i32_lsh(<vscale x 4 x i32> %s0, <vscale x 4
entry:
%s0s = sext <vscale x 4 x i32> %s0 to <vscale x 4 x i64>
%s1s = sext <vscale x 4 x i32> %s1 to <vscale x 4 x i64>
- %add = add <vscale x 4 x i64> %s0s, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+ %add = add <vscale x 4 x i64> %s0s, splat (i64 1)
%add2 = add <vscale x 4 x i64> %add, %s1s
- %s = lshr <vscale x 4 x i64> %add2, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+ %s = lshr <vscale x 4 x i64> %add2, splat (i64 1)
%result = trunc <vscale x 4 x i64> %s to <vscale x 4 x i32>
ret <vscale x 4 x i32> %result
}
@@ -873,9 +873,9 @@ define <vscale x 4 x i32> @rhaddu_v4i32(<vscale x 4 x i32> %s0, <vscale x 4 x i3
entry:
%s0s = zext <vscale x 4 x i32> %s0 to <vscale x 4 x i64>
%s1s = zext <vscale x 4 x i32> %s1 to <vscale x 4 x i64>
- %add = add nuw nsw <vscale x 4 x i64> %s0s, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+ %add = add nuw nsw <vscale x 4 x i64> %s0s, splat (i64 1)
%add2 = add nuw nsw <vscale x 4 x i64> %add, %s1s
- %s = lshr <vscale x 4 x i64> %add2, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+ %s = lshr <vscale x 4 x i64> %add2, splat (i64 1)
%result = trunc <vscale x 4 x i64> %s to <vscale x 4 x i32>
ret <vscale x 4 x i32> %result
}
@@ -894,9 +894,9 @@ define <vscale x 2 x i16> @rhadds_v2i16(<vscale x 2 x i16> %s0, <vscale x 2 x i1
entry:
%s0s = sext <vscale x 2 x i16> %s0 to <vscale x 2 x i32>
%s1s = sext <vscale x 2 x i16> %s1 to <vscale x 2 x i32>
- %add = add <vscale x 2 x i32> %s0s, shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+ %add = add <vscale x 2 x i32> %s0s, splat (i32 1)
%add2 = add <vscale x 2 x i32> %add, %s1s
- %s = ashr <vscale x 2 x i32> %add2, shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+ %s = ashr <vscale x 2 x i32> %add2, splat (i32 1)
%result = trunc <vscale x 2 x i32> %s to <vscale x 2 x i16>
ret <vscale x 2 x i16> %result
}
@@ -916,9 +916,9 @@ define <vscale x 2 x i16> @rhadds_v2i16_lsh(<vscale x 2 x i16> %s0, <vscale x 2
entry:
%s0s = sext <vscale x 2 x i16> %s0 to <vscale x 2 x i32>
%s1s = sext <vscale x 2 x i16> %s1 to <vscale x 2 x i32>
- %add = add <vscale x 2 x i32> %s0s, shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+ %add = add <vscale x 2 x i32> %s0s, splat (i32 1)
%add2 = add <vscale x 2 x i32> %add, %s1s
- %s = lshr <vscale x 2 x i32> %add2, shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+ %s = lshr <vscale x 2 x i32> %add2, splat (i32 1)
%result = trunc <vscale x 2 x i32> %s to <vscale x 2 x i16>
ret <vscale x 2 x i16> %result
}
@@ -944,9 +944,9 @@ define <vscale x 2 x i16> @rhaddu_v2i16(<vscale x 2 x i16> %s0, <vscale x 2 x i1
entry:
%s0s = zext <vscale x 2 x i16> %s0 to <vscale x 2 x i32>
%s1s = zext <vscale x 2 x i16> %s1 to <vscale x 2 x i32>
- %add = add nuw nsw <vscale x 2 x i32> %s0s, shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+ %add = add nuw nsw <vscale x 2 x i32> %s0s, splat (i32 1)
%add2 = add nuw nsw <vscale x 2 x i32> %add, %s1s
- %s = lshr <vscale x 2 x i32> %add2, shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+ %s = lshr <vscale x 2 x i32> %add2, splat (i32 1)
%result = trunc <vscale x 2 x i32> %s to <vscale x 2 x i16>
ret <vscale x 2 x i16> %result
}
@@ -973,9 +973,9 @@ define <vscale x 4 x i16> @rhadds_v4i16(<vscale x 4 x i16> %s0, <vscale x 4 x i1
entry:
%s0s = sext <vscale x 4 x i16> %s0 to <vscale x 4 x i32>
%s1s = sext <vscale x 4 x i16> %s1 to <vscale x 4 x i32>
- %add = add <vscale x 4 x i32> %s0s, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %add = add <vscale x 4 x i32> %s0s, splat (i32 1)
%add2 = add <vscale x 4 x i32> %add, %s1s
- %s = ashr <vscale x 4 x i32> %add2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %s = ashr <vscale x 4 x i32> %add2, splat (i32 1)
%result = trunc <vscale x 4 x i32> %s to <vscale x 4 x i16>
ret <vscale x 4 x i16> %result
}
@@ -994,9 +994,9 @@ define <vscale x 4 x i16> @rhadds_v4i16_lsh(<vscale x 4 x i16> %s0, <vscale x 4
entry:
%s0s = sext <vscale x 4 x i16> %s0 to <vscale x 4 x i32>
%s1s = sext <vscale x 4 x i16> %s1 to <vscale x 4 x i32>
- %add = add <vscale x 4 x i32> %s0s, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %add = add <vscale x 4 x i32> %s0s, splat (i32 1)
%add2 = add <vscale x 4 x i32> %add, %s1s
- %s = lshr <vscale x 4 x i32> %add2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %s = lshr <vscale x 4 x i32> %add2, splat (i32 1)
%result = trunc <vscale x 4 x i32> %s to <vscale x 4 x i16>
ret <vscale x 4 x i16> %result
}
@@ -1022,9 +1022,9 @@ define <vscale x 4 x i16> @rhaddu_v4i16(<vscale x 4 x i16> %s0, <vscale x 4 x i1
entry:
%s0s = zext <vscale x 4 x i16> %s0 to <vscale x 4 x i32>
%s1s = zext <vscale x 4 x i16> %s1 to <vscale x 4 x i32>
- %add = add nuw nsw <vscale x 4 x i32> %s0s, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %add = add nuw nsw <vscale x 4 x i32> %s0s, splat (i32 1)
%add2 = add nuw nsw <vscale x 4 x i32> %add, %s1s
- %s = lshr <vscale x 4 x i32> %add2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %s = lshr <vscale x 4 x i32> %add2, splat (i32 1)
%result = trunc <vscale x 4 x i32> %s to <vscale x 4 x i16>
ret <vscale x 4 x i16> %result
}
@@ -1048,9 +1048,9 @@ define <vscale x 8 x i16> @rhadds_v8i16(<vscale x 8 x i16> %s0, <vscale x 8 x i1
entry:
%s0s = sext <vscale x 8 x i16> %s0 to <vscale x 8 x i32>
%s1s = sext <vscale x 8 x i16> %s1 to <vscale x 8 x i32>
- %add = add <vscale x 8 x i32> %s0s, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ %add = add <vscale x 8 x i32> %s0s, splat (i32 1)
%add2 = add <vscale x 8 x i32> %add, %s1s
- %s = ashr <vscale x 8 x i32> %add2, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ %s = ashr <vscale x 8 x i32> %add2, splat (i32 1)
%result = trunc <vscale x 8 x i32> %s to <vscale x 8 x i16>
ret <vscale x 8 x i16> %result
}
@@ -1074,9 +1074,9 @@ define <vscale x 8 x i16> @rhadds_v8i16_lsh(<vscale x 8 x i16> %s0, <vscale x 8
entry:
%s0s = sext <vscale x 8 x i16> %s0 to <vscale x 8 x i32>
%s1s = sext <vscale x 8 x i16> %s1 to <vscale x 8 x i32>
- %add = add <vscale x 8 x i32> %s0s, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ %add = add <vscale x 8 x i32> %s0s, splat (i32 1)
%add2 = add <vscale x 8 x i32> %add, %s1s
- %s = lshr <vscale x 8 x i32> %add2, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ %s = lshr <vscale x 8 x i32> %add2, splat (i32 1)
%result = trunc <vscale x 8 x i32> %s to <vscale x 8 x i16>
ret <vscale x 8 x i16> %result
}
@@ -1100,9 +1100,9 @@ define <vscale x 8 x i16> @rhaddu_v8i16(<vscale x 8 x i16> %s0, <vscale x 8 x i1
entry:
%s0s = zext <vscale x 8 x i16> %s0 to <vscale x 8 x i32>
%s1s = zext <vscale x 8 x i16> %s1 to <vscale x 8 x i32>
- %add = add nuw nsw <vscale x 8 x i32> %s0s, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ %add = add nuw nsw <vscale x 8 x i32> %s0s, splat (i32 1)
%add2 = add nuw nsw <vscale x 8 x i32> %add, %s1s
- %s = lshr <vscale x 8 x i32> %add2, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ %s = lshr <vscale x 8 x i32> %add2, splat (i32 1)
%result = trunc <vscale x 8 x i32> %s to <vscale x 8 x i16>
ret <vscale x 8 x i16> %result
}
@@ -1121,9 +1121,9 @@ define <vscale x 4 x i8> @rhadds_v4i8(<vscale x 4 x i8> %s0, <vscale x 4 x i8> %
entry:
%s0s = sext <vscale x 4 x i8> %s0 to <vscale x 4 x i16>
%s1s = sext <vscale x 4 x i8> %s1 to <vscale x 4 x i16>
- %add = add <vscale x 4 x i16> %s0s, shufflevector (<vscale x 4 x i16> insertelement (<vscale x 4 x i16> poison, i16 1, i32 0), <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer)
+ %add = add <vscale x 4 x i16> %s0s, splat (i16 1)
%add2 = add <vscale x 4 x i16> %add, %s1s
- %s = ashr <vscale x 4 x i16> %add2, shufflevector (<vscale x 4 x i16> insertelement (<vscale x 4 x i16> poison, i16 1, i32 0), <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer)
+ %s = ashr <vscale x 4 x i16> %add2, splat (i16 1)
%result = trunc <vscale x 4 x i16> %s to <vscale x 4 x i8>
ret <vscale x 4 x i8> %result
}
@@ -1143,9 +1143,9 @@ define <vscale x 4 x i8> @rhadds_v4i8_lsh(<vscale x 4 x i8> %s0, <vscale x 4 x i
entry:
%s0s = sext <vscale x 4 x i8> %s0 to <vscale x 4 x i16>
%s1s = sext <vscale x 4 x i8> %s1 to <vscale x 4 x i16>
- %add = add <vscale x 4 x i16> %s0s, shufflevector (<vscale x 4 x i16> insertelement (<vscale x 4 x i16> poison, i16 1, i32 0), <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer)
+ %add = add <vscale x 4 x i16> %s0s, splat (i16 1)
%add2 = add <vscale x 4 x i16> %add, %s1s
- %s = lshr <vscale x 4 x i16> %add2, shufflevector (<vscale x 4 x i16> insertelement (<vscale x 4 x i16> poison, i16 1, i32 0), <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer)
+ %s = lshr <vscale x 4 x i16> %add2, splat (i16 1)
%result = trunc <vscale x 4 x i16> %s to <vscale x 4 x i8>
ret <vscale x 4 x i8> %result
}
@@ -1171,9 +1171,9 @@ define <vscale x 4 x i8> @rhaddu_v4i8(<vscale x 4 x i8> %s0, <vscale x 4 x i8> %
entry:
%s0s = zext <vscale x 4 x i8> %s0 to <vscale x 4 x i16>
%s1s = zext <vscale x 4 x i8> %s1 to <vscale x 4 x i16>
- %add = add nuw nsw <vscale x 4 x i16> %s0s, shufflevector (<vscale x 4 x i16> insertelement (<vscale x 4 x i16> poison, i16 1, i32 0), <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer)
+ %add = add nuw nsw <vscale x 4 x i16> %s0s, splat (i16 1)
%add2 = add nuw nsw <vscale x 4 x i16> %add, %s1s
- %s = lshr <vscale x 4 x i16> %add2, shufflevector (<vscale x 4 x i16> insertelement (<vscale x 4 x i16> poison, i16 1, i32 0), <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer)
+ %s = lshr <vscale x 4 x i16> %add2, splat (i16 1)
%result = trunc <vscale x 4 x i16> %s to <vscale x 4 x i8>
ret <vscale x 4 x i8> %result
}
@@ -1200,9 +1200,9 @@ define <vscale x 8 x i8> @rhadds_v8i8(<vscale x 8 x i8> %s0, <vscale x 8 x i8> %
entry:
%s0s = sext <vscale x 8 x i8> %s0 to <vscale x 8 x i16>
%s1s = sext <vscale x 8 x i8> %s1 to <vscale x 8 x i16>
- %add = add <vscale x 8 x i16> %s0s, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %add = add <vscale x 8 x i16> %s0s, splat (i16 1)
%add2 = add <vscale x 8 x i16> %add, %s1s
- %s = ashr <vscale x 8 x i16> %add2, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %s = ashr <vscale x 8 x i16> %add2, splat (i16 1)
%result = trunc <vscale x 8 x i16> %s to <vscale x 8 x i8>
ret <vscale x 8 x i8> %result
}
@@ -1221,9 +1221,9 @@ define <vscale x 8 x i8> @rhadds_v8i8_lsh(<vscale x 8 x i8> %s0, <vscale x 8 x i
entry:
%s0s = sext <vscale x 8 x i8> %s0 to <vscale x 8 x i16>
%s1s = sext <vscale x 8 x i8> %s1 to <vscale x 8 x i16>
- %add = add <vscale x 8 x i16> %s0s, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %add = add <vscale x 8 x i16> %s0s, splat (i16 1)
%add2 = add <vscale x 8 x i16> %add, %s1s
- %s = lshr <vscale x 8 x i16> %add2, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %s = lshr <vscale x 8 x i16> %add2, splat (i16 1)
%result = trunc <vscale x 8 x i16> %s to <vscale x 8 x i8>
ret <vscale x 8 x i8> %result
}
@@ -1249,9 +1249,9 @@ define <vscale x 8 x i8> @rhaddu_v8i8(<vscale x 8 x i8> %s0, <vscale x 8 x i8> %
entry:
%s0s = zext <vscale x 8 x i8> %s0 to <vscale x 8 x i16>
%s1s = zext <vscale x 8 x i8> %s1 to <vscale x 8 x i16>
- %add = add nuw nsw <vscale x 8 x i16> %s0s, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %add = add nuw nsw <vscale x 8 x i16> %s0s, splat (i16 1)
%add2 = add nuw nsw <vscale x 8 x i16> %add, %s1s
- %s = lshr <vscale x 8 x i16> %add2, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %s = lshr <vscale x 8 x i16> %add2, splat (i16 1)
%result = trunc <vscale x 8 x i16> %s to <vscale x 8 x i8>
ret <vscale x 8 x i8> %result
}
@@ -1275,9 +1275,9 @@ define <vscale x 16 x i8> @rhadds_v16i8(<vscale x 16 x i8> %s0, <vscale x 16 x i
entry:
%s0s = sext <vscale x 16 x i8> %s0 to <vscale x 16 x i16>
%s1s = sext <vscale x 16 x i8> %s1 to <vscale x 16 x i16>
- %add = add <vscale x 16 x i16> %s0s, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i32 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+ %add = add <vscale x 16 x i16> %s0s, splat (i16 1)
%add2 = add <vscale x 16 x i16> %add, %s1s
- %s = ashr <vscale x 16 x i16> %add2, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i32 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+ %s = ashr <vscale x 16 x i16> %add2, splat (i16 1)
%result = trunc <vscale x 16 x i16> %s to <vscale x 16 x i8>
ret <vscale x 16 x i8> %result
}
@@ -1301,9 +1301,9 @@ define <vscale x 16 x i8> @rhadds_v16i8_lsh(<vscale x 16 x i8> %s0, <vscale x 16
entry:
%s0s = sext <vscale x 16 x i8> %s0 to <vscale x 16 x i16>
%s1s = sext <vscale x 16 x i8> %s1 to <vscale x 16 x i16>
- %add = add <vscale x 16 x i16> %s0s, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i32 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+ %add = add <vscale x 16 x i16> %s0s, splat (i16 1)
%add2 = add <vscale x 16 x i16> %add, %s1s
- %s = lshr <vscale x 16 x i16> %add2, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i32 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+ %s = lshr <vscale x 16 x i16> %add2, splat (i16 1)
%result = trunc <vscale x 16 x i16> %s to <vscale x 16 x i8>
ret <vscale x 16 x i8> %result
}
@@ -1327,9 +1327,9 @@ define <vscale x 16 x i8> @rhaddu_v16i8(<vscale x 16 x i8> %s0, <vscale x 16 x i
entry:
%s0s = zext <vscale x 16 x i8> %s0 to <vscale x 16 x i16>
%s1s = zext <vscale x 16 x i8> %s1 to <vscale x 16 x i16>
- %add = add nuw nsw <vscale x 16 x i16> %s0s, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i32 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+ %add = add nuw nsw <vscale x 16 x i16> %s0s, splat (i16 1)
%add2 = add nuw nsw <vscale x 16 x i16> %add, %s1s
- %s = lshr <vscale x 16 x i16> %add2, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i32 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+ %s = lshr <vscale x 16 x i16> %add2, splat (i16 1)
%result = trunc <vscale x 16 x i16> %s to <vscale x 16 x i8>
ret <vscale x 16 x i8> %result
}
diff --git a/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll
index 3ccbd5884740..c0ddceb42e1d 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll
@@ -771,7 +771,7 @@ define <vscale x 4 x i32> @sdiv_const(<vscale x 4 x i32> %a) #0 {
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
entry:
- %div = sdiv <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 3, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer)
+ %div = sdiv <vscale x 4 x i32> %a, splat (i32 3)
ret <vscale x 4 x i32> %div
}
@@ -783,7 +783,7 @@ define <vscale x 4 x i32> @udiv_const(<vscale x 4 x i32> %a) #0 {
; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
entry:
- %div = udiv <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 3, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer)
+ %div = udiv <vscale x 4 x i32> %a, splat (i32 3)
ret <vscale x 4 x i32> %div
}
@@ -795,9 +795,9 @@ define <vscale x 8 x i16> @uqsub(<vscale x 8 x i16> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: uqsub z0.h, z0.h, #32768 // =0x8000
; CHECK-NEXT: ret
- %cmp = icmp slt <vscale x 8 x i16> %a, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> undef, i16 0, i32 0), <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer)
- %sub = xor <vscale x 8 x i16> %a, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> undef, i16 -32768, i32 0), <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer)
- %sel = select <vscale x 8 x i1> %cmp, <vscale x 8 x i16> %sub, <vscale x 8 x i16> shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> undef, i16 0, i32 0), <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer)
+ %cmp = icmp slt <vscale x 8 x i16> %a, zeroinitializer
+ %sub = xor <vscale x 8 x i16> %a, splat (i16 -32768)
+ %sel = select <vscale x 8 x i1> %cmp, <vscale x 8 x i16> %sub, <vscale x 8 x i16> zeroinitializer
ret <vscale x 8 x i16> %sel
}
diff --git a/llvm/test/CodeGen/AArch64/sve-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-int-arith.ll
index cb2b2f34ca5e..fc2672f8c80a 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-arith.ll
@@ -538,7 +538,7 @@ define <vscale x 2 x i64> @mls_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b,
; CHECK-NEXT: ret
{
%1 = mul <vscale x 2 x i64> %a, %b
- %2 = add <vscale x 2 x i64> %1, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 4294967295, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %2 = add <vscale x 2 x i64> %1, splat (i64 4294967295)
ret <vscale x 2 x i64> %2
}
@@ -551,7 +551,7 @@ define <vscale x 2 x i64> @muladd_i64_negativeAddend(<vscale x 2 x i64> %a, <vsc
; CHECK-NEXT: ret
{
%1 = mul <vscale x 2 x i64> %a, %b
- %2 = add <vscale x 2 x i64> %1, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 -4294967295, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %2 = add <vscale x 2 x i64> %1, splat (i64 -4294967295)
ret <vscale x 2 x i64> %2
}
@@ -565,7 +565,7 @@ define <vscale x 4 x i32> @muladd_i32_positiveAddend(<vscale x 4 x i32> %a, <vsc
; CHECK-NEXT: ret
{
%1 = mul <vscale x 4 x i32> %a, %b
- %2 = add <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 65536, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %2 = add <vscale x 4 x i32> %1, splat (i32 65536)
ret <vscale x 4 x i32> %2
}
@@ -578,7 +578,7 @@ define <vscale x 4 x i32> @muladd_i32_negativeAddend(<vscale x 4 x i32> %a, <vsc
; CHECK-NEXT: ret
{
%1 = mul <vscale x 4 x i32> %a, %b
- %2 = add <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -65536, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %2 = add <vscale x 4 x i32> %1, splat (i32 -65536)
ret <vscale x 4 x i32> %2
}
@@ -591,7 +591,7 @@ define <vscale x 8 x i16> @muladd_i16_positiveAddend(<vscale x 8 x i16> %a, <vsc
; CHECK-NEXT: ret
{
%1 = mul <vscale x 8 x i16> %a, %b
- %2 = add <vscale x 8 x i16> %1, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 255, i16 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %2 = add <vscale x 8 x i16> %1, splat (i16 255)
ret <vscale x 8 x i16> %2
}
@@ -604,7 +604,7 @@ define <vscale x 8 x i16> @muladd_i16_negativeAddend(<vscale x 8 x i16> %a, <vsc
; CHECK-NEXT: ret
{
%1 = mul <vscale x 8 x i16> %a, %b
- %2 = add <vscale x 8 x i16> %1, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 -255, i16 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %2 = add <vscale x 8 x i16> %1, splat (i16 -255)
ret <vscale x 8 x i16> %2
}
@@ -617,7 +617,7 @@ define <vscale x 16 x i8> @muladd_i8_positiveAddend(<vscale x 16 x i8> %a, <vsca
; CHECK-NEXT: ret
{
%1 = mul <vscale x 16 x i8> %a, %b
- %2 = add <vscale x 16 x i8> %1, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 15, i8 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+ %2 = add <vscale x 16 x i8> %1, splat (i8 15)
ret <vscale x 16 x i8> %2
}
@@ -630,7 +630,7 @@ define <vscale x 16 x i8> @muladd_i8_negativeAddend(<vscale x 16 x i8> %a, <vsca
; CHECK-NEXT: ret
{
%1 = mul <vscale x 16 x i8> %a, %b
- %2 = add <vscale x 16 x i8> %1, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 -15, i8 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+ %2 = add <vscale x 16 x i8> %1, splat (i8 -15)
ret <vscale x 16 x i8> %2
}
@@ -644,7 +644,7 @@ define <vscale x 2 x i64> @mulsub_i64_positiveAddend(<vscale x 2 x i64> %a, <vsc
; CHECK-NEXT: ret
{
%1 = mul <vscale x 2 x i64> %a, %b
- %2 = sub <vscale x 2 x i64> %1, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 4294967295, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %2 = sub <vscale x 2 x i64> %1, splat (i64 4294967295)
ret <vscale x 2 x i64> %2
}
@@ -658,7 +658,7 @@ define <vscale x 2 x i64> @mulsub_i64_negativeAddend(<vscale x 2 x i64> %a, <vsc
; CHECK-NEXT: ret
{
%1 = mul <vscale x 2 x i64> %a, %b
- %2 = sub <vscale x 2 x i64> %1, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 -4294967295, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %2 = sub <vscale x 2 x i64> %1, splat (i64 -4294967295)
ret <vscale x 2 x i64> %2
}
@@ -673,7 +673,7 @@ define <vscale x 4 x i32> @mulsub_i32_positiveAddend(<vscale x 4 x i32> %a, <vsc
; CHECK-NEXT: ret
{
%1 = mul <vscale x 4 x i32> %a, %b
- %2 = sub <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 65536, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %2 = sub <vscale x 4 x i32> %1, splat (i32 65536)
ret <vscale x 4 x i32> %2
}
@@ -687,7 +687,7 @@ define <vscale x 4 x i32> @mulsub_i32_negativeAddend(<vscale x 4 x i32> %a, <vsc
; CHECK-NEXT: ret
{
%1 = mul <vscale x 4 x i32> %a, %b
- %2 = sub <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -65536, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %2 = sub <vscale x 4 x i32> %1, splat (i32 -65536)
ret <vscale x 4 x i32> %2
}
@@ -700,7 +700,7 @@ define <vscale x 8 x i16> @mulsub_i16_positiveAddend(<vscale x 8 x i16> %a, <vsc
; CHECK-NEXT: ret
{
%1 = mul <vscale x 8 x i16> %a, %b
- %2 = sub <vscale x 8 x i16> %1, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 255, i16 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %2 = sub <vscale x 8 x i16> %1, splat (i16 255)
ret <vscale x 8 x i16> %2
}
@@ -714,7 +714,7 @@ define <vscale x 8 x i16> @mulsub_i16_negativeAddend(<vscale x 8 x i16> %a, <vsc
; CHECK-NEXT: ret
{
%1 = mul <vscale x 8 x i16> %a, %b
- %2 = sub <vscale x 8 x i16> %1, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 -255, i16 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %2 = sub <vscale x 8 x i16> %1, splat (i16 -255)
ret <vscale x 8 x i16> %2
}
@@ -727,7 +727,7 @@ define <vscale x 16 x i8> @mulsub_i8_positiveAddend(<vscale x 16 x i8> %a, <vsca
; CHECK-NEXT: ret
{
%1 = mul <vscale x 16 x i8> %a, %b
- %2 = sub <vscale x 16 x i8> %1, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 15, i8 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+ %2 = sub <vscale x 16 x i8> %1, splat (i8 15)
ret <vscale x 16 x i8> %2
}
@@ -740,7 +740,7 @@ define <vscale x 16 x i8> @mulsub_i8_negativeAddend(<vscale x 16 x i8> %a, <vsca
; CHECK-NEXT: ret
{
%1 = mul <vscale x 16 x i8> %a, %b
- %2 = sub <vscale x 16 x i8> %1, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 -15, i8 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+ %2 = sub <vscale x 16 x i8> %1, splat (i8 -15)
ret <vscale x 16 x i8> %2
}
@@ -757,7 +757,7 @@ define <vscale x 8 x i16> @multiple_fused_ops(<vscale x 8 x i16> %a, <vscale x 8
; CHECK-NEXT: ret
{
%1 = mul <vscale x 8 x i16> %a, %b
- %2 = add <vscale x 8 x i16> %1, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 200, i16 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %2 = add <vscale x 8 x i16> %1, splat (i16 200)
%3 = mul <vscale x 8 x i16> %2, %a
%4 = sub <vscale x 8 x i16> %3, %b
ret <vscale x 8 x i16> %4
@@ -805,7 +805,7 @@ vector.body: ; preds = %vector.body, %for.b
%3 = getelementptr inbounds i32, ptr %src2, i64 %index
%wide.masked.load12 = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %3, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
%4 = mul nsw <vscale x 4 x i32> %wide.masked.load12, %wide.masked.load
- %5 = add nsw <vscale x 4 x i32> %4, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %5 = add nsw <vscale x 4 x i32> %4, splat (i32 1)
%6 = getelementptr inbounds i32, ptr %dst, i64 %index
tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %5, ptr %6, i32 4, <vscale x 4 x i1> %active.lane.mask)
%index.next = add i64 %index, %1
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll
index 82bf756f8228..c7c102f5d567 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme < %s | FileCheck %s
@@ -150,6 +150,46 @@ define <vscale x 16 x i1> @chained_reinterpret() {
ret <vscale x 16 x i1> %out
}
+define <vscale x 16 x i1> @reinterpret_scalar_bool_h(i1 %x){
+; CHECK-LABEL: reinterpret_scalar_bool_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sbfx x8, x0, #0, #1
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ret
+ %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %x, i64 0
+ %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %out = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %.splat)
+ ret <vscale x 16 x i1> %out
+}
+
+define <vscale x 16 x i1> @reinterpret_scalar_bool_s(i1 %x){
+; CHECK-LABEL: reinterpret_scalar_bool_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sbfx x8, x0, #0, #1
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ret
+ %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %x, i64 0
+ %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %out = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %.splat)
+ ret <vscale x 16 x i1> %out
+}
+
+define <vscale x 16 x i1> @reinterpret_scalar_bool_q(i1 %x){
+; CHECK-LABEL: reinterpret_scalar_bool_q:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sbfx x8, x0, #0, #1
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ret
+ %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %x, i64 0
+ %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %out = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %.splat)
+ ret <vscale x 16 x i1> %out
+}
+
+
declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg)
declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 immarg)
declare <vscale x 8 x i1> @llvm.aarch64.sve.cmpgt.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
diff --git a/llvm/test/CodeGen/AArch64/sve-knownbits.ll b/llvm/test/CodeGen/AArch64/sve-knownbits.ll
index 2346d6599547..ac391948d6bd 100644
--- a/llvm/test/CodeGen/AArch64/sve-knownbits.ll
+++ b/llvm/test/CodeGen/AArch64/sve-knownbits.ll
@@ -6,8 +6,8 @@ define <vscale x 8 x i16> @test_knownzero(<vscale x 8 x i16> %x) {
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.h, #0 // =0x0
; CHECK-NEXT: ret
- %a1 = shl <vscale x 8 x i16> %x, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 8, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
- %a2 = and <vscale x 8 x i16> %a1, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 8, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %a1 = shl <vscale x 8 x i16> %x, splat (i16 8)
+ %a2 = and <vscale x 8 x i16> %a1, splat (i16 8)
ret <vscale x 8 x i16> %a2
}
diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll
index 539f443de18a..600e9c4805ff 100644
--- a/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll
+++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll
@@ -281,7 +281,7 @@ define <vscale x 4 x i32> @andnot_v4i32(<vscale x 4 x i32> %z, <vscale x 4 x i32
; CHECK-NEXT: ret
entry:
%c = icmp eq <vscale x 4 x i32> %z, zeroinitializer
- %y1 = xor <vscale x 4 x i32> %y, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %y1 = xor <vscale x 4 x i32> %y, splat (i32 -1)
%a = and <vscale x 4 x i32> %x, %y1
%b = select <vscale x 4 x i1> %c, <vscale x 4 x i32> %a, <vscale x 4 x i32> %z
ret <vscale x 4 x i32> %b
@@ -297,7 +297,7 @@ define <vscale x 8 x i16> @andnot_v8i16(<vscale x 8 x i16> %z, <vscale x 8 x i16
; CHECK-NEXT: ret
entry:
%c = icmp eq <vscale x 8 x i16> %z, zeroinitializer
- %y1 = xor <vscale x 8 x i16> %y, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 -1, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %y1 = xor <vscale x 8 x i16> %y, splat (i16 -1)
%a = and <vscale x 8 x i16> %x, %y1
%b = select <vscale x 8 x i1> %c, <vscale x 8 x i16> %a, <vscale x 8 x i16> %z
ret <vscale x 8 x i16> %b
@@ -313,7 +313,7 @@ define <vscale x 16 x i8> @andnot_v16i8(<vscale x 16 x i8> %z, <vscale x 16 x i8
; CHECK-NEXT: ret
entry:
%c = icmp eq <vscale x 16 x i8> %z, zeroinitializer
- %y1 = xor <vscale x 16 x i8> %y, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 -1, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+ %y1 = xor <vscale x 16 x i8> %y, splat (i8 -1)
%a = and <vscale x 16 x i8> %x, %y1
%b = select <vscale x 16 x i1> %c, <vscale x 16 x i8> %a, <vscale x 16 x i8> %z
ret <vscale x 16 x i8> %b
@@ -331,7 +331,7 @@ define <vscale x 4 x i32> @ornot_v4i32(<vscale x 4 x i32> %z, <vscale x 4 x i32>
; CHECK-NEXT: ret
entry:
%c = icmp eq <vscale x 4 x i32> %z, zeroinitializer
- %y1 = xor <vscale x 4 x i32> %y, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %y1 = xor <vscale x 4 x i32> %y, splat (i32 -1)
%a = or <vscale x 4 x i32> %x, %y1
%b = select <vscale x 4 x i1> %c, <vscale x 4 x i32> %a, <vscale x 4 x i32> %z
ret <vscale x 4 x i32> %b
@@ -349,7 +349,7 @@ define <vscale x 8 x i16> @ornot_v8i16(<vscale x 8 x i16> %z, <vscale x 8 x i16>
; CHECK-NEXT: ret
entry:
%c = icmp eq <vscale x 8 x i16> %z, zeroinitializer
- %y1 = xor <vscale x 8 x i16> %y, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 -1, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %y1 = xor <vscale x 8 x i16> %y, splat (i16 -1)
%a = or <vscale x 8 x i16> %x, %y1
%b = select <vscale x 8 x i1> %c, <vscale x 8 x i16> %a, <vscale x 8 x i16> %z
ret <vscale x 8 x i16> %b
@@ -367,7 +367,7 @@ define <vscale x 16 x i8> @ornot_v16i8(<vscale x 16 x i8> %z, <vscale x 16 x i8>
; CHECK-NEXT: ret
entry:
%c = icmp eq <vscale x 16 x i8> %z, zeroinitializer
- %y1 = xor <vscale x 16 x i8> %y, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 -1, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+ %y1 = xor <vscale x 16 x i8> %y, splat (i8 -1)
%a = or <vscale x 16 x i8> %x, %y1
%b = select <vscale x 16 x i1> %c, <vscale x 16 x i8> %a, <vscale x 16 x i8> %z
ret <vscale x 16 x i8> %b
diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll
index 4413dcd89f48..0f09f7dac298 100644
--- a/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll
+++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll
@@ -122,7 +122,7 @@ define <vscale x 2 x i64> @mul_nxv2i64_x(<vscale x 2 x i64> %x, <vscale x 2 x i6
; CHECK-NEXT: ret
entry:
%c = icmp sgt <vscale x 2 x i64> %n, zeroinitializer
- %a = select <vscale x 2 x i1> %c, <vscale x 2 x i64> %y, <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %a = select <vscale x 2 x i1> %c, <vscale x 2 x i64> %y, <vscale x 2 x i64> splat (i64 1)
%b = mul <vscale x 2 x i64> %a, %x
ret <vscale x 2 x i64> %b
}
@@ -136,7 +136,7 @@ define <vscale x 4 x i32> @mul_nxv4i32_x(<vscale x 4 x i32> %x, <vscale x 4 x i3
; CHECK-NEXT: ret
entry:
%c = icmp sgt <vscale x 4 x i32> %n, zeroinitializer
- %a = select <vscale x 4 x i1> %c, <vscale x 4 x i32> %y, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %a = select <vscale x 4 x i1> %c, <vscale x 4 x i32> %y, <vscale x 4 x i32> splat (i32 1)
%b = mul <vscale x 4 x i32> %a, %x
ret <vscale x 4 x i32> %b
}
@@ -150,7 +150,7 @@ define <vscale x 8 x i16> @mul_nxv8i16_x(<vscale x 8 x i16> %x, <vscale x 8 x i1
; CHECK-NEXT: ret
entry:
%c = icmp sgt <vscale x 8 x i16> %n, zeroinitializer
- %a = select <vscale x 8 x i1> %c, <vscale x 8 x i16> %y, <vscale x 8 x i16> shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %a = select <vscale x 8 x i1> %c, <vscale x 8 x i16> %y, <vscale x 8 x i16> splat (i16 1)
%b = mul <vscale x 8 x i16> %a, %x
ret <vscale x 8 x i16> %b
}
@@ -164,7 +164,7 @@ define <vscale x 16 x i8> @mul_nxv16i8_x(<vscale x 16 x i8> %x, <vscale x 16 x i
; CHECK-NEXT: ret
entry:
%c = icmp sgt <vscale x 16 x i8> %n, zeroinitializer
- %a = select <vscale x 16 x i1> %c, <vscale x 16 x i8> %y, <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+ %a = select <vscale x 16 x i1> %c, <vscale x 16 x i8> %y, <vscale x 16 x i8> splat (i8 1)
%b = mul <vscale x 16 x i8> %a, %x
ret <vscale x 16 x i8> %b
}
@@ -178,7 +178,7 @@ define <vscale x 2 x i64> @and_nxv2i64_x(<vscale x 2 x i64> %x, <vscale x 2 x i6
; CHECK-NEXT: ret
entry:
%c = icmp sgt <vscale x 2 x i64> %n, zeroinitializer
- %a = select <vscale x 2 x i1> %c, <vscale x 2 x i64> %y, <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 -1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %a = select <vscale x 2 x i1> %c, <vscale x 2 x i64> %y, <vscale x 2 x i64> splat (i64 -1)
%b = and <vscale x 2 x i64> %a, %x
ret <vscale x 2 x i64> %b
}
@@ -192,7 +192,7 @@ define <vscale x 4 x i32> @and_nxv4i32_x(<vscale x 4 x i32> %x, <vscale x 4 x i3
; CHECK-NEXT: ret
entry:
%c = icmp sgt <vscale x 4 x i32> %n, zeroinitializer
- %a = select <vscale x 4 x i1> %c, <vscale x 4 x i32> %y, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %a = select <vscale x 4 x i1> %c, <vscale x 4 x i32> %y, <vscale x 4 x i32> splat (i32 -1)
%b = and <vscale x 4 x i32> %a, %x
ret <vscale x 4 x i32> %b
}
@@ -206,7 +206,7 @@ define <vscale x 8 x i16> @and_nxv8i16_x(<vscale x 8 x i16> %x, <vscale x 8 x i1
; CHECK-NEXT: ret
entry:
%c = icmp sgt <vscale x 8 x i16> %n, zeroinitializer
- %a = select <vscale x 8 x i1> %c, <vscale x 8 x i16> %y, <vscale x 8 x i16> shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 -1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %a = select <vscale x 8 x i1> %c, <vscale x 8 x i16> %y, <vscale x 8 x i16> splat (i16 -1)
%b = and <vscale x 8 x i16> %a, %x
ret <vscale x 8 x i16> %b
}
@@ -220,7 +220,7 @@ define <vscale x 16 x i8> @and_nxv16i8_x(<vscale x 16 x i8> %x, <vscale x 16 x i
; CHECK-NEXT: ret
entry:
%c = icmp sgt <vscale x 16 x i8> %n, zeroinitializer
- %a = select <vscale x 16 x i1> %c, <vscale x 16 x i8> %y, <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 -1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+ %a = select <vscale x 16 x i1> %c, <vscale x 16 x i8> %y, <vscale x 16 x i8> splat (i8 -1)
%b = and <vscale x 16 x i8> %a, %x
ret <vscale x 16 x i8> %b
}
@@ -647,7 +647,7 @@ define <vscale x 4 x float> @fadd_nxv4f32_x(<vscale x 4 x float> %x, <vscale x 4
; CHECK-NEXT: ret
entry:
%c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
- %a = select <vscale x 4 x i1> %c, <vscale x 4 x float> %y, <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+ %a = select <vscale x 4 x i1> %c, <vscale x 4 x float> %y, <vscale x 4 x float> splat (float -0.000000e+00)
%b = fadd <vscale x 4 x float> %a, %x
ret <vscale x 4 x float> %b
}
@@ -662,7 +662,7 @@ define <vscale x 8 x half> @fadd_nxv8f16_x(<vscale x 8 x half> %x, <vscale x 8 x
; CHECK-NEXT: ret
entry:
%c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
- %a = select <vscale x 8 x i1> %c, <vscale x 8 x half> %y, <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH8000, i64 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer)
+ %a = select <vscale x 8 x i1> %c, <vscale x 8 x half> %y, <vscale x 8 x half> splat (half 0xH8000)
%b = fadd <vscale x 8 x half> %a, %x
ret <vscale x 8 x half> %b
}
@@ -677,7 +677,7 @@ define <vscale x 2 x double> @fadd_nxv2f64_x(<vscale x 2 x double> %x, <vscale x
; CHECK-NEXT: ret
entry:
%c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
- %a = select <vscale x 2 x i1> %c, <vscale x 2 x double> %y, <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double -0.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+ %a = select <vscale x 2 x i1> %c, <vscale x 2 x double> %y, <vscale x 2 x double> splat (double -0.000000e+00)
%b = fadd <vscale x 2 x double> %a, %x
ret <vscale x 2 x double> %b
}
@@ -737,7 +737,7 @@ define <vscale x 4 x float> @fmul_nxv4f32_x(<vscale x 4 x float> %x, <vscale x 4
; CHECK-NEXT: ret
entry:
%c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
- %a = select <vscale x 4 x i1> %c, <vscale x 4 x float> %y, <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+ %a = select <vscale x 4 x i1> %c, <vscale x 4 x float> %y, <vscale x 4 x float> splat (float 1.000000e+00)
%b = fmul <vscale x 4 x float> %a, %x
ret <vscale x 4 x float> %b
}
@@ -752,7 +752,7 @@ define <vscale x 8 x half> @fmul_nxv8f16_x(<vscale x 8 x half> %x, <vscale x 8 x
; CHECK-NEXT: ret
entry:
%c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
- %a = select <vscale x 8 x i1> %c, <vscale x 8 x half> %y, <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH3C00, i64 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer)
+ %a = select <vscale x 8 x i1> %c, <vscale x 8 x half> %y, <vscale x 8 x half> splat (half 0xH3C00)
%b = fmul <vscale x 8 x half> %a, %x
ret <vscale x 8 x half> %b
}
@@ -767,7 +767,7 @@ define <vscale x 2 x double> @fmul_nxv2f64_x(<vscale x 2 x double> %x, <vscale x
; CHECK-NEXT: ret
entry:
%c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
- %a = select <vscale x 2 x i1> %c, <vscale x 2 x double> %y, <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+ %a = select <vscale x 2 x i1> %c, <vscale x 2 x double> %y, <vscale x 2 x double> splat (double 1.000000e+00)
%b = fmul <vscale x 2 x double> %a, %x
ret <vscale x 2 x double> %b
}
@@ -783,7 +783,7 @@ define <vscale x 4 x float> @fdiv_nxv4f32_x(<vscale x 4 x float> %x, <vscale x 4
; CHECK-NEXT: ret
entry:
%c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
- %a = select <vscale x 4 x i1> %c, <vscale x 4 x float> %y, <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+ %a = select <vscale x 4 x i1> %c, <vscale x 4 x float> %y, <vscale x 4 x float> splat (float 1.000000e+00)
%b = fdiv <vscale x 4 x float> %x, %a
ret <vscale x 4 x float> %b
}
@@ -799,7 +799,7 @@ define <vscale x 8 x half> @fdiv_nxv8f16_x(<vscale x 8 x half> %x, <vscale x 8 x
; CHECK-NEXT: ret
entry:
%c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
- %a = select <vscale x 8 x i1> %c, <vscale x 8 x half> %y, <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH3C00, i64 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer)
+ %a = select <vscale x 8 x i1> %c, <vscale x 8 x half> %y, <vscale x 8 x half> splat (half 0xH3C00)
%b = fdiv <vscale x 8 x half> %x, %a
ret <vscale x 8 x half> %b
}
@@ -815,7 +815,7 @@ define <vscale x 2 x double> @fdiv_nxv2f64_x(<vscale x 2 x double> %x, <vscale x
; CHECK-NEXT: ret
entry:
%c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
- %a = select <vscale x 2 x i1> %c, <vscale x 2 x double> %y, <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+ %a = select <vscale x 2 x i1> %c, <vscale x 2 x double> %y, <vscale x 2 x double> splat (double 1.000000e+00)
%b = fdiv <vscale x 2 x double> %x, %a
ret <vscale x 2 x double> %b
}
@@ -831,7 +831,7 @@ define <vscale x 4 x float> @fma_nxv4f32_x(<vscale x 4 x float> %x, <vscale x 4
entry:
%c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
%m = fmul fast <vscale x 4 x float> %y, %z
- %a = select <vscale x 4 x i1> %c, <vscale x 4 x float> %m, <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+ %a = select <vscale x 4 x i1> %c, <vscale x 4 x float> %m, <vscale x 4 x float> splat (float -0.000000e+00)
%b = fadd fast <vscale x 4 x float> %a, %x
ret <vscale x 4 x float> %b
}
@@ -847,7 +847,7 @@ define <vscale x 8 x half> @fma_nxv8f16_x(<vscale x 8 x half> %x, <vscale x 8 x
entry:
%c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
%m = fmul fast <vscale x 8 x half> %y, %z
- %a = select <vscale x 8 x i1> %c, <vscale x 8 x half> %m, <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH8000, i64 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer)
+ %a = select <vscale x 8 x i1> %c, <vscale x 8 x half> %m, <vscale x 8 x half> splat (half 0xH8000)
%b = fadd fast <vscale x 8 x half> %a, %x
ret <vscale x 8 x half> %b
}
@@ -863,7 +863,7 @@ define <vscale x 2 x double> @fma_nxv2f64_x(<vscale x 2 x double> %x, <vscale x
entry:
%c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
%m = fmul fast <vscale x 2 x double> %y, %z
- %a = select <vscale x 2 x i1> %c, <vscale x 2 x double> %m, <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double -0.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+ %a = select <vscale x 2 x i1> %c, <vscale x 2 x double> %m, <vscale x 2 x double> splat (double -0.000000e+00)
%b = fadd fast <vscale x 2 x double> %a, %x
ret <vscale x 2 x double> %b
}
@@ -998,7 +998,7 @@ define <vscale x 2 x i64> @mul_nxv2i64_y(<vscale x 2 x i64> %x, <vscale x 2 x i6
; CHECK-NEXT: ret
entry:
%c = icmp sgt <vscale x 2 x i64> %n, zeroinitializer
- %a = select <vscale x 2 x i1> %c, <vscale x 2 x i64> %x, <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %a = select <vscale x 2 x i1> %c, <vscale x 2 x i64> %x, <vscale x 2 x i64> splat (i64 1)
%b = mul <vscale x 2 x i64> %a, %y
ret <vscale x 2 x i64> %b
}
@@ -1013,7 +1013,7 @@ define <vscale x 4 x i32> @mul_nxv4i32_y(<vscale x 4 x i32> %x, <vscale x 4 x i3
; CHECK-NEXT: ret
entry:
%c = icmp sgt <vscale x 4 x i32> %n, zeroinitializer
- %a = select <vscale x 4 x i1> %c, <vscale x 4 x i32> %x, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %a = select <vscale x 4 x i1> %c, <vscale x 4 x i32> %x, <vscale x 4 x i32> splat (i32 1)
%b = mul <vscale x 4 x i32> %a, %y
ret <vscale x 4 x i32> %b
}
@@ -1028,7 +1028,7 @@ define <vscale x 8 x i16> @mul_nxv8i16_y(<vscale x 8 x i16> %x, <vscale x 8 x i1
; CHECK-NEXT: ret
entry:
%c = icmp sgt <vscale x 8 x i16> %n, zeroinitializer
- %a = select <vscale x 8 x i1> %c, <vscale x 8 x i16> %x, <vscale x 8 x i16> shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %a = select <vscale x 8 x i1> %c, <vscale x 8 x i16> %x, <vscale x 8 x i16> splat (i16 1)
%b = mul <vscale x 8 x i16> %a, %y
ret <vscale x 8 x i16> %b
}
@@ -1043,7 +1043,7 @@ define <vscale x 16 x i8> @mul_nxv16i8_y(<vscale x 16 x i8> %x, <vscale x 16 x i
; CHECK-NEXT: ret
entry:
%c = icmp sgt <vscale x 16 x i8> %n, zeroinitializer
- %a = select <vscale x 16 x i1> %c, <vscale x 16 x i8> %x, <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+ %a = select <vscale x 16 x i1> %c, <vscale x 16 x i8> %x, <vscale x 16 x i8> splat (i8 1)
%b = mul <vscale x 16 x i8> %a, %y
ret <vscale x 16 x i8> %b
}
@@ -1058,7 +1058,7 @@ define <vscale x 2 x i64> @and_nxv2i64_y(<vscale x 2 x i64> %x, <vscale x 2 x i6
; CHECK-NEXT: ret
entry:
%c = icmp sgt <vscale x 2 x i64> %n, zeroinitializer
- %a = select <vscale x 2 x i1> %c, <vscale x 2 x i64> %x, <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 -1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %a = select <vscale x 2 x i1> %c, <vscale x 2 x i64> %x, <vscale x 2 x i64> splat (i64 -1)
%b = and <vscale x 2 x i64> %a, %y
ret <vscale x 2 x i64> %b
}
@@ -1073,7 +1073,7 @@ define <vscale x 4 x i32> @and_nxv4i32_y(<vscale x 4 x i32> %x, <vscale x 4 x i3
; CHECK-NEXT: ret
entry:
%c = icmp sgt <vscale x 4 x i32> %n, zeroinitializer
- %a = select <vscale x 4 x i1> %c, <vscale x 4 x i32> %x, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %a = select <vscale x 4 x i1> %c, <vscale x 4 x i32> %x, <vscale x 4 x i32> splat (i32 -1)
%b = and <vscale x 4 x i32> %a, %y
ret <vscale x 4 x i32> %b
}
@@ -1088,7 +1088,7 @@ define <vscale x 8 x i16> @and_nxv8i16_y(<vscale x 8 x i16> %x, <vscale x 8 x i1
; CHECK-NEXT: ret
entry:
%c = icmp sgt <vscale x 8 x i16> %n, zeroinitializer
- %a = select <vscale x 8 x i1> %c, <vscale x 8 x i16> %x, <vscale x 8 x i16> shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 -1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %a = select <vscale x 8 x i1> %c, <vscale x 8 x i16> %x, <vscale x 8 x i16> splat (i16 -1)
%b = and <vscale x 8 x i16> %a, %y
ret <vscale x 8 x i16> %b
}
@@ -1103,7 +1103,7 @@ define <vscale x 16 x i8> @and_nxv16i8_y(<vscale x 16 x i8> %x, <vscale x 16 x i
; CHECK-NEXT: ret
entry:
%c = icmp sgt <vscale x 16 x i8> %n, zeroinitializer
- %a = select <vscale x 16 x i1> %c, <vscale x 16 x i8> %x, <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 -1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+ %a = select <vscale x 16 x i1> %c, <vscale x 16 x i8> %x, <vscale x 16 x i8> splat (i8 -1)
%b = and <vscale x 16 x i8> %a, %y
ret <vscale x 16 x i8> %b
}
@@ -1547,7 +1547,7 @@ define <vscale x 4 x float> @fadd_nxv4f32_y(<vscale x 4 x float> %x, <vscale x 4
; CHECK-NEXT: ret
entry:
%c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
- %a = select <vscale x 4 x i1> %c, <vscale x 4 x float> %x, <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+ %a = select <vscale x 4 x i1> %c, <vscale x 4 x float> %x, <vscale x 4 x float> splat (float -0.000000e+00)
%b = fadd <vscale x 4 x float> %a, %y
ret <vscale x 4 x float> %b
}
@@ -1563,7 +1563,7 @@ define <vscale x 8 x half> @fadd_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8 x
; CHECK-NEXT: ret
entry:
%c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
- %a = select <vscale x 8 x i1> %c, <vscale x 8 x half> %x, <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH8000, i64 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer)
+ %a = select <vscale x 8 x i1> %c, <vscale x 8 x half> %x, <vscale x 8 x half> splat (half 0xH8000)
%b = fadd <vscale x 8 x half> %a, %y
ret <vscale x 8 x half> %b
}
@@ -1579,7 +1579,7 @@ define <vscale x 2 x double> @fadd_nxv2f64_y(<vscale x 2 x double> %x, <vscale x
; CHECK-NEXT: ret
entry:
%c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
- %a = select <vscale x 2 x i1> %c, <vscale x 2 x double> %x, <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double -0.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+ %a = select <vscale x 2 x i1> %c, <vscale x 2 x double> %x, <vscale x 2 x double> splat (double -0.000000e+00)
%b = fadd <vscale x 2 x double> %a, %y
ret <vscale x 2 x double> %b
}
@@ -1643,7 +1643,7 @@ define <vscale x 4 x float> @fmul_nxv4f32_y(<vscale x 4 x float> %x, <vscale x 4
; CHECK-NEXT: ret
entry:
%c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
- %a = select <vscale x 4 x i1> %c, <vscale x 4 x float> %x, <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+ %a = select <vscale x 4 x i1> %c, <vscale x 4 x float> %x, <vscale x 4 x float> splat (float 1.000000e+00)
%b = fmul <vscale x 4 x float> %a, %y
ret <vscale x 4 x float> %b
}
@@ -1659,7 +1659,7 @@ define <vscale x 8 x half> @fmul_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8 x
; CHECK-NEXT: ret
entry:
%c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
- %a = select <vscale x 8 x i1> %c, <vscale x 8 x half> %x, <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH3C00, i64 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer)
+ %a = select <vscale x 8 x i1> %c, <vscale x 8 x half> %x, <vscale x 8 x half> splat (half 0xH3C00)
%b = fmul <vscale x 8 x half> %a, %y
ret <vscale x 8 x half> %b
}
@@ -1675,7 +1675,7 @@ define <vscale x 2 x double> @fmul_nxv2f64_y(<vscale x 2 x double> %x, <vscale x
; CHECK-NEXT: ret
entry:
%c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
- %a = select <vscale x 2 x i1> %c, <vscale x 2 x double> %x, <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+ %a = select <vscale x 2 x i1> %c, <vscale x 2 x double> %x, <vscale x 2 x double> splat (double 1.000000e+00)
%b = fmul <vscale x 2 x double> %a, %y
ret <vscale x 2 x double> %b
}
@@ -1840,7 +1840,7 @@ define <vscale x 4 x i32> @mul_nxv4i32_multiuse_x(<vscale x 4 x i32> %x, <vscale
; CHECK-NEXT: ret
entry:
%c = icmp sgt <vscale x 4 x i32> %n, zeroinitializer
- %a = select <vscale x 4 x i1> %c, <vscale x 4 x i32> %y, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %a = select <vscale x 4 x i1> %c, <vscale x 4 x i32> %y, <vscale x 4 x i32> splat (i32 1)
store <vscale x 4 x i32> %a, ptr %p
%b = mul <vscale x 4 x i32> %a, %x
ret <vscale x 4 x i32> %b
diff --git a/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll
index d06c13e47bef..4607f225f81e 100644
--- a/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll
+++ b/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll
@@ -9,7 +9,7 @@ define <vscale x 16 x i8> @sdiv_i8(<vscale x 16 x i8> %a) #0 {
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #4
; CHECK-NEXT: ret
- %out = sdiv <vscale x 16 x i8> %a, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 16, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+ %out = sdiv <vscale x 16 x i8> %a, splat (i8 16)
ret <vscale x 16 x i8> %out
}
@@ -20,7 +20,7 @@ define <vscale x 16 x i8> @sdiv_i8_neg(<vscale x 16 x i8> %a) #0 {
; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #6
; CHECK-NEXT: subr z0.b, z0.b, #0 // =0x0
; CHECK-NEXT: ret
- %out = sdiv <vscale x 16 x i8> %a, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 -64, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+ %out = sdiv <vscale x 16 x i8> %a, splat (i8 -64)
ret <vscale x 16 x i8> %out
}
@@ -30,7 +30,7 @@ define <vscale x 8 x i16> @sdiv_i16(<vscale x 8 x i16> %a) #0 {
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #10
; CHECK-NEXT: ret
- %out = sdiv <vscale x 8 x i16> %a, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1024, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %out = sdiv <vscale x 8 x i16> %a, splat (i16 1024)
ret <vscale x 8 x i16> %out
}
@@ -41,7 +41,7 @@ define <vscale x 8 x i16> @sdiv_i16_neg(<vscale x 8 x i16> %a) #0 {
; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #12
; CHECK-NEXT: subr z0.h, z0.h, #0 // =0x0
; CHECK-NEXT: ret
- %out = sdiv <vscale x 8 x i16> %a, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 -4096, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %out = sdiv <vscale x 8 x i16> %a, splat (i16 -4096)
ret <vscale x 8 x i16> %out
}
@@ -51,7 +51,7 @@ define <vscale x 4 x i32> @sdiv_i32(<vscale x 4 x i32> %a) #0 {
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #23
; CHECK-NEXT: ret
- %out = sdiv <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 8388608, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %out = sdiv <vscale x 4 x i32> %a, splat (i32 8388608)
ret <vscale x 4 x i32> %out
}
@@ -62,7 +62,7 @@ define <vscale x 4 x i32> @sdiv_i32_neg(<vscale x 4 x i32> %a) #0 {
; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #25
; CHECK-NEXT: subr z0.s, z0.s, #0 // =0x0
; CHECK-NEXT: ret
- %out = sdiv <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -33554432, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %out = sdiv <vscale x 4 x i32> %a, splat (i32 -33554432)
ret <vscale x 4 x i32> %out
}
@@ -72,7 +72,7 @@ define <vscale x 2 x i64> @sdiv_i64(<vscale x 2 x i64> %a) #0 {
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #53
; CHECK-NEXT: ret
- %out = sdiv <vscale x 2 x i64> %a, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 9007199254740992, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %out = sdiv <vscale x 2 x i64> %a, splat (i64 9007199254740992)
ret <vscale x 2 x i64> %out
}
@@ -83,7 +83,7 @@ define <vscale x 2 x i64> @sdiv_i64_neg(<vscale x 2 x i64> %a) #0 {
; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #55
; CHECK-NEXT: subr z0.d, z0.d, #0 // =0x0
; CHECK-NEXT: ret
- %out = sdiv <vscale x 2 x i64> %a, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 -36028797018963968, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %out = sdiv <vscale x 2 x i64> %a, splat (i64 -36028797018963968)
ret <vscale x 2 x i64> %out
}
diff --git a/llvm/test/CodeGen/AArch64/sve-splat-sext.ll b/llvm/test/CodeGen/AArch64/sve-splat-sext.ll
index fb71ab1d5eeb..f689aa6469a2 100644
--- a/llvm/test/CodeGen/AArch64/sve-splat-sext.ll
+++ b/llvm/test/CodeGen/AArch64/sve-splat-sext.ll
@@ -8,8 +8,8 @@ define <vscale x 8 x i16> @sext_splat_v8i16_128() {
; CHECK-NEXT: ret
%i = insertelement <vscale x 8 x i16> poison, i16 128, i32 0
%s = shufflevector <vscale x 8 x i16> %i, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
- %a = shl <vscale x 8 x i16> %s, shufflevector (<vscale x 8 x i16> insertelement(<vscale x 8 x i16> undef, i16 8, i32 0), <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer)
- %b = ashr <vscale x 8 x i16> %a, shufflevector (<vscale x 8 x i16> insertelement(<vscale x 8 x i16> undef, i16 8, i32 0), <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer)
+ %a = shl <vscale x 8 x i16> %s, splat (i16 8)
+ %b = ashr <vscale x 8 x i16> %a, splat (i16 8)
ret <vscale x 8 x i16> %b
}
diff --git a/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll b/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll
index d001ae9f7712..9c3d4b1e5a81 100644
--- a/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll
+++ b/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll
@@ -12,7 +12,7 @@ define <vscale x 4 x i32> @srem_combine_loop(<vscale x 4 x i32> %a) #0 {
; CHECK-NEXT: asrd z1.s, p0/m, z1.s, #1
; CHECK-NEXT: mls z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT: ret
- %rem = srem <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %rem = srem <vscale x 4 x i32> %a, splat (i32 2)
ret <vscale x 4 x i32> %rem
}
diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll
index 0afd11d098a0..02b9562b8f52 100644
--- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll
@@ -10,8 +10,8 @@ define void @add_lshr_rshrnb_b_6(ptr %ptr, ptr %dst, i64 %index){
; CHECK-NEXT: st1b { z0.h }, p0, [x1, x2]
; CHECK-NEXT: ret
%load = load <vscale x 8 x i16>, ptr %ptr, align 2
- %1 = add <vscale x 8 x i16> %load, trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 32, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
- %2 = lshr <vscale x 8 x i16> %1, trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 6, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
+ %1 = add <vscale x 8 x i16> %load, trunc (<vscale x 8 x i32> splat (i32 32) to <vscale x 8 x i16>)
+ %2 = lshr <vscale x 8 x i16> %1, trunc (<vscale x 8 x i32> splat (i32 6) to <vscale x 8 x i16>)
%3 = trunc <vscale x 8 x i16> %2 to <vscale x 8 x i8>
%4 = getelementptr inbounds i8, ptr %dst, i64 %index
store <vscale x 8 x i8> %3, ptr %4, align 1
@@ -28,8 +28,8 @@ define void @neg_add_lshr_rshrnb_b_6(ptr %ptr, ptr %dst, i64 %index){
; CHECK-NEXT: st1b { z0.h }, p0, [x1, x2]
; CHECK-NEXT: ret
%load = load <vscale x 8 x i16>, ptr %ptr, align 2
- %1 = add <vscale x 8 x i16> %load, trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
- %2 = lshr <vscale x 8 x i16> %1, trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 6, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
+ %1 = add <vscale x 8 x i16> %load, trunc (<vscale x 8 x i32> splat (i32 1) to <vscale x 8 x i16>)
+ %2 = lshr <vscale x 8 x i16> %1, trunc (<vscale x 8 x i32> splat (i32 6) to <vscale x 8 x i16>)
%3 = trunc <vscale x 8 x i16> %2 to <vscale x 8 x i8>
%4 = getelementptr inbounds i8, ptr %dst, i64 %index
store <vscale x 8 x i8> %3, ptr %4, align 1
@@ -45,8 +45,8 @@ define void @add_lshr_rshrnb_h_7(ptr %ptr, ptr %dst, i64 %index){
; CHECK-NEXT: st1b { z0.h }, p0, [x1, x2]
; CHECK-NEXT: ret
%load = load <vscale x 8 x i16>, ptr %ptr, align 2
- %1 = add <vscale x 8 x i16> %load, trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 64, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
- %2 = lshr <vscale x 8 x i16> %1, trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 7, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
+ %1 = add <vscale x 8 x i16> %load, trunc (<vscale x 8 x i32> splat (i32 64) to <vscale x 8 x i16>)
+ %2 = lshr <vscale x 8 x i16> %1, trunc (<vscale x 8 x i32> splat (i32 7) to <vscale x 8 x i16>)
%3 = trunc <vscale x 8 x i16> %2 to <vscale x 8 x i8>
%4 = getelementptr inbounds i8, ptr %dst, i64 %index
store <vscale x 8 x i8> %3, ptr %4, align 1
@@ -62,8 +62,8 @@ define void @add_lshr_rshrn_h_6(ptr %ptr, ptr %dst, i64 %index){
; CHECK-NEXT: st1h { z0.s }, p0, [x1, x2, lsl #1]
; CHECK-NEXT: ret
%load = load <vscale x 4 x i32>, ptr %ptr, align 2
- %1 = add <vscale x 4 x i32> %load, trunc (<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 32, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i32>)
- %2 = lshr <vscale x 4 x i32> %1, trunc (<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 6, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i32>)
+ %1 = add <vscale x 4 x i32> %load, trunc (<vscale x 4 x i64> splat (i64 32) to <vscale x 4 x i32>)
+ %2 = lshr <vscale x 4 x i32> %1, trunc (<vscale x 4 x i64> splat (i64 6) to <vscale x 4 x i32>)
%3 = trunc <vscale x 4 x i32> %2 to <vscale x 4 x i16>
%4 = getelementptr inbounds i16, ptr %dst, i64 %index
store <vscale x 4 x i16> %3, ptr %4, align 1
@@ -79,8 +79,8 @@ define void @add_lshr_rshrnb_h_2(ptr %ptr, ptr %dst, i64 %index){
; CHECK-NEXT: st1h { z0.s }, p0, [x1, x2, lsl #1]
; CHECK-NEXT: ret
%load = load <vscale x 4 x i32>, ptr %ptr, align 2
- %1 = add <vscale x 4 x i32> %load, trunc (<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 2, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i32>)
- %2 = lshr <vscale x 4 x i32> %1, trunc (<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 2, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i32>)
+ %1 = add <vscale x 4 x i32> %load, trunc (<vscale x 4 x i64> splat (i64 2) to <vscale x 4 x i32>)
+ %2 = lshr <vscale x 4 x i32> %1, trunc (<vscale x 4 x i64> splat (i64 2) to <vscale x 4 x i32>)
%3 = trunc <vscale x 4 x i32> %2 to <vscale x 4 x i16>
%4 = getelementptr inbounds i16, ptr %dst, i64 %index
store <vscale x 4 x i16> %3, ptr %4, align 1
@@ -92,8 +92,8 @@ define void @neg_add_lshr_rshrnb_h_0(ptr %ptr, ptr %dst, i64 %index){
; CHECK: // %bb.0:
; CHECK-NEXT: ret
%load = load <vscale x 4 x i32>, ptr %ptr, align 2
- %1 = add <vscale x 4 x i32> %load, trunc (<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i32>)
- %2 = lshr <vscale x 4 x i32> %1, trunc (<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 -1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i32>)
+ %1 = add <vscale x 4 x i32> %load, trunc (<vscale x 4 x i64> splat (i64 1) to <vscale x 4 x i32>)
+ %2 = lshr <vscale x 4 x i32> %1, trunc (<vscale x 4 x i64> splat (i64 -1) to <vscale x 4 x i32>)
%3 = trunc <vscale x 4 x i32> %2 to <vscale x 4 x i16>
%4 = getelementptr inbounds i16, ptr %dst, i64 %index
store <vscale x 4 x i16> %3, ptr %4, align 1
@@ -109,8 +109,8 @@ define void @neg_zero_shift(ptr %ptr, ptr %dst, i64 %index){
; CHECK-NEXT: st1h { z0.s }, p0, [x1, x2, lsl #1]
; CHECK-NEXT: ret
%load = load <vscale x 4 x i32>, ptr %ptr, align 2
- %1 = add <vscale x 4 x i32> %load, trunc (<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i32>)
- %2 = lshr <vscale x 4 x i32> %1, trunc (<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 0, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i32>)
+ %1 = add <vscale x 4 x i32> %load, trunc (<vscale x 4 x i64> splat (i64 1) to <vscale x 4 x i32>)
+ %2 = lshr <vscale x 4 x i32> %1, trunc (<vscale x 4 x i64> splat (i64 0) to <vscale x 4 x i32>)
%3 = trunc <vscale x 4 x i32> %2 to <vscale x 4 x i16>
%4 = getelementptr inbounds i16, ptr %dst, i64 %index
store <vscale x 4 x i16> %3, ptr %4, align 1
@@ -128,8 +128,8 @@ define void @wide_add_shift_add_rshrnb_b(ptr %dest, i64 %index, <vscale x 16 x i
; CHECK-NEXT: add z0.b, z1.b, z0.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0, x1]
; CHECK-NEXT: ret
- %1 = add <vscale x 16 x i16> %arg1, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 32, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
- %2 = lshr <vscale x 16 x i16> %1, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 6, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+ %1 = add <vscale x 16 x i16> %arg1, splat (i16 32)
+ %2 = lshr <vscale x 16 x i16> %1, splat (i16 6)
%3 = getelementptr inbounds i8, ptr %dest, i64 %index
%load = load <vscale x 16 x i8>, ptr %3, align 2
%4 = trunc <vscale x 16 x i16> %2 to <vscale x 16 x i8>
@@ -149,8 +149,8 @@ define void @wide_add_shift_add_rshrnb_h(ptr %dest, i64 %index, <vscale x 8 x i3
; CHECK-NEXT: add z0.h, z1.h, z0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
- %1 = add <vscale x 8 x i32> %arg1, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 32, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
- %2 = lshr <vscale x 8 x i32> %1, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 6, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ %1 = add <vscale x 8 x i32> %arg1, splat (i32 32)
+ %2 = lshr <vscale x 8 x i32> %1, splat (i32 6)
%3 = getelementptr inbounds i16, ptr %dest, i64 %index
%load = load <vscale x 8 x i16>, ptr %3, align 2
%4 = trunc <vscale x 8 x i32> %2 to <vscale x 8 x i16>
@@ -170,8 +170,8 @@ define void @wide_add_shift_add_rshrnb_d(ptr %dest, i64 %index, <vscale x 4 x i6
; CHECK-NEXT: add z0.s, z1.s, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
- %1 = add <vscale x 4 x i64> %arg1, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 2147483648, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
- %2 = lshr <vscale x 4 x i64> %1, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 32, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+ %1 = add <vscale x 4 x i64> %arg1, splat (i64 2147483648)
+ %2 = lshr <vscale x 4 x i64> %1, splat (i64 32)
%3 = getelementptr inbounds i32, ptr %dest, i64 %index
%load = load <vscale x 4 x i32>, ptr %3, align 4
%4 = trunc <vscale x 4 x i64> %2 to <vscale x 4 x i32>
@@ -195,8 +195,8 @@ define void @neg_wide_add_shift_add_rshrnb_d(ptr %dest, i64 %index, <vscale x 4
; CHECK-NEXT: add z0.s, z1.s, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
- %1 = add <vscale x 4 x i64> %arg1, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 140737488355328, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
- %2 = lshr <vscale x 4 x i64> %1, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 48, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+ %1 = add <vscale x 4 x i64> %arg1, splat (i64 140737488355328)
+ %2 = lshr <vscale x 4 x i64> %1, splat (i64 48)
%3 = getelementptr inbounds i32, ptr %dest, i64 %index
%load = load <vscale x 4 x i32>, ptr %3, align 4
%4 = trunc <vscale x 4 x i64> %2 to <vscale x 4 x i32>
@@ -216,7 +216,7 @@ define void @neg_trunc_lsr_add_op1_not_splat(ptr %ptr, ptr %dst, i64 %index, <vs
; CHECK-NEXT: ret
%load = load <vscale x 8 x i16>, ptr %ptr, align 2
%1 = add <vscale x 8 x i16> %load, %add_op1
- %2 = lshr <vscale x 8 x i16> %1, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 6, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %2 = lshr <vscale x 8 x i16> %1, splat (i16 6)
%3 = trunc <vscale x 8 x i16> %2 to <vscale x 8 x i8>
%4 = getelementptr inbounds i8, ptr %dst, i64 %index
store <vscale x 8 x i8> %3, ptr %4, align 1
@@ -233,7 +233,7 @@ define void @neg_trunc_lsr_op1_not_splat(ptr %ptr, ptr %dst, i64 %index, <vscale
; CHECK-NEXT: st1b { z0.h }, p0, [x1, x2]
; CHECK-NEXT: ret
%load = load <vscale x 8 x i16>, ptr %ptr, align 2
- %1 = add <vscale x 8 x i16> %load, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 32, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %1 = add <vscale x 8 x i16> %load, splat (i16 32)
%2 = lshr <vscale x 8 x i16> %1, %lshr_op1
%3 = trunc <vscale x 8 x i16> %2 to <vscale x 8 x i8>
%4 = getelementptr inbounds i8, ptr %dst, i64 %index
@@ -253,8 +253,8 @@ define void @neg_add_has_two_uses(ptr %ptr, ptr %dst, ptr %dst2, i64 %index){
; CHECK-NEXT: st1b { z0.h }, p0, [x1, x3]
; CHECK-NEXT: ret
%load = load <vscale x 8 x i16>, ptr %ptr, align 2
- %1 = add <vscale x 8 x i16> %load, trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 32, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
- %2 = lshr <vscale x 8 x i16> %1, trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 6, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
+ %1 = add <vscale x 8 x i16> %load, trunc (<vscale x 8 x i32> splat (i32 32) to <vscale x 8 x i16>)
+ %2 = lshr <vscale x 8 x i16> %1, trunc (<vscale x 8 x i32> splat (i32 6) to <vscale x 8 x i16>)
%3 = add <vscale x 8 x i16> %1, %1
%4 = getelementptr inbounds i16, ptr %dst2, i64 %index
%5 = trunc <vscale x 8 x i16> %2 to <vscale x 8 x i8>
@@ -273,8 +273,8 @@ define void @add_lshr_rshrnb_s(ptr %ptr, ptr %dst, i64 %index){
; CHECK-NEXT: st1w { z0.d }, p0, [x1, x2, lsl #2]
; CHECK-NEXT: ret
%load = load <vscale x 2 x i64>, ptr %ptr, align 2
- %1 = add <vscale x 2 x i64> %load, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 32, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
- %2 = lshr <vscale x 2 x i64> %1, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 6, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %1 = add <vscale x 2 x i64> %load, splat (i64 32)
+ %2 = lshr <vscale x 2 x i64> %1, splat (i64 6)
%3 = trunc <vscale x 2 x i64> %2 to <vscale x 2 x i32>
%4 = getelementptr inbounds i32, ptr %dst, i64 %index
store <vscale x 2 x i32> %3, ptr %4, align 1
@@ -291,8 +291,8 @@ define void @neg_add_lshr_rshrnb_s(ptr %ptr, ptr %dst, i64 %index){
; CHECK-NEXT: st1h { z0.d }, p0, [x1, x2, lsl #1]
; CHECK-NEXT: ret
%load = load <vscale x 2 x i64>, ptr %ptr, align 2
- %1 = add <vscale x 2 x i64> %load, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 32, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
- %2 = lshr <vscale x 2 x i64> %1, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 6, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %1 = add <vscale x 2 x i64> %load, splat (i64 32)
+ %2 = lshr <vscale x 2 x i64> %1, splat (i64 6)
%3 = trunc <vscale x 2 x i64> %2 to <vscale x 2 x i16>
%4 = getelementptr inbounds i16, ptr %dst, i64 %index
store <vscale x 2 x i16> %3, ptr %4, align 1
@@ -307,8 +307,8 @@ define void @masked_store_rshrnb(ptr %ptr, ptr %dst, i64 %index, <vscale x 8 x i
; CHECK-NEXT: st1b { z0.h }, p0, [x1, x2]
; CHECK-NEXT: ret
%wide.masked.load = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %ptr, i32 2, <vscale x 8 x i1> %mask, <vscale x 8 x i16> poison)
- %1 = add <vscale x 8 x i16> %wide.masked.load, trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 32, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
- %2 = lshr <vscale x 8 x i16> %1, trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 6, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
+ %1 = add <vscale x 8 x i16> %wide.masked.load, trunc (<vscale x 8 x i32> splat (i32 32) to <vscale x 8 x i16>)
+ %2 = lshr <vscale x 8 x i16> %1, trunc (<vscale x 8 x i32> splat (i32 6) to <vscale x 8 x i16>)
%3 = trunc <vscale x 8 x i16> %2 to <vscale x 8 x i8>
%4 = getelementptr inbounds i8, ptr %dst, i64 %index
tail call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> %3, ptr %4, i32 1, <vscale x 8 x i1> %mask)
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 86dd1bdd511e..66b49466cc73 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -2182,8 +2182,8 @@ define i32 @test_udot_v24i8(ptr %p1, ptr %p2) {
; CHECK-GI-DOT-NEXT: ldr b5, [x0, #15]
; CHECK-GI-DOT-NEXT: mov v2.b[14], v6.b[0]
; CHECK-GI-DOT-NEXT: ldr b6, [x1, #15]
-; CHECK-GI-DOT-NEXT: mov v3.d[1], v0.d[0]
-; CHECK-GI-DOT-NEXT: mov v4.d[1], v0.d[0]
+; CHECK-GI-DOT-NEXT: fmov d3, d3
+; CHECK-GI-DOT-NEXT: fmov d4, d4
; CHECK-GI-DOT-NEXT: mov v1.b[15], v5.b[0]
; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: mov v2.b[15], v6.b[0]
@@ -2760,8 +2760,8 @@ define i32 @test_sdot_v24i8(ptr %p1, ptr %p2) {
; CHECK-GI-DOT-NEXT: ldr b5, [x0, #15]
; CHECK-GI-DOT-NEXT: mov v2.b[14], v6.b[0]
; CHECK-GI-DOT-NEXT: ldr b6, [x1, #15]
-; CHECK-GI-DOT-NEXT: mov v3.d[1], v0.d[0]
-; CHECK-GI-DOT-NEXT: mov v4.d[1], v0.d[0]
+; CHECK-GI-DOT-NEXT: fmov d3, d3
+; CHECK-GI-DOT-NEXT: fmov d4, d4
; CHECK-GI-DOT-NEXT: mov v1.b[15], v5.b[0]
; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: mov v2.b[15], v6.b[0]
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
index 8988481708cf..d71aed2d1750 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
@@ -187,27 +187,12 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind {
}
define i32 @test_v3i32(<3 x i32> %a) nounwind {
-; CHECK-SD-LABEL: test_v3i32:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov v0.s[3], wzr
-; CHECK-SD-NEXT: umaxv s0, v0.4s
-; CHECK-SD-NEXT: fmov w0, s0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_v3i32:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: mov s2, v0.s[2]
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: cmp w8, w9
-; CHECK-GI-NEXT: fmov w9, s2
-; CHECK-GI-NEXT: fcsel s0, s0, s1, hi
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: cmp w8, w9
-; CHECK-GI-NEXT: fcsel s0, s0, s2, hi
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_v3i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.s[3], wzr
+; CHECK-NEXT: umaxv s0, v0.4s
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%b = call i32 @llvm.vector.reduce.umax.v3i32(<3 x i32> %a)
ret i32 %b
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-no-rtn.ll
index 684fd8a86a69..0816eae28f61 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-no-rtn.ll
@@ -19,6 +19,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_offset_no_rtn(float %val, <4 x i32
; GFX908_GFX11-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4
; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX908_GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offset_no_rtn
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0
@@ -32,6 +33,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_offset_no_rtn(float %val, <4 x i32
; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4
; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: buffer_atomic_fadd_f32_offset_no_rtn
; GFX12: bb.1 (%ir-block.0):
; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0
@@ -64,6 +66,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_offen_no_rtn(float %val, <4 x i32>
; GFX908_GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX908_GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offen_no_rtn
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1
@@ -78,6 +81,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_offen_no_rtn(float %val, <4 x i32>
; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: buffer_atomic_fadd_f32_offen_no_rtn
; GFX12: bb.1 (%ir-block.0):
; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1
@@ -111,6 +115,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_idxen_no_rtn(float %val, <4 x i32>
; GFX908_GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX908_GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_idxen_no_rtn
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1
@@ -125,6 +130,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_idxen_no_rtn(float %val, <4 x i32>
; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: buffer_atomic_fadd_f32_idxen_no_rtn
; GFX12: bb.1 (%ir-block.0):
; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1
@@ -160,6 +166,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_bothen_no_rtn(float %val, <4 x i32
; GFX908_GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX908_GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_bothen_no_rtn
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
@@ -176,6 +183,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_bothen_no_rtn(float %val, <4 x i32
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: buffer_atomic_fadd_f32_bothen_no_rtn
; GFX12: bb.1 (%ir-block.0):
; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
@@ -210,6 +218,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_offset_no_rtn(float %val, ptr
; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX908_GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_no_rtn
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0
@@ -223,6 +232,20 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_offset_no_rtn(float %val, ptr
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_no_rtn
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX12-NEXT: BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+ ; GFX12-NEXT: S_ENDPGM 0
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
ret void
}
@@ -242,6 +265,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_offen_no_rtn(float %val, ptr a
; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX908_GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_no_rtn
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1
@@ -256,6 +280,21 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_offen_no_rtn(float %val, ptr a
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_no_rtn
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX12-NEXT: BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+ ; GFX12-NEXT: S_ENDPGM 0
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -275,6 +314,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_idxen_no_rtn(float %val, ptr a
; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX908_GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_no_rtn
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1
@@ -289,6 +329,21 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_idxen_no_rtn(float %val, ptr a
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_no_rtn
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX12-NEXT: BUFFER_ATOMIC_ADD_F32_VBUFFER_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+ ; GFX12-NEXT: S_ENDPGM 0
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret void
}
@@ -310,6 +365,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_bothen_no_rtn(float %val, ptr
; GFX908_GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX908_GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_no_rtn
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
@@ -326,6 +382,23 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_bothen_no_rtn(float %val, ptr
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_no_rtn
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX12-NEXT: BUFFER_ATOMIC_ADD_F32_VBUFFER_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+ ; GFX12-NEXT: S_ENDPGM 0
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-rtn.ll
index 42d4d21f3108..c0b84c914ce5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-rtn.ll
@@ -19,6 +19,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offset_rtn(float %val, <4 x i32>
; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
; GFX11-LABEL: name: buffer_atomic_fadd_f32_offset_rtn
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0
@@ -33,6 +34,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offset_rtn(float %val, <4 x i32>
; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
; GFX12-LABEL: name: buffer_atomic_fadd_f32_offset_rtn
; GFX12: bb.1 (%ir-block.0):
; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0
@@ -67,6 +69,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offen_rtn(float %val, <4 x i32> i
; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
; GFX11-LABEL: name: buffer_atomic_fadd_f32_offen_rtn
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1
@@ -82,6 +85,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offen_rtn(float %val, <4 x i32> i
; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
; GFX12-LABEL: name: buffer_atomic_fadd_f32_offen_rtn
; GFX12: bb.1 (%ir-block.0):
; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1
@@ -117,6 +121,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_idxen_rtn(float %val, <4 x i32> i
; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
; GFX11-LABEL: name: buffer_atomic_fadd_f32_idxen_rtn
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1
@@ -132,6 +137,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_idxen_rtn(float %val, <4 x i32> i
; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
; GFX12-LABEL: name: buffer_atomic_fadd_f32_idxen_rtn
; GFX12: bb.1 (%ir-block.0):
; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1
@@ -169,6 +175,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_bothen_rtn(float %val, <4 x i32>
; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
; GFX11-LABEL: name: buffer_atomic_fadd_f32_bothen_rtn
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
@@ -186,6 +193,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_bothen_rtn(float %val, <4 x i32>
; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
; GFX12-LABEL: name: buffer_atomic_fadd_f32_bothen_rtn
; GFX12: bb.1 (%ir-block.0):
; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
@@ -222,6 +230,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offset_rtn(float %val, ptr ad
; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_rtn
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0
@@ -236,6 +245,21 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offset_rtn(float %val, ptr ad
; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_rtn
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+ ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFSET_RTN]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
ret float %ret
}
@@ -256,6 +280,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offen_rtn(float %val, ptr add
; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_rtn
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1
@@ -271,6 +296,22 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offen_rtn(float %val, ptr add
; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_rtn
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+ ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFEN_RTN]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret float %ret
}
@@ -291,6 +332,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_idxen_rtn(float %val, ptr add
; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_rtn
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1
@@ -306,6 +348,22 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_idxen_rtn(float %val, ptr add
; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_rtn
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_F32_VBUFFER_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_VBUFFER_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+ ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_VBUFFER_IDXEN_RTN]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret float %ret
}
@@ -328,6 +386,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_bothen_rtn(float %val, ptr ad
; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_rtn
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
@@ -345,6 +404,24 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_bothen_rtn(float %val, ptr ad
; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_rtn
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_F32_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_VBUFFER_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+ ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_VBUFFER_BOTHEN_RTN]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %ret
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-no-rtn.ll
index 1ba27c72803d..9514bea86e4d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-no-rtn.ll
@@ -17,6 +17,7 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_offset_no_rtn(<2 x half> %val, <
; GFX908-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4
; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offset_no_rtn
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0
@@ -49,6 +50,7 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_offen_no_rtn(<2 x half> %val, <4
; GFX908-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offen_no_rtn
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1
@@ -82,6 +84,7 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_idxen_no_rtn(<2 x half> %val, <4
; GFX908-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_idxen_no_rtn
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1
@@ -117,6 +120,7 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_bothen_no_rtn(<2 x half> %val, <
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_bothen_no_rtn
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
@@ -151,6 +155,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_offset_no_rtn(<2 x half> %va
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_no_rtn
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0
@@ -183,6 +188,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_offen_no_rtn(<2 x half> %val
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offen_no_rtn
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1
@@ -216,6 +222,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_idxen_no_rtn(<2 x half> %val
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_no_rtn
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1
@@ -251,6 +258,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_bothen_no_rtn(<2 x half> %va
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_no_rtn
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
index b34c52425758..aa9ebb9226cd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
@@ -14,6 +14,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da
; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
; GFX940-NEXT: S_ENDPGM 0
+ ;
; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -40,6 +41,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data
; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
; GFX11-LABEL: name: flat_atomic_fadd_f32_rtn_intrinsic
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -66,6 +68,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(ptr %ptr, float %da
; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr)
; GFX940-NEXT: S_ENDPGM 0
+ ;
; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_atomicrmw
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -92,6 +95,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(ptr %ptr, float %data
; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr)
; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
; GFX11-LABEL: name: flat_atomic_fadd_f32_rtn_atomicrmw
; GFX11: bb.1 (%ir-block.0):
; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 921bdb5015c7..63e7339d829e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -256,6 +256,7 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_store_b32 v0, v2, s32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS
@@ -607,6 +608,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX12-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:256 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS
@@ -921,6 +923,7 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX12-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:16384 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS
@@ -1089,6 +1092,7 @@ define void @store_load_large_imm_offset_foo() {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_store_b32 off, v0, s32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS
@@ -1242,6 +1246,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 15
; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
@@ -1306,6 +1311,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 15
; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
@@ -1389,6 +1395,7 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_mov_b32 s0, 1
; GFX12-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1
; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
@@ -1478,6 +1485,7 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_mov_b32 s0, 1
; GFX12-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll
index 3dea9777cfda..c71beff4dc5c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll
@@ -14,6 +14,7 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_intrinsic(ptr addrspace(1
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_intrinsic
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -40,6 +41,7 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic(ptr addrs
; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
@@ -66,6 +68,7 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat_intrinsic(ptr addrsp
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat_intrinsic
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -92,6 +95,7 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic(ptr
; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic
; GFX90A_GFX940: bb.1 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
index 6b2e6d8dfdb3..f2fe815a7120 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
@@ -17,6 +17,7 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw
; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
; HSA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: i8_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -51,6 +52,7 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe
; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
; HSA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: i8_zext_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -85,6 +87,7 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe
; HSA-VI-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s8)
; HSA-VI-NEXT: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: i8_sext_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -119,6 +122,7 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou
; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16)
; HSA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: i16_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -153,6 +157,7 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer
; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16)
; HSA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: i16_zext_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -187,6 +192,7 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig
; HSA-VI-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s16)
; HSA-VI-NEXT: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: i16_sext_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -220,6 +226,7 @@ define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nou
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32), align 8, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: i32_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -252,6 +259,7 @@ define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) n
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32), align 8, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: f32_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -284,6 +292,7 @@ define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) {
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s8>), align 8, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<2 x s8>), [[LOAD]](p1) :: (store (<2 x s8>) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: v2i8_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -316,6 +325,7 @@ define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) {
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s16>), align 8, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<2 x s16>), [[LOAD]](p1) :: (store (<2 x s16>) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: v2i16_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -348,6 +358,7 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32>
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s32>), addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<2 x s32>), [[LOAD]](p1) :: (store (<2 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: v2i32_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -380,6 +391,7 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s32>), addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<2 x s32>), [[LOAD]](p1) :: (store (<2 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: v2f32_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -412,6 +424,7 @@ define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %i
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s8>), align 8, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<3 x s8>), [[LOAD]](p1) :: (store (<3 x s8>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: v3i8_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -444,6 +457,7 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16>
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s16>), align 8, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<3 x s16>), [[LOAD]](p1) :: (store (<3 x s16>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: v3i16_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -476,6 +490,7 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32>
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s32>), align 16, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<3 x s32>), [[LOAD]](p1) :: (store (<3 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: v3i32_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -508,6 +523,7 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s32>), align 16, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<3 x s32>), [[LOAD]](p1) :: (store (<3 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: v3f32_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -540,6 +556,7 @@ define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) {
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s8>), align 8, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<4 x s8>), [[LOAD]](p1) :: (store (<4 x s8>) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: v4i8_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -572,6 +589,7 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) {
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s16>), addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<4 x s16>), [[LOAD]](p1) :: (store (<4 x s16>) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: v4i16_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -604,6 +622,7 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32>
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s32>), addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<4 x s32>), [[LOAD]](p1) :: (store (<4 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: v4i32_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -636,6 +655,7 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s32>), addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<4 x s32>), [[LOAD]](p1) :: (store (<4 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: v4f32_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -668,6 +688,7 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s8>), addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<8 x s8>), [[LOAD]](p1) :: (store (<8 x s8>) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: v8i8_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -700,6 +721,7 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) {
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s16>), addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<8 x s16>), [[LOAD]](p1) :: (store (<8 x s16>) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: v8i16_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -732,6 +754,7 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s32>), align 16, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<8 x s32>), [[LOAD]](p1) :: (store (<8 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: v8i32_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -764,6 +787,7 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s32>), align 16, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<8 x s32>), [[LOAD]](p1) :: (store (<8 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: v8f32_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -796,6 +820,7 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s8>), addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<16 x s8>), [[LOAD]](p1) :: (store (<16 x s8>) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: v16i8_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -828,6 +853,7 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) {
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s16>), align 16, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<16 x s16>), [[LOAD]](p1) :: (store (<16 x s16>) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: v16i16_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -860,6 +886,7 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s32>), align 16, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<16 x s32>), [[LOAD]](p1) :: (store (<16 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: v16i32_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -892,6 +919,7 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s32>), align 16, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<16 x s32>), [[LOAD]](p1) :: (store (<16 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: v16f32_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -924,6 +952,7 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64), addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s64), [[LOAD]](p1) :: (store (s64) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: kernel_arg_i64
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -955,6 +984,7 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) {
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64), addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s64), [[LOAD]](p1) :: (store (s64) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: f64_kernel_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -987,6 +1017,7 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind {
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s1), align 8, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s1), [[LOAD]](p1) :: (store (s1) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: i1_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1019,6 +1050,7 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin
; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s1)
; HSA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: i1_arg_zext_i32
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1053,6 +1085,7 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin
; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD1]](s1)
; HSA-VI-NEXT: G_STORE [[ZEXT]](s64), [[LOAD]](p1) :: (store (s64) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: i1_arg_zext_i64
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1087,6 +1120,7 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin
; HSA-VI-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s1)
; HSA-VI-NEXT: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: i1_arg_sext_i32
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1121,6 +1155,7 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin
; HSA-VI-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[LOAD1]](s1)
; HSA-VI-NEXT: G_STORE [[SEXT]](s64), [[LOAD]](p1) :: (store (s64) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: i1_arg_sext_i64
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1154,6 +1189,7 @@ define amdgpu_kernel void @empty_struct_arg({} %arg0, i32 %arg1) nounwind {
; HSA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; HSA-VI-NEXT: G_STORE [[LOAD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: empty_struct_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1181,6 +1217,7 @@ define amdgpu_kernel void @empty_array_arg([0 x i8] %arg0, i32 %arg1) nounwind {
; HSA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; HSA-VI-NEXT: G_STORE [[LOAD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: empty_array_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1232,6 +1269,7 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8 %pad,
; HSA-VI-NEXT: G_STORE [[LOAD3]](s32), [[C5]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, addrspace 1)
; HSA-VI-NEXT: G_STORE [[LOAD4]](s64), [[C5]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: struct_argument_alignment
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1299,6 +1337,7 @@ define amdgpu_kernel void @pointer_in_struct_argument({ptr addrspace(3), ptr add
; HSA-VI-NEXT: G_STORE [[LOAD3]](p3), [[C5]](p1) :: (volatile store (p3) into `ptr addrspace(1) null`, addrspace 1)
; HSA-VI-NEXT: G_STORE [[LOAD4]](p1234), [[C5]](p1) :: (volatile store (p1234) into `ptr addrspace(1) null`, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: pointer_in_struct_argument
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1364,6 +1403,7 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; HSA-VI-NEXT: G_STORE [[LOAD2]](s32), [[C4]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, addrspace 1)
; HSA-VI-NEXT: G_STORE [[LOAD3]](s64), [[C4]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: packed_struct_argument_alignment
; LEGACY-MESA-VI: bb.1 (%ir-block.1):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1405,6 +1445,7 @@ define amdgpu_kernel void @unused_i32_arg(ptr addrspace(1) nocapture %out, i32 %
; HSA-VI-NEXT: {{ $}}
; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: unused_i32_arg
; LEGACY-MESA-VI: bb.1.entry:
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1431,6 +1472,7 @@ define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out
; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
; HSA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: byref_constant_i8_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1466,6 +1508,7 @@ define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %ou
; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16)
; HSA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: byref_constant_i16_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1504,6 +1547,7 @@ define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %ou
; HSA-VI-NEXT: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: byref_constant_i32_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1545,6 +1589,7 @@ define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture %
; HSA-VI-NEXT: G_STORE [[LOAD2]](<4 x s32>), [[LOAD]](p1) :: (volatile store (<4 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: byref_constant_v4i32_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1586,6 +1631,7 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu
; HSA-VI-NEXT: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: byref_align_constant_i32_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1627,6 +1673,7 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace
; HSA-VI-NEXT: G_STORE [[LOAD2]](<16 x s32>), [[LOAD]](p1) :: (volatile store (<16 x s32>) into %ir.out, align 4, addrspace 1)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: byref_natural_align_constant_v16i32_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.1):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1666,6 +1713,7 @@ define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out,
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p1) :: (dereferenceable "amdgpu-noclobber" load (s32) from %ir.in.byref, addrspace 1)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: byref_global_i32_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1700,6 +1748,7 @@ define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, p
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p0) :: (dereferenceable load (s32) from %ir.in.byref)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: byref_flat_i32_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1734,6 +1783,7 @@ define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocaptu
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p6) :: (dereferenceable invariant load (s32) from %ir.in.byref, addrspace 6)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: byref_constant_32bit_i32_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1768,6 +1818,7 @@ define amdgpu_kernel void @byref_unknown_as_i32_arg(ptr addrspace(1) nocapture %
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p999) :: (dereferenceable load (s32) from %ir.in.byref, addrspace 999)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: byref_unknown_as_i32_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1803,6 +1854,7 @@ define amdgpu_kernel void @byref_local_i32_arg(ptr addrspace(1) nocapture %out,
; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p3) :: (dereferenceable load (s32) from %ir.in.byref, addrspace 3)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: byref_local_i32_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1844,6 +1896,7 @@ define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocaptu
; HSA-VI-NEXT: G_STORE [[LOAD3]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: multi_byref_constant_i32_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1885,6 +1938,7 @@ define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref
; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32) from %ir.in.byref, addrspace 4)
; HSA-VI-NEXT: G_STORE [[LOAD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: byref_constant_i32_arg_offset0
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1913,6 +1967,7 @@ define amdgpu_kernel void @p3i8_arg(ptr addrspace(3) %arg) nounwind {
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 9
; HSA-VI-NEXT: G_STORE [[C1]](s8), [[LOAD]](p3) :: (store (s8) into %ir.arg, align 4, addrspace 3)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: p3i8_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1938,6 +1993,7 @@ define amdgpu_kernel void @p1i8_arg(ptr addrspace(1) %arg) nounwind {
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 0
; HSA-VI-NEXT: G_STORE [[C]](s8), [[C1]](p3) :: (store (s8) into `ptr addrspace(3) null`, addrspace 3)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: p1i8_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1963,6 +2019,7 @@ define amdgpu_kernel void @v2p1i8_arg(<2 x ptr addrspace(1)> %arg) nounwind {
; HSA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; HSA-VI-NEXT: G_STORE [[LOAD]](<2 x p1>), [[DEF]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef`, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: v2p1i8_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -1990,6 +2047,7 @@ define amdgpu_kernel void @v2p3i8_arg(<2 x ptr addrspace(3)> %arg) nounwind {
; HSA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; HSA-VI-NEXT: G_STORE [[LOAD]](<2 x p3>), [[DEF]](p1) :: (store (<2 x p3>) into `ptr addrspace(1) undef`, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: v2p3i8_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
@@ -2023,6 +2081,7 @@ define amdgpu_kernel void @v2p1i8_in_struct_arg({ <2 x ptr addrspace(1)>, <2 x p
; HSA-VI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
; HSA-VI-NEXT: G_STORE [[LOAD1]](<2 x p3>), [[PTR_ADD2]](p1) :: (store (<2 x p3>) into `ptr addrspace(1) undef` + 16, align 16, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
+ ;
; LEGACY-MESA-VI-LABEL: name: v2p1i8_in_struct_arg
; LEGACY-MESA-VI: bb.1 (%ir-block.0):
; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll
index 27eb0b7682ec..3a31ab4ab9d0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll
@@ -36,8 +36,8 @@ define float @v_constained_fadd_f32_fpexcept_ignore(float %x, float %y) #0 {
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; CHECK-NEXT: %2:_(s32) = nofpexcept G_STRICT_FADD [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $vgpr0 = COPY %2(s32)
+ ; CHECK-NEXT: [[STRICT_FADD:%[0-9]+]]:_(s32) = nofpexcept G_STRICT_FADD [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[STRICT_FADD]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
%val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
ret float %val
@@ -50,8 +50,8 @@ define float @v_constained_fadd_f32_fpexcept_ignore_flags(float %x, float %y) #0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; CHECK-NEXT: %2:_(s32) = nsz nofpexcept G_STRICT_FADD [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $vgpr0 = COPY %2(s32)
+ ; CHECK-NEXT: [[STRICT_FADD:%[0-9]+]]:_(s32) = nsz nofpexcept G_STRICT_FADD [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[STRICT_FADD]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
%val = call nsz float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
ret float %val
@@ -102,8 +102,8 @@ define <2 x float> @v_constained_fadd_v2f32_fpexcept_ignore(<2 x float> %x, <2 x
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
- ; CHECK-NEXT: %6:_(<2 x s32>) = nofpexcept G_STRICT_FADD [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES %6(<2 x s32>)
+ ; CHECK-NEXT: [[STRICT_FADD:%[0-9]+]]:_(<2 x s32>) = nofpexcept G_STRICT_FADD [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[STRICT_FADD]](<2 x s32>)
; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
@@ -138,8 +138,8 @@ define float @v_constained_fsub_f32_fpexcept_ignore_flags(float %x, float %y) #0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; CHECK-NEXT: %2:_(s32) = nsz nofpexcept G_STRICT_FSUB [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $vgpr0 = COPY %2(s32)
+ ; CHECK-NEXT: [[STRICT_FSUB:%[0-9]+]]:_(s32) = nsz nofpexcept G_STRICT_FSUB [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[STRICT_FSUB]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
%val = call nsz float @llvm.experimental.constrained.fsub.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
ret float %val
@@ -152,8 +152,8 @@ define float @v_constained_fmul_f32_fpexcept_ignore_flags(float %x, float %y) #0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; CHECK-NEXT: %2:_(s32) = nsz nofpexcept G_STRICT_FMUL [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $vgpr0 = COPY %2(s32)
+ ; CHECK-NEXT: [[STRICT_FMUL:%[0-9]+]]:_(s32) = nsz nofpexcept G_STRICT_FMUL [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[STRICT_FMUL]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
%val = call nsz float @llvm.experimental.constrained.fmul.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
ret float %val
@@ -166,8 +166,8 @@ define float @v_constained_fdiv_f32_fpexcept_ignore_flags(float %x, float %y) #0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; CHECK-NEXT: %2:_(s32) = nsz nofpexcept G_STRICT_FDIV [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $vgpr0 = COPY %2(s32)
+ ; CHECK-NEXT: [[STRICT_FDIV:%[0-9]+]]:_(s32) = nsz nofpexcept G_STRICT_FDIV [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[STRICT_FDIV]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
%val = call nsz float @llvm.experimental.constrained.fdiv.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
ret float %val
@@ -180,8 +180,8 @@ define float @v_constained_frem_f32_fpexcept_ignore_flags(float %x, float %y) #0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; CHECK-NEXT: %2:_(s32) = nsz nofpexcept G_STRICT_FREM [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $vgpr0 = COPY %2(s32)
+ ; CHECK-NEXT: [[STRICT_FREM:%[0-9]+]]:_(s32) = nsz nofpexcept G_STRICT_FREM [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[STRICT_FREM]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
%val = call nsz float @llvm.experimental.constrained.frem.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
ret float %val
@@ -195,8 +195,8 @@ define float @v_constained_fma_f32_fpexcept_ignore_flags(float %x, float %y, flo
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; CHECK-NEXT: %3:_(s32) = nsz nofpexcept G_STRICT_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; CHECK-NEXT: $vgpr0 = COPY %3(s32)
+ ; CHECK-NEXT: [[STRICT_FMA:%[0-9]+]]:_(s32) = nsz nofpexcept G_STRICT_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[STRICT_FMA]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
%val = call nsz float @llvm.experimental.constrained.fma.f32(float %x, float %y, float %z, metadata !"round.tonearest", metadata !"fpexcept.ignore")
ret float %val
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll
index 75337b76994e..101bb6c0ed12 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll
@@ -5,12 +5,12 @@
define i32 @reloc_constant() {
; CHECK-LABEL: name: reloc_constant
; CHECK: bb.1 (%ir-block.0):
- ; CHECK: [[INT0:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.reloc.constant), !0
+ ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.reloc.constant), !0
; We cannot have any specific metadata check here as ConstantAsMetadata is printed as <raw_ptr_val>
- ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.reloc.constant), <0x{{[0-9a-f]+}}>
- ; CHECK: [[SUM:%[0-9]+]]:_(s32) = G_ADD [[INT0]], [[INT1]]
- ; CHECK: $vgpr0 = COPY [[SUM]](s32)
- ; CHECK: SI_RETURN implicit $vgpr0
+ ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.reloc.constant), <0x{{[0-9a-f]+}}>
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[INT]], [[INT1]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[ADD]](s32)
+ ; CHECK-NEXT: SI_RETURN implicit $vgpr0
%val0 = call i32 @llvm.amdgcn.reloc.constant(metadata !0)
%val1 = call i32 @llvm.amdgcn.reloc.constant(metadata i32 4)
%res = add i32 %val0, %val1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll
index 12aa8de2baf4..4d36e0f79701 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll
@@ -30,6 +30,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i3
; GFX6-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX6-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX10NSA-LABEL: name: load_2darraymsaa
; GFX10NSA: bb.1 (%ir-block.0):
; GFX10NSA-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -88,6 +89,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad
; GFX6-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX6-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX10NSA-LABEL: name: load_2darraymsaa_tfe
; GFX10NSA: bb.1 (%ir-block.0):
; GFX10NSA-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.3d.ll
index f15307563f7b..2c155b72c649 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.3d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.3d.ll
@@ -25,6 +25,7 @@ define amdgpu_ps float @image_load_3d_f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
; GFX6-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 1, [[BUILD_VECTOR1]](<3 x s32>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (s32), addrspace 8)
; GFX6-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
; GFX10NSA-LABEL: name: image_load_3d_f32
; GFX10NSA: bb.1 (%ir-block.0):
; GFX10NSA-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2
@@ -72,6 +73,7 @@ define amdgpu_ps float @image_load_3d_tfe_f32(<8 x i32> inreg %rsrc, i32 %s, i32
; GFX6-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
; GFX6-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
; GFX10NSA-LABEL: name: image_load_3d_tfe_f32
; GFX10NSA: bb.1 (%ir-block.0):
; GFX10NSA-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.d.ll
index 0d7d3abd918c..241170b94318 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.d.ll
@@ -39,6 +39,7 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_d_3d
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
@@ -74,6 +75,7 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_d_3d
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
@@ -151,6 +153,7 @@ define amdgpu_ps <4 x float> @sample_c_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_c_d_3d
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
@@ -187,6 +190,7 @@ define amdgpu_ps <4 x float> @sample_c_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_c_d_3d
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
@@ -266,6 +270,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_3d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_c_d_cl_3d
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
@@ -303,6 +308,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_3d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_c_d_cl_3d
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
@@ -384,6 +390,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_o_3d(<8 x i32> inreg %rsrc, <4 x i32
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_c_d_cl_o_3d
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
@@ -422,6 +429,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_o_3d(<8 x i32> inreg %rsrc, <4 x i32
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_c_d_cl_o_3d
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.a16.ll
index 288c46f5f0f2..f05b258c974d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.a16.ll
@@ -39,6 +39,7 @@ define amdgpu_ps <4 x float> @sample_d_1d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_d_1d_g16_a16
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
@@ -74,6 +75,7 @@ define amdgpu_ps <4 x float> @sample_d_1d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_d_1d_g16_a16
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
@@ -155,6 +157,7 @@ define amdgpu_ps <4 x float> @sample_d_2d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_d_2d_g16_a16
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
@@ -195,6 +198,7 @@ define amdgpu_ps <4 x float> @sample_d_2d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_d_2d_g16_a16
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
@@ -292,6 +296,7 @@ define amdgpu_ps <4 x float> @sample_d_3d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_d_3d_g16_a16
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
@@ -343,6 +348,7 @@ define amdgpu_ps <4 x float> @sample_d_3d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_d_3d_g16_a16
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
index b36b35937cf8..cc2a8ba9c4d5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
@@ -38,6 +38,7 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_d_1d
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
@@ -72,6 +73,7 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_d_1d
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
@@ -151,6 +153,7 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_d_2d
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
@@ -190,6 +193,7 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_d_2d
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
@@ -284,6 +288,7 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_d_3d
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
@@ -333,6 +338,7 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_d_3d
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
@@ -424,6 +430,7 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_c_d_1d
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -460,6 +467,7 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_c_d_1d
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -543,6 +551,7 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_c_d_2d
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
@@ -584,6 +593,7 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_c_d_2d
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
@@ -668,6 +678,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_d_cl_1d
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -704,6 +715,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_d_cl_1d
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -787,6 +799,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_d_cl_2d
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
@@ -828,6 +841,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_d_cl_2d
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
@@ -914,6 +928,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_c_d_cl_1d
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
@@ -952,6 +967,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_c_d_cl_1d
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
@@ -1041,6 +1057,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_c_d_cl_2d
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
@@ -1085,6 +1102,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_c_d_cl_2d
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
@@ -1169,6 +1187,7 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_cd_1d
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
@@ -1203,6 +1222,7 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_cd_1d
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2
@@ -1282,6 +1302,7 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_cd_2d
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
@@ -1321,6 +1342,7 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_cd_2d
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
@@ -1402,6 +1424,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_c_cd_1d
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -1438,6 +1461,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_c_cd_1d
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -1521,6 +1545,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_c_cd_2d
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
@@ -1562,6 +1587,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_c_cd_2d
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
@@ -1646,6 +1672,7 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_cd_cl_1d
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -1682,6 +1709,7 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_cd_cl_1d
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3
@@ -1765,6 +1793,7 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_cd_cl_2d
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
@@ -1806,6 +1835,7 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_cd_cl_2d
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
@@ -1892,6 +1922,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_c_cd_cl_1d
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
@@ -1930,6 +1961,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_c_cd_cl_1d
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
@@ -2019,6 +2051,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX11-LABEL: name: sample_c_cd_cl_2d
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
@@ -2063,6 +2096,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX12-LABEL: name: sample_c_cd_cl_2d
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
@@ -2155,6 +2189,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 4, [[CONCAT_VECTORS]](<14 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (s32), addrspace 8)
; GFX10-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
; GFX11-LABEL: name: sample_c_d_o_2darray_V1
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
@@ -2197,6 +2232,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 4, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (s32), addrspace 8)
; GFX11-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
; GFX12-LABEL: name: sample_c_d_o_2darray_V1
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
@@ -2289,6 +2325,7 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ;
; GFX11-LABEL: name: sample_c_d_o_2darray_V2
; GFX11: bb.1.main_body:
; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
@@ -2333,6 +2370,7 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ;
; GFX12-LABEL: name: sample_c_d_o_2darray_V2
; GFX12: bb.1.main_body:
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.id.ll
deleted file mode 100644
index fc22d5ec66f0..000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.id.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-
-declare i64 @llvm.amdgcn.dispatch.id() #1
-
-; GCN-LABEL: {{^}}dispatch_id:
-; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s6
-; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s7
-; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]]
-; GCN: .amdhsa_user_sgpr_dispatch_id 1
-define amdgpu_kernel void @dispatch_id(ptr addrspace(1) %out) #0 {
- %tmp0 = call i64 @llvm.amdgcn.dispatch.id()
- store i64 %tmp0, ptr addrspace(1) %out
- ret void
-}
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
-
-!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll
index 7bde96d0be9b..c2799e5836a9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll
@@ -367,33 +367,61 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_
; Natural mapping
define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, i64 %cmp, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
- ; CHECK-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
- ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN]].sub0_sub1
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec
- ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
- ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX8-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX8: bb.1 (%ir-block.0):
+ ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX8-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX8-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN]].sub0_sub1
+ ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0
+ ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec
+ ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i64 %ret to double
ret double %cast
@@ -401,97 +429,184 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr
; Natural mapping
define amdgpu_ps void @raw_buffer_atomic_cmpswap_i64_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, i64 %cmp, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
- ; CHECK-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
- ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_OFFEN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; CHECK-NEXT: S_ENDPGM 0
+ ; GFX8-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX8: bb.1 (%ir-block.0):
+ ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX8-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_OFFEN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX8-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX12-NEXT: S_ENDPGM 0
%ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
; All operands need regbank legalization
define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i64 inreg %val, i64 inreg %cmp, <4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) {
- ; CHECK-LABEL: name: raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: successors: %bb.2(0x80000000)
- ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
- ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: successors: %bb.3(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
- ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
- ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
- ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
- ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
- ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
- ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
- ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
- ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec
- ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.3:
- ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3
- ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; CHECK-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN]].sub0_sub1
- ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
- ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.4:
- ; CHECK-NEXT: successors: %bb.5(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.5:
- ; CHECK-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub0
- ; CHECK-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub1
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec
- ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]]
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec
- ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]]
- ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX8-LABEL: name: raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX8: bb.1 (%ir-block.0):
+ ; GFX8-NEXT: successors: %bb.2(0x80000000)
+ ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+ ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: bb.2:
+ ; GFX8-NEXT: successors: %bb.3(0x80000000)
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX8-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX8-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX8-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
+ ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec
+ ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: bb.3:
+ ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3
+ ; GFX8-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX8-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN]].sub0_sub1
+ ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+ ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: bb.4:
+ ; GFX8-NEXT: successors: %bb.5(0x80000000)
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: bb.5:
+ ; GFX8-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub0
+ ; GFX8-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub1
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec
+ ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]]
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec
+ ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]]
+ ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: successors: %bb.3(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec
+ ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3
+ ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: successors: %bb.5(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub0
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub1
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec
+ ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i64 %ret to double
ret double %cast
@@ -499,96 +614,183 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr
; All operands need regbank legalization
define amdgpu_ps void @raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i64 inreg %val, i64 inreg %cmp, <4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) {
- ; CHECK-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: successors: %bb.2(0x80000000)
- ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
- ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: successors: %bb.3(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
- ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
- ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
- ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
- ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
- ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
- ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
- ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
- ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec
- ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.3:
- ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3
- ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_OFFEN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
- ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.4:
- ; CHECK-NEXT: successors: %bb.5(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.5:
- ; CHECK-NEXT: S_ENDPGM 0
+ ; GFX8-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX8: bb.1 (%ir-block.0):
+ ; GFX8-NEXT: successors: %bb.2(0x80000000)
+ ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+ ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: bb.2:
+ ; GFX8-NEXT: successors: %bb.3(0x80000000)
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX8-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX8-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX8-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
+ ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec
+ ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: bb.3:
+ ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3
+ ; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_OFFEN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+ ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: bb.4:
+ ; GFX8-NEXT: successors: %bb.5(0x80000000)
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: bb.5:
+ ; GFX8-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: successors: %bb.3(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec
+ ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3
+ ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: successors: %bb.5(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: S_ENDPGM 0
%ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095(i64 %val, i64 %cmp, <4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) {
- ; CHECK-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095
- ; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
- ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN]].sub0_sub1
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec
- ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
- ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX8-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095
+ ; GFX8: bb.1 (%ir-block.0):
+ ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX8-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX8-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN]].sub0_sub1
+ ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0
+ ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec
+ ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%voffset = add i32 %voffset.base, 4095
%ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i64 %ret to double
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll
index 70d22df868be..f9f70ecadfe6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll
@@ -390,35 +390,65 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sg
; Natural mapping
define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, i64 %cmp, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
- ; CHECK-LABEL: name: struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
- ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
- ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN]].sub0_sub1
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0
- ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
- ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
- ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX8-LABEL: name: struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX8: bb.1 (%ir-block.0):
+ ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX8-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX8-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN]].sub0_sub1
+ ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0
+ ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
+ ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
+ ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i64 %ret to double
ret double %cast
@@ -426,102 +456,194 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__s
; Natural mapping
define amdgpu_ps void @struct_buffer_atomic_cmpswap_noret_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, i64 %cmp, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
- ; CHECK-LABEL: name: struct_buffer_atomic_cmpswap_noret_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
- ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
- ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; CHECK-NEXT: S_ENDPGM 0
+ ; GFX8-LABEL: name: struct_buffer_atomic_cmpswap_noret_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX8: bb.1 (%ir-block.0):
+ ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX8-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX8-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_noret_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX12-NEXT: S_ENDPGM 0
%ret = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
; All operands need legalization
define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i64 inreg %val, i64 inreg %cmp, <4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) {
- ; CHECK-LABEL: name: struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: successors: %bb.2(0x80000000)
- ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
- ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
- ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: successors: %bb.3(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
- ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
- ; CHECK-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
- ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
- ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
- ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
- ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec
- ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
- ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
- ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.3:
- ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1
- ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3
- ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; CHECK-NEXT: [[COPY19:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN]].sub0_sub1
- ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
- ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.4:
- ; CHECK-NEXT: successors: %bb.5(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.5:
- ; CHECK-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub0
- ; CHECK-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub1
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec
- ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]]
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec
- ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]]
- ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX8-LABEL: name: struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX8: bb.1 (%ir-block.0):
+ ; GFX8-NEXT: successors: %bb.2(0x80000000)
+ ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX8-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
+ ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+ ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX8-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: bb.2:
+ ; GFX8-NEXT: successors: %bb.3(0x80000000)
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX8-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX8-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX8-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX8-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
+ ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec
+ ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
+ ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
+ ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: bb.3:
+ ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1
+ ; GFX8-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3
+ ; GFX8-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX8-NEXT: [[COPY19:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN]].sub0_sub1
+ ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+ ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: bb.4:
+ ; GFX8-NEXT: successors: %bb.5(0x80000000)
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: bb.5:
+ ; GFX8-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub0
+ ; GFX8-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub1
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec
+ ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]]
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec
+ ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]]
+ ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: successors: %bb.3(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec
+ ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
+ ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
+ ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1
+ ; GFX12-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3
+ ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: successors: %bb.5(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub0
+ ; GFX12-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub1
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec
+ ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i64 %ret to double
ret double %cast
@@ -529,101 +651,193 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__v
; All operands need legalization
define amdgpu_ps void @struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i64 inreg %val, i64 inreg %cmp, <4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) {
- ; CHECK-LABEL: name: struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: successors: %bb.2(0x80000000)
- ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
- ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
- ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: successors: %bb.3(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
- ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
- ; CHECK-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
- ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
- ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
- ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
- ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec
- ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
- ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
- ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.3:
- ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1
- ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3
- ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
- ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.4:
- ; CHECK-NEXT: successors: %bb.5(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.5:
- ; CHECK-NEXT: S_ENDPGM 0
+ ; GFX8-LABEL: name: struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX8: bb.1 (%ir-block.0):
+ ; GFX8-NEXT: successors: %bb.2(0x80000000)
+ ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX8-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
+ ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+ ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX8-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: bb.2:
+ ; GFX8-NEXT: successors: %bb.3(0x80000000)
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX8-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX8-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX8-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX8-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
+ ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec
+ ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
+ ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
+ ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: bb.3:
+ ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1
+ ; GFX8-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3
+ ; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+ ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: bb.4:
+ ; GFX8-NEXT: successors: %bb.5(0x80000000)
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: bb.5:
+ ; GFX8-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: successors: %bb.2(0x80000000)
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.2:
+ ; GFX12-NEXT: successors: %bb.3(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX12-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX12-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
+ ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec
+ ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
+ ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
+ ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.3:
+ ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1
+ ; GFX12-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3
+ ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.4:
+ ; GFX12-NEXT: successors: %bb.5(0x80000000)
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: bb.5:
+ ; GFX12-NEXT: S_ENDPGM 0
%ret = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095(i64 %val, i64 %cmp, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset.base, i32 inreg %soffset) {
- ; CHECK-LABEL: name: struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
- ; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
- ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
- ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN]].sub0_sub1
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0
- ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
- ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
- ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX8-LABEL: name: struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
+ ; GFX8: bb.1 (%ir-block.0):
+ ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX8-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX8-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN]].sub0_sub1
+ ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0
+ ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
+ ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
+ ; GFX12: bb.1 (%ir-block.0):
+ ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
+ ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%voffset = add i32 %voffset.base, 4095
%ret = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i64 %ret to double
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll
index 8c4ce1caa8d8..61263e0efa2e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll
@@ -23,6 +23,7 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32
; FAST-NEXT: [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY9]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1)
; FAST-NEXT: S_ENDPGM 0
+ ;
; GREEDY-LABEL: name: load_1d_vgpr_vaddr__sgpr_srsrc
; GREEDY: bb.1 (%ir-block.0):
; GREEDY-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0
@@ -69,6 +70,7 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32
; FAST-NEXT: [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY10]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1)
; FAST-NEXT: S_ENDPGM 0
+ ;
; GREEDY-LABEL: name: load_1d_sgpr_vaddr__sgpr_srsrc
; GREEDY: bb.1 (%ir-block.0):
; GREEDY-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10
@@ -157,6 +159,7 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) {
; FAST-NEXT: [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY9]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1)
; FAST-NEXT: S_ENDPGM 0
+ ;
; GREEDY-LABEL: name: load_1d_vgpr_vaddr__vgpr_srsrc
; GREEDY: bb.1 (%ir-block.0):
; GREEDY-NEXT: successors: %bb.2(0x80000000)
@@ -287,6 +290,7 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg
; FAST-NEXT: [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY10]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1)
; FAST-NEXT: S_ENDPGM 0
+ ;
; GREEDY-LABEL: name: load_1d_sgpr_vaddr__vgpr_srsrc
; GREEDY: bb.1 (%ir-block.0):
; GREEDY-NEXT: successors: %bb.2(0x80000000)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll
index 9833cd1318e5..d6a7ae8d867f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll
@@ -27,6 +27,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inre
; FAST-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1)
; FAST-NEXT: S_ENDPGM 0
+ ;
; GREEDY-LABEL: name: sample_1d_vgpr_vaddr__sgpr_rsrc__sgpr_samp
; GREEDY: bb.1 (%ir-block.0):
; GREEDY-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0
@@ -81,6 +82,7 @@ define amdgpu_ps void @sample_1d_sgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inre
; FAST-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8)
; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1)
; FAST-NEXT: S_ENDPGM 0
+ ;
; GREEDY-LABEL: name: sample_1d_sgpr_vaddr__sgpr_rsrc__sgpr_samp
; GREEDY: bb.1 (%ir-block.0):
; GREEDY-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14
@@ -177,6 +179,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr
; FAST-NEXT: bb.5:
; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1)
; FAST-NEXT: S_ENDPGM 0
+ ;
; GREEDY-LABEL: name: sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp
; GREEDY: bb.1 (%ir-block.0):
; GREEDY-NEXT: successors: %bb.2(0x80000000)
@@ -306,6 +309,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre
; FAST-NEXT: bb.5:
; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1)
; FAST-NEXT: S_ENDPGM 0
+ ;
; GREEDY-LABEL: name: sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp
; GREEDY: bb.1 (%ir-block.0):
; GREEDY-NEXT: successors: %bb.2(0x80000000)
@@ -447,6 +451,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr
; FAST-NEXT: bb.5:
; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1)
; FAST-NEXT: S_ENDPGM 0
+ ;
; GREEDY-LABEL: name: sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp
; GREEDY: bb.1 (%ir-block.0):
; GREEDY-NEXT: successors: %bb.2(0x80000000)
diff --git a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir
index 3176feaf67d8..63b8cb6ffcaa 100644
--- a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir
@@ -57,11 +57,13 @@ body: |
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0
+ ;
; GFX90A-LABEL: name: a_to_v
; GFX90A: liveins: $agpr0
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0
+ ;
; GFX940-LABEL: name: a_to_v
; GFX940: liveins: $agpr0
; GFX940-NEXT: {{ $}}
@@ -84,12 +86,14 @@ body: |
; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr0_agpr1
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1
+ ;
; GFX90A-LABEL: name: a2_to_v2
; GFX90A: liveins: $agpr0_agpr1
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr0_agpr1
; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1
+ ;
; GFX940-LABEL: name: a2_to_v2
; GFX940: liveins: $agpr0_agpr1
; GFX940-NEXT: {{ $}}
@@ -114,6 +118,7 @@ body: |
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2
; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2
+ ;
; GFX90A-LABEL: name: a3_to_v3
; GFX90A: liveins: $agpr0_agpr1_agpr2
; GFX90A-NEXT: {{ $}}
@@ -121,6 +126,7 @@ body: |
; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2
; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2
+ ;
; GFX940-LABEL: name: a3_to_v3
; GFX940: liveins: $agpr0_agpr1_agpr2
; GFX940-NEXT: {{ $}}
@@ -146,6 +152,7 @@ body: |
; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX908-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+ ;
; GFX90A-LABEL: name: a4_to_v4
; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: {{ $}}
@@ -154,6 +161,7 @@ body: |
; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3
+ ;
; GFX940-LABEL: name: a4_to_v4
; GFX940: liveins: $agpr0_agpr1_agpr2_agpr3
; GFX940-NEXT: {{ $}}
@@ -185,6 +193,7 @@ body: |
; GFX908-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX908-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ;
; GFX90A-LABEL: name: a8_to_v8
; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX90A-NEXT: {{ $}}
@@ -197,6 +206,7 @@ body: |
; GFX90A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX90A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ;
; GFX940-LABEL: name: a8_to_v8
; GFX940: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX940-NEXT: {{ $}}
@@ -239,6 +249,7 @@ body: |
; GFX908-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX908-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ;
; GFX90A-LABEL: name: a16_to_v16
; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX90A-NEXT: {{ $}}
@@ -259,6 +270,7 @@ body: |
; GFX90A-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX90A-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ;
; GFX940-LABEL: name: a16_to_v16
; GFX940: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX940-NEXT: {{ $}}
@@ -294,11 +306,13 @@ body: |
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0
+ ;
; GFX90A-LABEL: name: v_to_a
; GFX90A: liveins: $vgpr0
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0
+ ;
; GFX940-LABEL: name: v_to_a
; GFX940: liveins: $vgpr0
; GFX940-NEXT: {{ $}}
@@ -320,12 +334,14 @@ body: |
; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $vgpr0_vgpr1
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1
+ ;
; GFX90A-LABEL: name: v2_to_a2
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $vgpr0_vgpr1
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1
+ ;
; GFX940-LABEL: name: v2_to_a2
; GFX940: liveins: $vgpr0_vgpr1
; GFX940-NEXT: {{ $}}
@@ -349,6 +365,7 @@ body: |
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2
; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
+ ;
; GFX90A-LABEL: name: v3_to_a3
; GFX90A: liveins: $vgpr0_vgpr1_vgpr2
; GFX90A-NEXT: {{ $}}
@@ -356,6 +373,7 @@ body: |
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
+ ;
; GFX940-LABEL: name: v3_to_a3
; GFX940: liveins: $vgpr0_vgpr1_vgpr2
; GFX940-NEXT: {{ $}}
@@ -381,6 +399,7 @@ body: |
; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
+ ;
; GFX90A-LABEL: name: v4_to_a4
; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A-NEXT: {{ $}}
@@ -389,6 +408,7 @@ body: |
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
+ ;
; GFX940-LABEL: name: v4_to_a4
; GFX940: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX940-NEXT: {{ $}}
@@ -419,6 +439,7 @@ body: |
; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+ ;
; GFX90A-LABEL: name: v8_to_a8
; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX90A-NEXT: {{ $}}
@@ -431,6 +452,7 @@ body: |
; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+ ;
; GFX940-LABEL: name: v8_to_a8
; GFX940: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX940-NEXT: {{ $}}
@@ -473,6 +495,7 @@ body: |
; GFX908-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX908-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ;
; GFX90A-LABEL: name: v16_to_a16
; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX90A-NEXT: {{ $}}
@@ -493,6 +516,7 @@ body: |
; GFX90A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX90A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ;
; GFX940-LABEL: name: v16_to_a16
; GFX940: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX940-NEXT: {{ $}}
@@ -529,11 +553,13 @@ body: |
; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr0, implicit $exec
; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0
+ ;
; GFX90A-LABEL: name: s_to_a
; GFX90A: liveins: $sgpr0
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0
+ ;
; GFX940-LABEL: name: s_to_a
; GFX940: liveins: $sgpr0
; GFX940-NEXT: {{ $}}
@@ -557,12 +583,14 @@ body: |
; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1
+ ;
; GFX90A-LABEL: name: s2_to_a2
; GFX90A: liveins: $sgpr0_sgpr1
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $sgpr0_sgpr1
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1
+ ;
; GFX940-LABEL: name: s2_to_a2
; GFX940: liveins: $sgpr0_sgpr1
; GFX940-NEXT: {{ $}}
@@ -589,6 +617,7 @@ body: |
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2
; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
+ ;
; GFX90A-LABEL: name: s3_to_a3
; GFX90A: liveins: $sgpr0_sgpr1_sgpr2
; GFX90A-NEXT: {{ $}}
@@ -596,6 +625,7 @@ body: |
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
+ ;
; GFX940-LABEL: name: s3_to_a3
; GFX940: liveins: $sgpr0_sgpr1_sgpr2
; GFX940-NEXT: {{ $}}
@@ -625,6 +655,7 @@ body: |
; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3
; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
+ ;
; GFX90A-LABEL: name: s4_to_a4
; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
@@ -633,6 +664,7 @@ body: |
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
+ ;
; GFX940-LABEL: name: s4_to_a4
; GFX940: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
; GFX940-NEXT: {{ $}}
@@ -667,6 +699,7 @@ body: |
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+ ;
; GFX90A-LABEL: name: s6_to_a6
; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
; GFX90A-NEXT: {{ $}}
@@ -677,6 +710,7 @@ body: |
; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+ ;
; GFX940-LABEL: name: s6_to_a6
; GFX940: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5
; GFX940-NEXT: {{ $}}
@@ -717,6 +751,7 @@ body: |
; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+ ;
; GFX90A-LABEL: name: s8_to_a8
; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX90A-NEXT: {{ $}}
@@ -729,6 +764,7 @@ body: |
; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+ ;
; GFX940-LABEL: name: s8_to_a8
; GFX940: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX940-NEXT: {{ $}}
@@ -787,6 +823,7 @@ body: |
; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX908-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ;
; GFX90A-LABEL: name: s16_to_a16
; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX90A-NEXT: {{ $}}
@@ -807,6 +844,7 @@ body: |
; GFX90A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX90A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ;
; GFX940-LABEL: name: s16_to_a16
; GFX940: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
; GFX940-NEXT: {{ $}}
@@ -841,10 +879,12 @@ body: |
; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0
+ ;
; GFX90A-LABEL: name: a_to_a
; GFX90A: $agpr1 = IMPLICIT_DEF
; GFX90A-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0
+ ;
; GFX940-LABEL: name: a_to_a
; GFX940: $agpr1 = IMPLICIT_DEF
; GFX940-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec, implicit $exec
@@ -869,6 +909,7 @@ body: |
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3
+ ;
; GFX90A-LABEL: name: a2_to_a2_kill
; GFX90A: liveins: $agpr0_agpr1
; GFX90A-NEXT: {{ $}}
@@ -876,6 +917,7 @@ body: |
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1, implicit $exec
; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3
+ ;
; GFX940-LABEL: name: a2_to_a2_kill
; GFX940: liveins: $agpr0_agpr1
; GFX940-NEXT: {{ $}}
@@ -905,6 +947,7 @@ body: |
; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr3_agpr4
; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit killed $agpr1_agpr2
; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec
+ ;
; GFX90A-LABEL: name: a2_to_a2_implicit_defs
; GFX90A: liveins: $agpr0_agpr1
; GFX90A-NEXT: {{ $}}
@@ -914,6 +957,7 @@ body: |
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec, implicit-def $agpr1_agpr2
; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit-def $agpr3_agpr4, implicit $agpr1_agpr2
; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit killed $agpr1_agpr2, implicit $exec
+ ;
; GFX940-LABEL: name: a2_to_a2_implicit_defs
; GFX940: liveins: $agpr0_agpr1
; GFX940-NEXT: {{ $}}
@@ -946,6 +990,7 @@ body: |
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6
; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
+ ;
; GFX90A-LABEL: name: a3_to_a3_nonoverlap_kill
; GFX90A: liveins: $agpr4_agpr5_agpr6
; GFX90A-NEXT: {{ $}}
@@ -953,6 +998,7 @@ body: |
; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr4_agpr5_agpr6
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2
+ ;
; GFX940-LABEL: name: a3_to_a3_nonoverlap_kill
; GFX940: liveins: $agpr4_agpr5_agpr6
; GFX940-NEXT: {{ $}}
@@ -981,6 +1027,7 @@ body: |
; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1
+ ;
; GFX90A-LABEL: name: a3_to_a3_overlap_kill
; GFX90A: liveins: $agpr1_agpr2_agpr3
; GFX90A-NEXT: {{ $}}
@@ -989,6 +1036,7 @@ body: |
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr1_agpr2_agpr3
; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1
+ ;
; GFX940-LABEL: name: a3_to_a3_overlap_kill
; GFX940: liveins: $agpr1_agpr2_agpr3
; GFX940-NEXT: {{ $}}
@@ -1018,6 +1066,7 @@ body: |
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5
+ ;
; GFX90A-LABEL: name: a4_to_a4
; GFX90A: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF
; GFX90A-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3
@@ -1025,6 +1074,7 @@ body: |
; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5
+ ;
; GFX940-LABEL: name: a4_to_a4
; GFX940: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF
; GFX940-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3
@@ -1055,6 +1105,7 @@ body: |
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5
+ ;
; GFX90A-LABEL: name: a4_to_a4_overlap
; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: {{ $}}
@@ -1063,6 +1114,7 @@ body: |
; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5
+ ;
; GFX940-LABEL: name: a4_to_a4_overlap
; GFX940: liveins: $agpr0_agpr1_agpr2_agpr3
; GFX940-NEXT: {{ $}}
@@ -1099,6 +1151,7 @@ body: |
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX908-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ;
; GFX90A-LABEL: name: a8_to_a8
; GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF
; GFX90A-NEXT: $agpr15 = V_ACCVGPR_MOV_B32 $agpr7, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
@@ -1110,6 +1163,7 @@ body: |
; GFX90A-NEXT: $agpr9 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX90A-NEXT: $agpr8 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ;
; GFX940-LABEL: name: a8_to_a8
; GFX940: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF
; GFX940-NEXT: $agpr15 = V_ACCVGPR_MOV_B32 $agpr7, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
@@ -1167,6 +1221,7 @@ body: |
; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX908-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+ ;
; GFX90A-LABEL: name: a16_to_a16
; GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF
; GFX90A-NEXT: $agpr31 = V_ACCVGPR_MOV_B32 $agpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
@@ -1186,6 +1241,7 @@ body: |
; GFX90A-NEXT: $agpr17 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX90A-NEXT: $agpr16 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+ ;
; GFX940-LABEL: name: a16_to_a16
; GFX940: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF
; GFX940-NEXT: $agpr31 = V_ACCVGPR_MOV_B32 $agpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
@@ -1226,12 +1282,14 @@ body: |
; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec
; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0
+ ;
; GFX90A-LABEL: name: a_to_a_spill
; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $agpr1 = IMPLICIT_DEF
; GFX90A-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0
+ ;
; GFX940-LABEL: name: a_to_a_spill
; GFX940: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254
; GFX940-NEXT: {{ $}}
@@ -1263,6 +1321,7 @@ body: |
; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+ ;
; GFX90A-LABEL: name: copy_sgpr_to_agpr_tuple
; GFX90A: liveins: $agpr0, $sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
@@ -1272,6 +1331,7 @@ body: |
; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+ ;
; GFX940-LABEL: name: copy_sgpr_to_agpr_tuple
; GFX940: liveins: $agpr0, $sgpr2_sgpr3
; GFX940-NEXT: {{ $}}
@@ -1305,6 +1365,7 @@ body: |
; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3
; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7
+ ;
; GFX90A-LABEL: name: copy_sgpr_to_agpr_tuple_kill
; GFX90A: liveins: $agpr0, $sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
@@ -1314,6 +1375,7 @@ body: |
; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7
+ ;
; GFX940-LABEL: name: copy_sgpr_to_agpr_tuple_kill
; GFX940: liveins: $agpr0, $sgpr2_sgpr3
; GFX940-NEXT: {{ $}}
@@ -1348,6 +1410,7 @@ body: |
; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3
+ ;
; GFX90A-LABEL: name: copy_agpr_to_agpr_tuple
; GFX90A: liveins: $agpr0, $agpr2_agpr3
; GFX90A-NEXT: {{ $}}
@@ -1357,6 +1420,7 @@ body: |
; GFX90A-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3
+ ;
; GFX940-LABEL: name: copy_agpr_to_agpr_tuple
; GFX940: liveins: $agpr0, $agpr2_agpr3
; GFX940-NEXT: {{ $}}
@@ -1391,6 +1455,7 @@ body: |
; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3
; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7
+ ;
; GFX90A-LABEL: name: copy_agpr_to_agpr_tuple_kill
; GFX90A: liveins: $agpr0, $agpr2_agpr3
; GFX90A-NEXT: {{ $}}
@@ -1400,6 +1465,7 @@ body: |
; GFX90A-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7
+ ;
; GFX940-LABEL: name: copy_agpr_to_agpr_tuple_kill
; GFX940: liveins: $agpr0, $agpr2_agpr3
; GFX940-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/accvgpr-spill-scc-clobber.mir b/llvm/test/CodeGen/AMDGPU/accvgpr-spill-scc-clobber.mir
index 53540e4a0004..9794130d2b00 100644
--- a/llvm/test/CodeGen/AMDGPU/accvgpr-spill-scc-clobber.mir
+++ b/llvm/test/CodeGen/AMDGPU/accvgpr-spill-scc-clobber.mir
@@ -39,6 +39,7 @@ body: |
; GFX908-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX90A-LABEL: name: agpr32_restore_clobber_scc
; GFX90A: bb.0:
; GFX90A-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -509,6 +510,7 @@ body: |
; GFX90A-NEXT: $agpr33 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; GFX90A-NEXT: $agpr32 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX908-FLATSCR-LABEL: name: agpr32_restore_clobber_scc
; GFX908-FLATSCR: bb.0:
; GFX908-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -532,6 +534,7 @@ body: |
; GFX908-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX908-FLATSCR-NEXT: {{ $}}
; GFX908-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX90A-FLATSCR-LABEL: name: agpr32_restore_clobber_scc
; GFX90A-FLATSCR: bb.0:
; GFX90A-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -1056,6 +1059,7 @@ body: |
; GFX908-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX90A-LABEL: name: agpr64_restore_clobber_scc
; GFX90A: bb.0:
; GFX90A-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -1527,6 +1531,7 @@ body: |
; GFX90A-NEXT: $agpr33 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; GFX90A-NEXT: $agpr32 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX908-FLATSCR-LABEL: name: agpr64_restore_clobber_scc
; GFX908-FLATSCR: bb.0:
; GFX908-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -1552,6 +1557,7 @@ body: |
; GFX908-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX908-FLATSCR-NEXT: {{ $}}
; GFX908-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX90A-FLATSCR-LABEL: name: agpr64_restore_clobber_scc
; GFX90A-FLATSCR: bb.0:
; GFX90A-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -2078,6 +2084,7 @@ body: |
; GFX908-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX90A-LABEL: name: agpr96_restore_clobber_scc
; GFX90A: bb.0:
; GFX90A-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -2550,6 +2557,7 @@ body: |
; GFX90A-NEXT: $agpr33 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; GFX90A-NEXT: $agpr32 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX908-FLATSCR-LABEL: name: agpr96_restore_clobber_scc
; GFX908-FLATSCR: bb.0:
; GFX908-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -2577,6 +2585,7 @@ body: |
; GFX908-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX908-FLATSCR-NEXT: {{ $}}
; GFX908-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX90A-FLATSCR-LABEL: name: agpr96_restore_clobber_scc
; GFX90A-FLATSCR: bb.0:
; GFX90A-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -3099,6 +3108,7 @@ body: |
; GFX908-NEXT: liveins: $agpr0, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX90A-LABEL: name: agpr32_save_clobber_scc
; GFX90A: bb.0:
; GFX90A-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -3569,6 +3579,7 @@ body: |
; GFX90A-NEXT: $agpr33 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; GFX90A-NEXT: $agpr32 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX908-FLATSCR-LABEL: name: agpr32_save_clobber_scc
; GFX908-FLATSCR: bb.0:
; GFX908-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -3592,6 +3603,7 @@ body: |
; GFX908-FLATSCR-NEXT: liveins: $agpr0, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX908-FLATSCR-NEXT: {{ $}}
; GFX908-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX90A-FLATSCR-LABEL: name: agpr32_save_clobber_scc
; GFX90A-FLATSCR: bb.0:
; GFX90A-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -4115,6 +4127,7 @@ body: |
; GFX908-NEXT: liveins: $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX90A-LABEL: name: agpr64_save_clobber_scc
; GFX90A: bb.0:
; GFX90A-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -4586,6 +4599,7 @@ body: |
; GFX90A-NEXT: $agpr33 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; GFX90A-NEXT: $agpr32 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX908-FLATSCR-LABEL: name: agpr64_save_clobber_scc
; GFX908-FLATSCR: bb.0:
; GFX908-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -4611,6 +4625,7 @@ body: |
; GFX908-FLATSCR-NEXT: liveins: $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX908-FLATSCR-NEXT: {{ $}}
; GFX908-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX90A-FLATSCR-LABEL: name: agpr64_save_clobber_scc
; GFX90A-FLATSCR: bb.0:
; GFX90A-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -5135,6 +5150,7 @@ body: |
; GFX908-NEXT: liveins: $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX90A-LABEL: name: agpr96_save_clobber_scc
; GFX90A: bb.0:
; GFX90A-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -5607,6 +5623,7 @@ body: |
; GFX90A-NEXT: $agpr33 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; GFX90A-NEXT: $agpr32 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
; GFX90A-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX908-FLATSCR-LABEL: name: agpr96_save_clobber_scc
; GFX908-FLATSCR: bb.0:
; GFX908-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -5634,6 +5651,7 @@ body: |
; GFX908-FLATSCR-NEXT: liveins: $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239
; GFX908-FLATSCR-NEXT: {{ $}}
; GFX908-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX90A-FLATSCR-LABEL: name: agpr96_save_clobber_scc
; GFX90A-FLATSCR: bb.0:
; GFX90A-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-vgprs.mir b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-vgprs.mir
index ec8533170ebf..950382758ffb 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-vgprs.mir
@@ -15,6 +15,7 @@ body: |
; GFX908-NEXT: $vgpr63 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, implicit $agpr1
+ ;
; GFX90A-LABEL: name: no_free_vgprs_for_copy_a32_to_a32
; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, $agpr0
; GFX90A-NEXT: {{ $}}
@@ -38,6 +39,7 @@ body: |
; GFX908-NEXT: $vgpr63 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1
; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, implicit $agpr2_agpr3
+ ;
; GFX90A-LABEL: name: no_free_vgprs_for_copy_a64_to_a64
; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, 
$agpr0_agpr1
; GFX90A-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-propagation.mir b/llvm/test/CodeGen/AMDGPU/agpr-copy-propagation.mir
index 7eb3862764a4..a42cf43fe56f 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-propagation.mir
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-propagation.mir
@@ -16,6 +16,7 @@ body: |
; GFX908-NEXT: renamable $agpr2 = COPY $agpr0, implicit $exec
; GFX908-NEXT: renamable $agpr3 = COPY $agpr0, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3
+ ;
; GFX90A-LABEL: name: propagate_agpr
; GFX90A: liveins: $agpr0
; GFX90A-NEXT: {{ $}}
@@ -42,6 +43,7 @@ body: |
; GFX908-NEXT: renamable $agpr1 = COPY renamable $vgpr0, implicit $exec
; GFX908-NEXT: renamable $agpr2 = COPY renamable $vgpr0, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0, implicit $agpr1, implicit $agpr2
+ ;
; GFX90A-LABEL: name: do_not_propagate_agpr_to_agpr
; GFX90A: liveins: $agpr0
; GFX90A-NEXT: {{ $}}
@@ -68,6 +70,7 @@ body: |
; GFX908-NEXT: renamable $agpr1 = COPY $vgpr0, implicit $exec
; GFX908-NEXT: renamable $agpr2 = COPY $vgpr0, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2
+ ;
; GFX90A-LABEL: name: propagate_vgpr_to_agpr
; GFX90A: liveins: $vgpr0
; GFX90A-NEXT: {{ $}}
@@ -94,6 +97,7 @@ body: |
; GFX908-NEXT: renamable $vgpr1 = COPY $agpr0, implicit $exec
; GFX908-NEXT: renamable $vgpr2 = COPY $agpr0, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
; GFX90A-LABEL: name: propagate_agpr_to_vgpr
; GFX90A: liveins: $agpr0
; GFX90A-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-sgpr-no-vgprs.mir b/llvm/test/CodeGen/AMDGPU/agpr-copy-sgpr-no-vgprs.mir
index dc54aff6bd63..a9d31c1c45b0 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-sgpr-no-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-sgpr-no-vgprs.mir
@@ -15,6 +15,7 @@ body: |
; GFX908-NEXT: $vgpr63 = V_MOV_B32_e32 $sgpr8, implicit $exec
; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, implicit $agpr1
+ ;
; GFX90A-LABEL: name: no_free_vgprs_for_copy_s32_to_a32
; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, $sgpr8
; GFX90A-NEXT: {{ $}}
@@ -39,6 +40,7 @@ body: |
; GFX908-NEXT: $vgpr63 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9
; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec
; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, implicit $agpr2_agpr3
+ ;
; GFX90A-LABEL: name: no_free_vgprs_for_copy_s64_to_a64
; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, 
$sgpr8_sgpr9
; GFX90A-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-to-agpr-copy.mir b/llvm/test/CodeGen/AMDGPU/agpr-to-agpr-copy.mir
index f3f986a176d6..ffa9e643409d 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-to-agpr-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/agpr-to-agpr-copy.mir
@@ -15,13 +15,13 @@ body: |
; GFX908-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1123418112, implicit $exec
- ; GFX908-NEXT: undef %4.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 [[V_MOV_B32_e32_1]], implicit $exec
- ; GFX908-NEXT: %4.sub1:areg_128 = COPY [[V_MOV_B32_e32_1]]
- ; GFX908-NEXT: %4.sub2:areg_128 = COPY [[V_MOV_B32_e32_1]]
- ; GFX908-NEXT: %4.sub3:areg_128 = COPY [[V_MOV_B32_e32_1]]
+ ; GFX908-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 [[V_MOV_B32_e32_1]], implicit $exec
+ ; GFX908-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub1:areg_128 = COPY [[V_MOV_B32_e32_1]]
+ ; GFX908-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub2:areg_128 = COPY [[V_MOV_B32_e32_1]]
+ ; GFX908-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub3:areg_128 = COPY [[V_MOV_B32_e32_1]]
; GFX908-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec
; GFX908-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
- ; GFX908-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[V_MOV_B32_e32_3]], [[V_MOV_B32_e32_2]], %4, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX908-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[V_MOV_B32_e32_3]], [[V_MOV_B32_e32_2]], [[V_ACCVGPR_WRITE_B32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_F32_4X4X1F32_e64_]]
; GFX908-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_]], [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; GFX908-NEXT: S_ENDPGM 0
@@ -53,13 +53,13 @@ body: |
; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
; GFX908-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX908-NEXT: undef %3.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 1073741824, implicit $exec
- ; GFX908-NEXT: %3.sub1:areg_128 = COPY %3.sub0
- ; GFX908-NEXT: %3.sub2:areg_128 = COPY %3.sub0
- ; GFX908-NEXT: %3.sub3:areg_128 = COPY %3.sub0
+ ; GFX908-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 1073741824, implicit $exec
+ ; GFX908-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub1:areg_128 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
+ ; GFX908-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub2:areg_128 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
+ ; GFX908-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub3:areg_128 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
; GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec
; GFX908-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
- ; GFX908-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[V_MOV_B32_e32_2]], [[V_MOV_B32_e32_1]], %3, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX908-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[V_MOV_B32_e32_2]], [[V_MOV_B32_e32_1]], [[V_ACCVGPR_WRITE_B32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_F32_4X4X1F32_e64_]]
; GFX908-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_]], [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; GFX908-NEXT: S_ENDPGM 0
@@ -88,11 +88,11 @@ body: |
; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
- ; GFX908-NEXT: undef %1.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub0, implicit $exec
- ; GFX908-NEXT: %1.sub1:areg_128 = COPY [[COPY]].sub0
- ; GFX908-NEXT: %1.sub2:areg_128 = COPY [[COPY]].sub0
- ; GFX908-NEXT: %1.sub3:areg_128 = COPY [[COPY]].sub0
- ; GFX908-NEXT: S_ENDPGM 0, implicit [[COPY]], implicit %1
+ ; GFX908-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub0, implicit $exec
+ ; GFX908-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub1:areg_128 = COPY [[COPY]].sub0
+ ; GFX908-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub2:areg_128 = COPY [[COPY]].sub0
+ ; GFX908-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub3:areg_128 = COPY [[COPY]].sub0
+ ; GFX908-NEXT: S_ENDPGM 0, implicit [[COPY]], implicit [[V_ACCVGPR_WRITE_B32_e64_]]
%0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
undef %1.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 %0.sub0, implicit $exec
%1.sub1:areg_128 = COPY %1.sub0:areg_128
@@ -111,10 +111,10 @@ body: |
; GFX908: liveins: $vgpr0_vgpr1
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1, implicit $exec
- ; GFX908-NEXT: undef %1.sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub0, implicit $exec
- ; GFX908-NEXT: %1.sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub1, implicit $exec
- ; GFX908-NEXT: [[COPY1:%[0-9]+]]:areg_64 = COPY %1
- ; GFX908-NEXT: S_ENDPGM 0, implicit [[COPY]], implicit %1, implicit [[COPY1]]
+ ; GFX908-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub0, implicit $exec
+ ; GFX908-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub1, implicit $exec
+ ; GFX908-NEXT: [[COPY1:%[0-9]+]]:areg_64 = COPY [[V_ACCVGPR_WRITE_B32_e64_]]
+ ; GFX908-NEXT: S_ENDPGM 0, implicit [[COPY]], implicit [[V_ACCVGPR_WRITE_B32_e64_]], implicit [[COPY1]]
%0:vreg_64 = COPY $vgpr0_vgpr1, implicit $exec
undef %1.sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 %0.sub0, implicit $exec
%1.sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 %0.sub1, implicit $exec
@@ -132,10 +132,10 @@ body: |
; GFX908: liveins: $vgpr0_vgpr1
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1, implicit $exec
- ; GFX908-NEXT: undef %1.sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub0, implicit $exec
- ; GFX908-NEXT: %1.sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub1, implicit $exec
+ ; GFX908-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub0, implicit $exec
+ ; GFX908-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub1, implicit $exec
; GFX908-NEXT: [[COPY1:%[0-9]+]]:agpr_32 = COPY [[COPY]].sub1
- ; GFX908-NEXT: S_ENDPGM 0, implicit [[COPY]], implicit %1, implicit [[COPY1]]
+ ; GFX908-NEXT: S_ENDPGM 0, implicit [[COPY]], implicit [[V_ACCVGPR_WRITE_B32_e64_]], implicit [[COPY1]]
%0:vreg_64 = COPY $vgpr0_vgpr1, implicit $exec
undef %1.sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 %0.sub0, implicit $exec
%1.sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 %0.sub1, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll
index 4061857789ed..5c56276eeb0f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll
@@ -991,7 +991,7 @@ define void @sincos_f32_preserve_fpmath_0(float %x, ptr addrspace(1) nocapture w
; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
-; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]), !fpmath !5
+; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]), !fpmath [[META5:![0-9]+]]
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
; CHECK-NEXT: store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4
; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4
@@ -1010,7 +1010,7 @@ define void @sincos_f32_preserve_fpmath_1(float %x, ptr addrspace(1) nocapture w
; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
-; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]), !fpmath !6
+; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]), !fpmath [[META6:![0-9]+]]
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
; CHECK-NEXT: store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4
; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4
@@ -1051,9 +1051,9 @@ define void @sincos_f32_debuginfo(float %x, ptr addrspace(1) nocapture writeonly
; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5), !dbg [[DBG14:![0-9]+]]
; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]), !dbg [[DBG14]]
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4, !dbg [[DBG14]]
-; CHECK-NEXT: call void @llvm.dbg.value(metadata float [[TMP0]], metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG15:![0-9]+]]
+; CHECK-NEXT: tail call void @llvm.dbg.value(metadata float [[TMP0]], metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG15:![0-9]+]]
; CHECK-NEXT: store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4, !dbg [[DBG16:![0-9]+]]
-; CHECK-NEXT: call void @llvm.dbg.value(metadata float [[TMP1]], metadata [[META13:![0-9]+]], metadata !DIExpression()), !dbg [[DBG17:![0-9]+]]
+; CHECK-NEXT: tail call void @llvm.dbg.value(metadata float [[TMP1]], metadata [[META13:![0-9]+]], metadata !DIExpression()), !dbg [[DBG17:![0-9]+]]
; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4, !dbg [[DBG18:![0-9]+]]
; CHECK-NEXT: ret void, !dbg [[DBG19:![0-9]+]]
;
@@ -1072,9 +1072,9 @@ define float @sin_sincos_private_f32(float %x, ptr addrspace(1) %sin_out, ptr ad
; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[COS_TMP:%.*]] = alloca float, align 4, addrspace(5)
-; CHECK-NEXT: [[SIN0:%.*]] = tail call nnan ninf nsz contract float @_Z3sinf(float [[X]]), !fpmath !5
+; CHECK-NEXT: [[SIN0:%.*]] = tail call nnan ninf nsz contract float @_Z3sinf(float [[X]]), !fpmath [[META5]]
; CHECK-NEXT: store float [[SIN0]], ptr addrspace(1) [[SIN_OUT]], align 4
-; CHECK-NEXT: [[SIN1:%.*]] = call nnan contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[COS_TMP]]), !fpmath !6
+; CHECK-NEXT: [[SIN1:%.*]] = call nnan contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[COS_TMP]]), !fpmath [[META6]]
; CHECK-NEXT: [[COS1:%.*]] = load float, ptr addrspace(5) [[COS_TMP]], align 4
; CHECK-NEXT: store float [[COS1]], ptr addrspace(1) [[COS_OUT]], align 4
; CHECK-NEXT: ret float [[SIN1]]
@@ -1094,10 +1094,10 @@ define float @sin_sincos_generic_f32(float %x, ptr addrspace(1) %sin_out, ptr ad
; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[COS_TMP:%.*]] = alloca float, align 4, addrspace(5)
-; CHECK-NEXT: [[SIN0:%.*]] = tail call nsz contract float @_Z3sinf(float [[X]]), !fpmath !5
+; CHECK-NEXT: [[SIN0:%.*]] = tail call nsz contract float @_Z3sinf(float [[X]]), !fpmath [[META5]]
; CHECK-NEXT: store float [[SIN0]], ptr addrspace(1) [[SIN_OUT]], align 4
; CHECK-NEXT: [[COS_TMP_CAST:%.*]] = addrspacecast ptr addrspace(5) [[COS_TMP]] to ptr
-; CHECK-NEXT: [[SIN1:%.*]] = call ninf nsz contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[COS_TMP_CAST]]), !fpmath !6
+; CHECK-NEXT: [[SIN1:%.*]] = call ninf nsz contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[COS_TMP_CAST]]), !fpmath [[META6]]
; CHECK-NEXT: [[COS1:%.*]] = load float, ptr addrspace(5) [[COS_TMP]], align 4
; CHECK-NEXT: store float [[COS1]], ptr addrspace(1) [[COS_OUT]], align 4
; CHECK-NEXT: ret float [[SIN1]]
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 8ec7dfd93cd0..ebb77c13c4af 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -32379,7 +32379,7 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_i16_to_bf16:
@@ -32387,7 +32387,7 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_i16_to_bf16:
@@ -32455,8 +32455,8 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v2i16_to_v2bf16:
@@ -32466,8 +32466,8 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v2i16_to_v2bf16:
@@ -32566,9 +32566,9 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v3i16_to_v3bf16:
@@ -32580,9 +32580,9 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v3i16_to_v3bf16:
@@ -32682,10 +32682,10 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v4i16_to_v4bf16:
@@ -32699,10 +32699,10 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_u32_e32 v3, v3
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v4i16_to_v4bf16:
@@ -32857,14 +32857,14 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_i32_to_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_i32_to_bf16:
@@ -32928,8 +32928,8 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v2i32_to_v2bf16:
@@ -32937,8 +32937,8 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v2i32_to_v2bf16:
@@ -33031,9 +33031,9 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v3i32_to_v3bf16:
@@ -33042,9 +33042,9 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v3i32_to_v3bf16:
@@ -33140,10 +33140,10 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v4i32_to_v4bf16:
@@ -33153,10 +33153,10 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_u32_e32 v3, v3
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v4i32_to_v4bf16:
@@ -37615,266 +37615,283 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GCN-LABEL: v_vselect_v32bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_and_b32_e32 v5, 1, v5
+; GCN-NEXT: v_and_b32_e32 v2, 1, v2
; GCN-NEXT: v_and_b32_e32 v36, 1, v13
-; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:48
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:176
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180
; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56
; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188
; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64
; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192
+; GCN-NEXT: v_and_b32_e32 v53, 1, v26
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100
+; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104
+; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:108
+; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112
+; GCN-NEXT: v_and_b32_e32 v27, 1, v27
+; GCN-NEXT: v_and_b32_e32 v28, 1, v28
; GCN-NEXT: v_and_b32_e32 v29, 1, v29
; GCN-NEXT: v_and_b32_e32 v30, 1, v30
-; GCN-NEXT: v_and_b32_e32 v48, 1, v28
-; GCN-NEXT: v_and_b32_e32 v50, 1, v27
-; GCN-NEXT: v_and_b32_e32 v52, 1, v26
-; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92
-; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:220
-; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96
-; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:224
-; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104
-; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:232
-; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236
-; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112
-; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:240
-; GCN-NEXT: s_waitcnt expcnt(6)
-; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120
-; GCN-NEXT: s_waitcnt expcnt(5)
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:248
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124
-; GCN-NEXT: s_waitcnt expcnt(4)
-; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:252
-; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:256
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:252
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:248
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:244
+; GCN-NEXT: s_waitcnt expcnt(6)
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:240
; GCN-NEXT: s_waitcnt vmcnt(14)
-; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v37
-; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v38
+; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v37
+; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v36
-; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: s_waitcnt vmcnt(5)
; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v43
-; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v56
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v44
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v30
; GCN-NEXT: v_cndmask_b32_e64 v30, v37, v36, s[4:5]
-; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32
-; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:116
-; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:244
-; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100
-; GCN-NEXT: s_waitcnt expcnt(3)
+; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:236
+; GCN-NEXT: s_waitcnt expcnt(5)
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:232
+; GCN-NEXT: s_waitcnt expcnt(4)
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:228
+; GCN-NEXT: s_waitcnt expcnt(3)
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:224
; GCN-NEXT: s_waitcnt expcnt(2)
-; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:220
; GCN-NEXT: s_waitcnt expcnt(1)
-; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:212
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:216
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68
-; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v46
-; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v47
-; GCN-NEXT: s_waitcnt vmcnt(6)
-; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36
-; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37
-; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v44
-; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v45
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:212
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128
+; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42
+; GCN-NEXT: s_waitcnt vmcnt(10)
+; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v45
+; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41
+; GCN-NEXT: s_waitcnt vmcnt(9)
+; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v46
; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55
-; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40
+; GCN-NEXT: s_waitcnt vmcnt(8)
+; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v47
+; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54
+; GCN-NEXT: s_waitcnt vmcnt(7)
+; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v29
-; GCN-NEXT: v_cndmask_b32_e64 v29, v46, v38, s[4:5]
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v48
-; GCN-NEXT: v_cndmask_b32_e64 v36, v37, v36, s[4:5]
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v50
-; GCN-NEXT: v_cndmask_b32_e64 v37, v45, v44, s[4:5]
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v52
-; GCN-NEXT: v_cndmask_b32_e64 v38, v40, v55, s[4:5]
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
-; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136
-; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12
-; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:140
-; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:16
-; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:144
-; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24
-; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:152
-; GCN-NEXT: v_and_b32_e32 v9, 1, v9
-; GCN-NEXT: v_and_b32_e32 v17, 1, v17
-; GCN-NEXT: v_and_b32_e32 v21, 1, v21
-; GCN-NEXT: v_and_b32_e32 v25, 1, v25
-; GCN-NEXT: v_and_b32_e32 v24, 1, v24
-; GCN-NEXT: v_and_b32_e32 v23, 1, v23
+; GCN-NEXT: v_cndmask_b32_e64 v29, v43, v42, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v28
+; GCN-NEXT: v_cndmask_b32_e64 v28, v44, v41, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v27
+; GCN-NEXT: v_cndmask_b32_e64 v27, v45, v55, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v53
+; GCN-NEXT: v_cndmask_b32_e64 v36, v36, v54, s[4:5]
+; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132
+; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144
+; GCN-NEXT: v_and_b32_e32 v3, 1, v3
+; GCN-NEXT: v_and_b32_e32 v4, 1, v4
+; GCN-NEXT: v_and_b32_e32 v5, 1, v5
+; GCN-NEXT: v_and_b32_e32 v6, 1, v6
+; GCN-NEXT: v_and_b32_e32 v18, 1, v18
; GCN-NEXT: v_and_b32_e32 v22, 1, v22
-; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53
-; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54
+; GCN-NEXT: v_and_b32_e32 v23, 1, v23
+; GCN-NEXT: v_and_b32_e32 v24, 1, v24
+; GCN-NEXT: v_and_b32_e32 v25, 1, v25
+; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v56
+; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51
+; GCN-NEXT: s_waitcnt vmcnt(13)
+; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v57
+; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50
; GCN-NEXT: s_waitcnt vmcnt(12)
-; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v56
-; GCN-NEXT: s_waitcnt vmcnt(11)
-; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v57
+; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v58
; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49
-; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
+; GCN-NEXT: s_waitcnt vmcnt(11)
+; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v59
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v25
-; GCN-NEXT: v_cndmask_b32_e64 v25, v54, v53, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v25, v46, v52, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v24
-; GCN-NEXT: v_cndmask_b32_e64 v24, v56, v47, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v24, v47, v51, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v23
-; GCN-NEXT: v_cndmask_b32_e64 v23, v51, v49, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v23, v56, v50, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v22
-; GCN-NEXT: v_cndmask_b32_e64 v22, v39, v28, s[4:5]
-; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:72
-; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:200
-; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76
-; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:204
-; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:80
-; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:208
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88
-; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:216
-; GCN-NEXT: v_and_b32_e32 v20, 1, v20
+; GCN-NEXT: v_cndmask_b32_e64 v22, v57, v49, s[4:5]
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:196
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72
+; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:200
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:204
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:208
; GCN-NEXT: v_and_b32_e32 v19, 1, v19
-; GCN-NEXT: v_and_b32_e32 v18, 1, v18
-; GCN-NEXT: v_and_b32_e32 v16, 1, v16
-; GCN-NEXT: v_and_b32_e32 v15, 1, v15
-; GCN-NEXT: v_and_b32_e32 v14, 1, v14
-; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v20, 1, v20
+; GCN-NEXT: v_and_b32_e32 v21, 1, v21
+; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v60
+; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
+; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v61
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v46
+; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47
-; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56
-; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v58
-; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v59
-; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53
-; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54
-; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49
-; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v57
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v21
-; GCN-NEXT: v_cndmask_b32_e64 v21, v56, v47, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v21, v58, v48, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v20
-; GCN-NEXT: v_cndmask_b32_e64 v20, v58, v57, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v20, v59, v39, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v19
-; GCN-NEXT: v_cndmask_b32_e64 v19, v54, v53, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v19, v57, v56, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18
-; GCN-NEXT: v_cndmask_b32_e64 v18, v51, v49, s[4:5]
-; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:196
-; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52
-; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:180
-; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:164
-; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20
-; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:148
-; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
-; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v60
-; GCN-NEXT: s_waitcnt vmcnt(7)
-; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49
-; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34
-; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35
+; GCN-NEXT: v_cndmask_b32_e64 v18, v47, v46, s[4:5]
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:148
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160
+; GCN-NEXT: v_and_b32_e32 v7, 1, v7
+; GCN-NEXT: v_and_b32_e32 v8, 1, v8
+; GCN-NEXT: v_and_b32_e32 v9, 1, v9
+; GCN-NEXT: v_and_b32_e32 v10, 1, v10
+; GCN-NEXT: v_and_b32_e32 v14, 1, v14
+; GCN-NEXT: v_and_b32_e32 v15, 1, v15
+; GCN-NEXT: v_and_b32_e32 v16, 1, v16
+; GCN-NEXT: v_and_b32_e32 v17, 1, v17
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34
+; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35
+; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49
+; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50
+; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51
+; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v17
-; GCN-NEXT: v_cndmask_b32_e64 v17, v39, v28, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v17, v52, v51, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16
-; GCN-NEXT: v_cndmask_b32_e64 v16, v49, v59, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v16, v50, v49, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v15
; GCN-NEXT: v_cndmask_b32_e64 v15, v35, v34, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14
; GCN-NEXT: v_cndmask_b32_e64 v14, v33, v32, s[4:5]
-; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:28
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:156
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
-; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:160
-; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:40
-; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:168
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:168
; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:44
-; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:172
-; GCN-NEXT: v_and_b32_e32 v12, 1, v12
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:172
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48
+; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176
; GCN-NEXT: v_and_b32_e32 v11, 1, v11
-; GCN-NEXT: v_and_b32_e32 v10, 1, v10
-; GCN-NEXT: v_and_b32_e32 v8, 1, v8
-; GCN-NEXT: v_and_b32_e32 v7, 1, v7
-; GCN-NEXT: v_and_b32_e32 v6, 1, v6
-; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_and_b32_e32 v3, 1, v3
-; GCN-NEXT: v_and_b32_e32 v2, 1, v2
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v41, v42, v41, vcc
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132
-; GCN-NEXT: v_and_b32_e32 v43, 1, v43
-; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40
+; GCN-NEXT: v_and_b32_e32 v12, 1, v12
+; GCN-NEXT: v_cndmask_b32_e32 v38, v38, v40, vcc
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:256
+; GCN-NEXT: v_and_b32_e32 v26, 1, v26
+; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53
+; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54
+; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55
+; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41
+; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42
+; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43
; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v44
; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v45
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
+; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48
; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v46
-; GCN-NEXT: s_waitcnt vmcnt(4)
+; GCN-NEXT: s_waitcnt vmcnt(13)
+; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47
+; GCN-NEXT: s_waitcnt vmcnt(12)
+; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56
+; GCN-NEXT: s_waitcnt vmcnt(11)
+; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v57
+; GCN-NEXT: s_waitcnt vmcnt(10)
+; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v58
+; GCN-NEXT: s_waitcnt vmcnt(9)
+; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v59
+; GCN-NEXT: s_waitcnt vmcnt(8)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(7)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: s_waitcnt vmcnt(6)
+; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34
+; GCN-NEXT: s_waitcnt vmcnt(5)
; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35
+; GCN-NEXT: s_waitcnt vmcnt(4)
+; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51
-; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50
; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49
+; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v59
-; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54
-; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56
-; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v57
; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52
-; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55
-; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48
-; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50
-; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v58
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42
+; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
-; GCN-NEXT: v_cndmask_b32_e32 v12, v53, v51, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v12, v31, v13, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
-; GCN-NEXT: v_cndmask_b32_e32 v11, v31, v13, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v11, v52, v51, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
-; GCN-NEXT: v_cndmask_b32_e32 v10, v59, v49, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v10, v50, v49, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
-; GCN-NEXT: v_cndmask_b32_e32 v9, v39, v35, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v9, v35, v34, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
-; GCN-NEXT: v_cndmask_b32_e32 v8, v47, v54, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v8, v33, v32, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
-; GCN-NEXT: v_cndmask_b32_e32 v7, v34, v33, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v7, v59, v58, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
-; GCN-NEXT: v_cndmask_b32_e32 v6, v32, v28, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v6, v57, v56, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
-; GCN-NEXT: v_cndmask_b32_e32 v5, v46, v45, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v5, v47, v46, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: v_cndmask_b32_e32 v4, v57, v56, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v4, v48, v39, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; GCN-NEXT: v_cndmask_b32_e32 v3, v55, v52, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v3, v45, v44, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GCN-NEXT: v_cndmask_b32_e32 v2, v50, v48, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v43, v42, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v44, v40, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, v41, v55, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v42, v58, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v43
-; GCN-NEXT: v_cndmask_b32_e32 v31, v27, v26, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, v54, v53, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v26
+; GCN-NEXT: v_cndmask_b32_e32 v31, v40, v37, vcc
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
@@ -37888,7 +37905,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v41
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v38
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
@@ -37901,25 +37918,26 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v38
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v37
-; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v36
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v36
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/blender-coalescer-verifier-error-empty-subrange.mir b/llvm/test/CodeGen/AMDGPU/blender-coalescer-verifier-error-empty-subrange.mir
index 3b1951e287d3..007c4c094029 100644
--- a/llvm/test/CodeGen/AMDGPU/blender-coalescer-verifier-error-empty-subrange.mir
+++ b/llvm/test/CodeGen/AMDGPU/blender-coalescer-verifier-error-empty-subrange.mir
@@ -20,18 +20,18 @@ body: |
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_NOP 0, implicit-def %0
- ; CHECK-NEXT: undef %1.sub0:sgpr_128 = S_MOV_B32 0
+ ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 0
; CHECK-NEXT: S_BRANCH %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub0:sgpr_128 = IMPLICIT_DEF
- ; CHECK-NEXT: undef %1.sub0:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: undef [[DEF:%[0-9]+]].sub0:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = IMPLICIT_DEF
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
- ; CHECK-NEXT: S_NOP 0, implicit %0
- ; CHECK-NEXT: S_NOP 0, implicit %1.sub0
+ ; CHECK-NEXT: S_NOP 0, implicit [[DEF]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]].sub0
; CHECK-NEXT: S_ENDPGM 0
bb.0:
S_CBRANCH_SCC0 %bb.2, implicit undef $scc
@@ -73,12 +73,12 @@ body: |
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub0:sgpr_128 = S_MOV_B32 123
- ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 123
+ ; CHECK-NEXT: undef [[S_MOV_B32_1:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 123
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 123
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
- ; CHECK-NEXT: S_NOP 0, implicit %0
; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_1]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]]
; CHECK-NEXT: S_ENDPGM 0
bb.0:
S_CBRANCH_SCC0 %bb.2, implicit undef $scc
diff --git a/llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir b/llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir
index 69cfbeeb9a49..6483ff28c0de 100644
--- a/llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir
+++ b/llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir
@@ -34,8 +34,8 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]], 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %15, 0, implicit $exec
- ; CHECK-NEXT: %7:vgpr_32, dead %8:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed %7, %subreg.sub1
+ ; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1
; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
; CHECK-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B]], killed [[GLOBAL_LOAD_UBYTE]], 0, 0, implicit $exec :: (store (s8), addrspace 1)
@@ -55,7 +55,7 @@ body: |
; CHECK-NEXT: successors: %bb.6(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
- ; CHECK-NEXT: dead %13:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[V_MOV_B1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
+ ; CHECK-NEXT: dead [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[V_MOV_B1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
; CHECK-NEXT: S_BRANCH %bb.6
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.5:
@@ -71,7 +71,7 @@ body: |
; CHECK-NEXT: bb.6:
; CHECK-NEXT: successors: %bb.5(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: S_BRANCH %bb.5
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.7:
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll
index 590d73b310a2..41eb2b7bb274 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll
@@ -18,6 +18,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_offset_no_rtn(float %val, <4 x i32
; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX908_GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offset_no_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
@@ -50,6 +51,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_offen_no_rtn(float %val, <4 x i32>
; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX908_GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offen_no_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
@@ -83,6 +85,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_idxen_no_rtn(float %val, <4 x i32>
; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX908_GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_idxen_no_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
@@ -118,6 +121,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_bothen_no_rtn(float %val, <4 x i32
; GFX908_GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX908_GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_bothen_no_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
@@ -158,6 +162,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_offset_no_rtn(float %val, ptr
; GFX908_GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3
; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX908_GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_no_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
@@ -202,6 +207,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_offen_no_rtn(float %val, ptr a
; GFX908_GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX908_GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_no_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
@@ -247,6 +253,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_idxen_no_rtn(float %val, ptr a
; GFX908_GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX908_GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_no_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
@@ -294,6 +301,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_bothen_no_rtn(float %val, ptr
; GFX908_GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX908_GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_no_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll
index dc164191b216..f964da2ddf40 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll
@@ -18,6 +18,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offset_rtn(float %val, <4 x i32>
; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
+ ;
; GFX11-LABEL: name: buffer_atomic_fadd_f32_offset_rtn
; GFX11: bb.0 (%ir-block.0):
; GFX11-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
@@ -52,6 +53,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offen_rtn(float %val, <4 x i32> i
; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
+ ;
; GFX11-LABEL: name: buffer_atomic_fadd_f32_offen_rtn
; GFX11: bb.0 (%ir-block.0):
; GFX11-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
@@ -87,6 +89,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_idxen_rtn(float %val, <4 x i32> i
; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
+ ;
; GFX11-LABEL: name: buffer_atomic_fadd_f32_idxen_rtn
; GFX11: bb.0 (%ir-block.0):
; GFX11-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
@@ -124,6 +127,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_bothen_rtn(float %val, <4 x i32>
; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
+ ;
; GFX11-LABEL: name: buffer_atomic_fadd_f32_bothen_rtn
; GFX11: bb.0 (%ir-block.0):
; GFX11-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
@@ -166,6 +170,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offset_rtn(float %val, ptr ad
; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
+ ;
; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_rtn
; GFX11: bb.0 (%ir-block.0):
; GFX11-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
@@ -212,6 +217,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offen_rtn(float %val, ptr add
; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
+ ;
; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_rtn
; GFX11: bb.0 (%ir-block.0):
; GFX11-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
@@ -259,6 +265,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_idxen_rtn(float %val, ptr add
; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
+ ;
; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_rtn
; GFX11: bb.0 (%ir-block.0):
; GFX11-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
@@ -308,6 +315,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_bothen_rtn(float %val, ptr ad
; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
+ ;
; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_rtn
; GFX11: bb.0 (%ir-block.0):
; GFX11-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll
index ea98b61a3b7d..0b62977613f1 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll
@@ -17,6 +17,7 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_offset_no_rtn(<2 x half> %val, <
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offset_no_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
@@ -49,6 +50,7 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_offen_no_rtn(<2 x half> %val, <4
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offen_no_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
@@ -82,6 +84,7 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_idxen_no_rtn(<2 x half> %val, <4
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_idxen_no_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
@@ -117,6 +120,7 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_bothen_no_rtn(<2 x half> %val, <
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_bothen_no_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
@@ -157,6 +161,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_offset_no_rtn(<2 x half> %va
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3
; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_no_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
@@ -201,6 +206,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_offen_no_rtn(<2 x half> %val
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offen_no_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
@@ -246,6 +252,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_idxen_no_rtn(<2 x half> %val
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_no_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4
@@ -293,6 +300,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_bothen_no_rtn(<2 x half> %va
; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_no_rtn
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 7c1c24f4a67d..dfadd8d205b0 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -525,6 +525,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_max_num_f32_e32 v1, 0, v1
; GFX12-NEXT: v_min_num_f32_e32 v2, 1.0, v1
; GFX12-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_store_b32 v[0:1], v1, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-identity-copies-undef-subregs.mir b/llvm/test/CodeGen/AMDGPU/coalesce-identity-copies-undef-subregs.mir
index 23590cad8327..8e0c544a3a57 100644
--- a/llvm/test/CodeGen/AMDGPU/coalesce-identity-copies-undef-subregs.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalesce-identity-copies-undef-subregs.mir
@@ -18,7 +18,7 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub1:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -29,7 +29,7 @@ body: |
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub1:vreg_64 = nofpexcept V_CEIL_F32_e32 %0.sub1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = nofpexcept V_CEIL_F32_e32 [[COPY]].sub1, implicit $mode, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.1
bb.0:
liveins: $vgpr0
@@ -59,20 +59,20 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub1:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: S_NOP 0, implicit undef %0.sub0
- ; CHECK-NEXT: S_NOP 0, implicit undef %0.sub0
+ ; CHECK-NEXT: S_NOP 0, implicit undef [[COPY]].sub0
+ ; CHECK-NEXT: S_NOP 0, implicit undef [[COPY]].sub0
; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub1:vreg_64 = nofpexcept V_CEIL_F32_e32 %0.sub1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = nofpexcept V_CEIL_F32_e32 [[COPY]].sub1, implicit $mode, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.1
bb.0:
liveins: $vgpr0
@@ -102,7 +102,7 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub1:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -113,7 +113,7 @@ body: |
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub1:vreg_64 = nofpexcept V_MUL_F32_e32 0, %0.sub1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = nofpexcept V_MUL_F32_e32 0, [[COPY]].sub1, implicit $mode, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.1
bb.0:
liveins: $vgpr0
@@ -143,7 +143,7 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub1:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -154,7 +154,7 @@ body: |
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: %0.sub1:vreg_64 = nofpexcept V_MUL_F32_e32 0, %0.sub1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = nofpexcept V_MUL_F32_e32 0, [[COPY]].sub1, implicit $mode, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.1
bb.0:
liveins: $vgpr0
@@ -185,7 +185,7 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub1:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
@@ -195,12 +195,12 @@ body: |
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: S_NOP 0, implicit undef %0.sub0
- ; CHECK-NEXT: undef %0.sub1:vreg_64 = nofpexcept V_CEIL_F32_e32 %0.sub1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_NOP 0, implicit undef [[COPY]].sub0
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = nofpexcept V_CEIL_F32_e32 [[COPY]].sub1, implicit $mode, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
- ; CHECK-NEXT: S_NOP 0, implicit undef %0.sub0
+ ; CHECK-NEXT: S_NOP 0, implicit undef [[COPY]].sub0
bb.0:
liveins: $vgpr0
@@ -229,7 +229,7 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub1:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
@@ -237,7 +237,7 @@ body: |
; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: S_NOP 0, implicit undef %0.sub0
+ ; CHECK-NEXT: S_NOP 0, implicit undef [[COPY]].sub0
bb.0:
liveins: $vgpr0
@@ -261,7 +261,7 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub1:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
@@ -269,7 +269,7 @@ body: |
; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: S_NOP 0, implicit %0.sub1
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY]].sub1
bb.0:
liveins: $vgpr0
@@ -295,7 +295,7 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: dead undef %2.sub1:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: dead undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -336,7 +336,7 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: dead undef %0.sub1:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: dead undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -376,7 +376,7 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: dead undef %0.sub1:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: dead undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -417,7 +417,7 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub1:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -428,7 +428,7 @@ body: |
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub1:vreg_64 = nofpexcept V_CEIL_F32_e32 %0.sub1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = nofpexcept V_CEIL_F32_e32 [[COPY]].sub1, implicit $mode, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.1
bb.0:
liveins: $vgpr0
@@ -458,7 +458,7 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub1:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -469,7 +469,7 @@ body: |
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: %0.sub1:vreg_64 = nofpexcept V_CEIL_F32_e32 %0.sub1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = nofpexcept V_CEIL_F32_e32 [[COPY]].sub1, implicit $mode, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.1
bb.0:
liveins: $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-into-dead-subreg-copies.mir b/llvm/test/CodeGen/AMDGPU/coalesce-into-dead-subreg-copies.mir
index 45ccb4b8866e..6f1e88866160 100644
--- a/llvm/test/CodeGen/AMDGPU/coalesce-into-dead-subreg-copies.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalesce-into-dead-subreg-copies.mir
@@ -18,7 +18,7 @@ body: |
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: dead %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM undef %1:sgpr_64, 24, 0 :: (dereferenceable invariant load (s64), addrspace 4)
+ ; CHECK-NEXT: dead [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM undef %1:sgpr_64, 24, 0 :: (dereferenceable invariant load (s64), addrspace 4)
; CHECK-NEXT: S_BRANCH %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-liveout-undef-copy.mir b/llvm/test/CodeGen/AMDGPU/coalesce-liveout-undef-copy.mir
index 1f235ebccfa3..b477ec85d3a1 100644
--- a/llvm/test/CodeGen/AMDGPU/coalesce-liveout-undef-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalesce-liveout-undef-copy.mir
@@ -23,19 +23,19 @@ body: |
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: dead %2:vreg_128_align2 = IMPLICIT_DEF
- ; CHECK-NEXT: [[COPY]].sub0:vreg_128_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: dead [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub0:vreg_128_align2 = IMPLICIT_DEF
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: dead %4:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V4 [[COPY]], undef %5:sgpr_32, 11, implicit-def $m0, implicit $m0, implicit $exec
+ ; CHECK-NEXT: dead [[V_INDIRECT_REG_READ_GPR_IDX_B32_V4_:%[0-9]+]]:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V4 [[COPY]], undef %5:sgpr_32, 11, implicit-def $m0, implicit $m0, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4:
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-remat-dead-use.mir b/llvm/test/CodeGen/AMDGPU/coalescer-remat-dead-use.mir
index f4b8f12ad246..35f0399b78aa 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer-remat-dead-use.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-remat-dead-use.mir
@@ -89,7 +89,7 @@ body: |
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GCN-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 1, [[COPY]], implicit $exec
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 [[V_ADD_U32_e32_]], implicit $exec
- ; GCN-NEXT: SI_RETURN_TO_EPILOG $vgpr0
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG $vgpr0, implicit [[V_ADD_U32_e32_]]
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = V_ADD_U32_e32 1, %0, implicit $exec
%2:vgpr_32 = V_MOV_B32_e32 %1, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-removepartial-extend-undef-subrange.mir b/llvm/test/CodeGen/AMDGPU/coalescer-removepartial-extend-undef-subrange.mir
index b6a1719b3c70..e9a6e89245e9 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer-removepartial-extend-undef-subrange.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-removepartial-extend-undef-subrange.mir
@@ -22,29 +22,29 @@ body: |
; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $vgpr3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; CHECK-NEXT: undef %1.sub0:vreg_64 = COPY [[COPY]]
- ; CHECK-NEXT: undef %2.sub0:vreg_64 = COPY [[COPY]]
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_64 = COPY [[COPY]]
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:vreg_64 = COPY [[COPY]]
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit undef $vcc
; CHECK-NEXT: S_BRANCH %bb.4
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: S_NOP 0, implicit %2.sub0
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]].sub0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.2(0x7c000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY %2
- ; CHECK-NEXT: %1.sub0:vreg_64 = COPY [[COPY1]].sub0
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY %1
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY2]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub0:vreg_64 = COPY [[COPY3]].sub0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY1]]
; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit undef $exec
; CHECK-NEXT: S_BRANCH %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY3]]
; CHECK-NEXT: S_BRANCH %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4:
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-subranges-prune-kill-copy.mir b/llvm/test/CodeGen/AMDGPU/coalescer-subranges-prune-kill-copy.mir
index b28e9f86a43d..2fb3467da9a5 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer-subranges-prune-kill-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-subranges-prune-kill-copy.mir
@@ -12,8 +12,8 @@ body: |
; GCN: bb.0:
; GCN-NEXT: successors: %bb.2(0x80000000)
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: undef %1.sub0:vreg_128 = IMPLICIT_DEF
- ; GCN-NEXT: %1.sub1:vreg_128 = IMPLICIT_DEF
+ ; GCN-NEXT: undef [[DEF:%[0-9]+]].sub0:vreg_128 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF:%[0-9]+]].sub1:vreg_128 = IMPLICIT_DEF
; GCN-NEXT: S_BRANCH %bb.2
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
@@ -22,7 +22,7 @@ body: |
; GCN-NEXT: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.2:
- ; GCN-NEXT: [[DEF]].sub2:vreg_128 = COPY undef %3:sreg_32
+ ; GCN-NEXT: [[DEF:%[0-9]+]].sub2:vreg_128 = COPY undef %3:sreg_32
; GCN-NEXT: S_ENDPGM 0, implicit [[DEF]]
bb.0:
undef %0.sub0:vreg_128 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/coalescing-subreg-was-undef-but-became-def.mir b/llvm/test/CodeGen/AMDGPU/coalescing-subreg-was-undef-but-became-def.mir
index de37345a87fb..b988aec3971e 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescing-subreg-was-undef-but-became-def.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalescing-subreg-was-undef-but-became-def.mir
@@ -16,17 +16,17 @@ body: |
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %4.sub2:sgpr_128 = S_MOV_B32 0
- ; CHECK-NEXT: dead undef %7.sub0:sgpr_128 = S_MOV_B32 0
+ ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 0
+ ; CHECK-NEXT: dead undef [[S_MOV_B32_1:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 0
; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit undef $scc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: %4.sub0:sgpr_128 = S_MOV_B32 -1
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 -1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: S_NOP 0, implicit %4
+ ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]]
bb.0:
successors: %bb.1, %bb.2
diff --git a/llvm/test/CodeGen/AMDGPU/coalescing_makes_lanes_undef.mir b/llvm/test/CodeGen/AMDGPU/coalescing_makes_lanes_undef.mir
index 0bfdbf2629a1..cc839ff966ab 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescing_makes_lanes_undef.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalescing_makes_lanes_undef.mir
@@ -19,18 +19,18 @@ body: |
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub0:sgpr_64 = S_MOV_B32 1
- ; CHECK-NEXT: %0.sub1:sgpr_64 = S_MOV_B32 2
+ ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 1
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = S_MOV_B32 2
; CHECK-NEXT: S_BRANCH %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub0:sgpr_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = IMPLICIT_DEF
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
- ; CHECK-NEXT: S_NOP 0, implicit %0.sub0
- ; CHECK-NEXT: S_NOP 0, implicit %0
+ ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]].sub0
+ ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]]
bb.0:
successors: %bb.1, %bb.2
S_CBRANCH_SCC0 %bb.2, implicit undef $scc
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf2.mir b/llvm/test/CodeGen/AMDGPU/collapse-endcf2.mir
index bbc585d01142..d62a63286b3b 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf2.mir
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf2.mir
@@ -37,15 +37,15 @@ body: |
; GCN-NEXT: bb.1:
; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: undef %5.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[COPY]], 9, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
- ; GCN-NEXT: undef %6.sub0:vreg_64 = V_LSHLREV_B32_e32 2, [[COPY1]], implicit $exec
- ; GCN-NEXT: %6.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
- ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY %5.sub1
- ; GCN-NEXT: undef %8.sub0:vreg_64, %9:sreg_64_xexec = V_ADD_CO_U32_e64 %5.sub0, %6.sub0, 0, implicit $exec
- ; GCN-NEXT: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec
- ; GCN-NEXT: %5.sub3:sgpr_128 = S_MOV_B32 61440
- ; GCN-NEXT: %5.sub2:sgpr_128 = S_MOV_B32 0
- ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 1)
+ ; GCN-NEXT: undef [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[COPY]], 9, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+ ; GCN-NEXT: undef [[V_LSHLREV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_LSHLREV_B32_e32 2, [[COPY1]], implicit $exec
+ ; GCN-NEXT: [[V_LSHLREV_B32_e32_:%[0-9]+]].sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+ ; GCN-NEXT: undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub0:vreg_64, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[V_LSHLREV_B32_e32_]].sub0, 0, implicit $exec
+ ; GCN-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]].sub1:vreg_64, dead [[V_ADDC_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 61440
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 0
+ ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 [[V_LSHLREV_B32_e32_]].sub1, [[V_LSHLREV_B32_e32_]], [[S_LOAD_DWORDX2_IMM]], 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 1)
; GCN-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec
; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
@@ -56,17 +56,17 @@ body: |
; GCN-NEXT: bb.2:
; GCN-NEXT: successors: %bb.3(0x80000000)
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: %5.sub0:sgpr_128 = COPY %5.sub2
- ; GCN-NEXT: %5.sub1:sgpr_128 = COPY %5.sub2
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub2
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub1:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub2
; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
- ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, implicit $exec :: (store (s32), addrspace 1)
+ ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], [[V_ADD_CO_U32_e64_]], [[S_LOAD_DWORDX2_IMM]], 0, 4, 0, 0, implicit $exec :: (store (s32), addrspace 1)
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.3:
; GCN-NEXT: successors: %bb.4(0x80000000)
; GCN-NEXT: {{ $}}
; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; GCN-NEXT: dead %16:sreg_64 = SI_CALL [[DEF]], @func, csr_amdgpu
+ ; GCN-NEXT: dead [[SI_CALL:%[0-9]+]]:sreg_64 = SI_CALL [[DEF]], @func, csr_amdgpu
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.4:
; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
diff --git a/llvm/test/CodeGen/AMDGPU/combine-sreg64-inits.mir b/llvm/test/CodeGen/AMDGPU/combine-sreg64-inits.mir
index 9ca28f9f230d..57afb456d603 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-sreg64-inits.mir
+++ b/llvm/test/CodeGen/AMDGPU/combine-sreg64-inits.mir
@@ -7,7 +7,7 @@ tracksRegLiveness: true
body: |
bb.0:
; GCN-LABEL: name: combine_sreg64_inits
- ; GCN: dead %0:sgpr_64 = S_MOV_B64_IMM_PSEUDO 8589934593
+ ; GCN: dead [[S_MOV_B:%[0-9]+]]:sgpr_64 = S_MOV_B64_IMM_PSEUDO 8589934593
; GCN-NEXT: S_NOP 0
undef %0.sub0:sgpr_64 = S_MOV_B32 1
S_NOP 0
@@ -19,7 +19,7 @@ tracksRegLiveness: true
body: |
bb.0:
; GCN-LABEL: name: combine_sreg64_inits_swap
- ; GCN: dead %0:sgpr_64 = S_MOV_B64_IMM_PSEUDO 8589934593
+ ; GCN: dead [[S_MOV_B:%[0-9]+]]:sgpr_64 = S_MOV_B64_IMM_PSEUDO 8589934593
; GCN-NEXT: S_NOP 0
undef %0.sub1:sgpr_64 = S_MOV_B32 2
S_NOP 0
@@ -32,9 +32,9 @@ body: |
bb.0:
; GCN-LABEL: name: sreg64_subreg_copy_0
; GCN: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GCN-NEXT: undef %1.sub0:sgpr_64 = COPY [[DEF]]
- ; GCN-NEXT: %1.sub0:sgpr_64 = S_MOV_B32 1
- ; GCN-NEXT: dead %1.sub1:sgpr_64 = S_MOV_B32 2
+ ; GCN-NEXT: undef [[COPY:%[0-9]+]].sub0:sgpr_64 = COPY [[DEF]]
+ ; GCN-NEXT: [[COPY:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 1
+ ; GCN-NEXT: dead [[COPY:%[0-9]+]].sub1:sgpr_64 = S_MOV_B32 2
%0:sgpr_32 = IMPLICIT_DEF
undef %1.sub0:sgpr_64 = COPY %0:sgpr_32
%1.sub0:sgpr_64 = S_MOV_B32 1
@@ -47,9 +47,9 @@ body: |
bb.0:
; GCN-LABEL: name: sreg64_subreg_copy_1
; GCN: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GCN-NEXT: undef %1.sub0:sgpr_64 = S_MOV_B32 1
- ; GCN-NEXT: %1.sub1:sgpr_64 = COPY [[DEF]]
- ; GCN-NEXT: dead %1.sub1:sgpr_64 = S_MOV_B32 2
+ ; GCN-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 1
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[DEF]]
+ ; GCN-NEXT: dead [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = S_MOV_B32 2
%0:sgpr_32 = IMPLICIT_DEF
undef %1.sub0:sgpr_64 = S_MOV_B32 1
%1.sub1:sgpr_64 = COPY %0:sgpr_32
@@ -62,9 +62,9 @@ body: |
bb.0:
; GCN-LABEL: name: sreg64_subreg_copy_2
; GCN: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GCN-NEXT: undef %1.sub0:sgpr_64 = S_MOV_B32 1
- ; GCN-NEXT: %1.sub1:sgpr_64 = S_MOV_B32 2
- ; GCN-NEXT: dead %1.sub0:sgpr_64 = COPY [[DEF]]
+ ; GCN-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 1
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = S_MOV_B32 2
+ ; GCN-NEXT: dead [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = COPY [[DEF]]
%0:sgpr_32 = IMPLICIT_DEF
undef %1.sub0:sgpr_64 = S_MOV_B32 1
%1.sub1:sgpr_64 = S_MOV_B32 2
@@ -78,10 +78,10 @@ body: |
; GCN: bb.0:
; GCN-NEXT: successors: %bb.1(0x80000000)
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: undef %0.sub0:sgpr_64 = S_MOV_B32 1
+ ; GCN-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 1
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
- ; GCN-NEXT: dead %0.sub1:sgpr_64 = S_MOV_B32 2
+ ; GCN-NEXT: dead [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = S_MOV_B32 2
bb.0:
undef %0.sub0:sgpr_64 = S_MOV_B32 1
@@ -94,9 +94,9 @@ tracksRegLiveness: true
body: |
bb.0:
; GCN-LABEL: name: sreg64_inits_two_defs_sub1
- ; GCN: undef %0.sub0:sgpr_64 = S_MOV_B32 1
- ; GCN-NEXT: %0.sub1:sgpr_64 = S_MOV_B32 2
- ; GCN-NEXT: dead %0.sub1:sgpr_64 = S_MOV_B32 3
+ ; GCN: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 1
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = S_MOV_B32 2
+ ; GCN-NEXT: dead [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = S_MOV_B32 3
undef %0.sub0:sgpr_64 = S_MOV_B32 1
%0.sub1:sgpr_64 = S_MOV_B32 2
%0.sub1:sgpr_64 = S_MOV_B32 3
@@ -107,9 +107,9 @@ tracksRegLiveness: true
body: |
bb.0:
; GCN-LABEL: name: sreg64_inits_two_defs_sub0
- ; GCN: undef %0.sub0:sgpr_64 = S_MOV_B32 1
- ; GCN-NEXT: %0.sub1:sgpr_64 = S_MOV_B32 2
- ; GCN-NEXT: dead %0.sub0:sgpr_64 = S_MOV_B32 3
+ ; GCN: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 1
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = S_MOV_B32 2
+ ; GCN-NEXT: dead [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 3
undef %0.sub0:sgpr_64 = S_MOV_B32 1
%0.sub1:sgpr_64 = S_MOV_B32 2
%0.sub0:sgpr_64 = S_MOV_B32 3
@@ -120,8 +120,8 @@ tracksRegLiveness: true
body: |
bb.0:
; GCN-LABEL: name: sreg64_inits_full_def
- ; GCN: dead undef %1.sub0:sgpr_64 = S_MOV_B32 1
- ; GCN-NEXT: dead %0:sgpr_64 = S_MOV_B64 3
+ ; GCN: dead undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 1
+ ; GCN-NEXT: dead [[S_MOV_B64_:%[0-9]+]]:sgpr_64 = S_MOV_B64 3
undef %0.sub0:sgpr_64 = S_MOV_B32 1
%0:sgpr_64 = S_MOV_B64 3
...
@@ -131,8 +131,8 @@ tracksRegLiveness: true
body: |
bb.0:
; GCN-LABEL: name: sreg64_inits_imp_use
- ; GCN: undef %0.sub0:sgpr_64 = S_MOV_B32 1, implicit $m0
- ; GCN-NEXT: dead %0.sub1:sgpr_64 = S_MOV_B32 2
+ ; GCN: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 1, implicit $m0
+ ; GCN-NEXT: dead [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = S_MOV_B32 2
undef %0.sub0:sgpr_64 = S_MOV_B32 1, implicit $m0
%0.sub1:sgpr_64 = S_MOV_B32 2
...
@@ -142,8 +142,8 @@ tracksRegLiveness: true
body: |
bb.0:
; GCN-LABEL: name: sreg64_inits_imp_def
- ; GCN: undef %0.sub0:sgpr_64 = S_MOV_B32 1, implicit-def $scc
- ; GCN-NEXT: dead %0.sub1:sgpr_64 = S_MOV_B32 2
+ ; GCN: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 1, implicit-def $scc
+ ; GCN-NEXT: dead [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = S_MOV_B32 2
undef %0.sub0:sgpr_64 = S_MOV_B32 1, implicit-def $scc
%0.sub1:sgpr_64 = S_MOV_B32 2
...
diff --git a/llvm/test/CodeGen/AMDGPU/commute-vop3.mir b/llvm/test/CodeGen/AMDGPU/commute-vop3.mir
index 64a75e5f3a90..9a8805effb5b 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-vop3.mir
+++ b/llvm/test/CodeGen/AMDGPU/commute-vop3.mir
@@ -24,6 +24,7 @@ body: |
; GFX9-NEXT: [[V_XAD_U32_e64_:%[0-9]+]]:vgpr_32 = V_XAD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
; GFX9-NEXT: [[V_SUB_I32_e64_:%[0-9]+]]:vgpr_32 = V_SUB_I32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
; GFX9-NEXT: [[V_SUB_I32_e64_1:%[0-9]+]]:vgpr_32 = V_SUB_I32_e64 [[COPY1]], [[COPY]], 0, implicit $exec
+ ;
; GFX10-LABEL: name: commute_vop3
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll b/llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll
new file mode 100644
index 000000000000..4ae122bb3546
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll
@@ -0,0 +1,94 @@
+; NOTE: There must be no spill reload inside the loop starting with LBB0_1:
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck %s
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9-p9:192:256:256:32"
+target triple = "amdgcn-amd-amdhsa"
+
+define amdgpu_kernel void @foo(ptr %.sroa.1.0.copyload, ptr %0, ptr %1, ptr %2, ptr %3, ptr %4, ptr %5, ptr %6, ptr %7, ptr %8, ptr %9, ptr %10, ptr %11, ptr %12, ptr %13, ptr %14, ptr %15, ptr %16, ptr %17, ptr %18, ptr %19, ptr %20, ptr %21, ptr %22, ptr %23, ptr %24, ptr %25, ptr %26, ptr %27, ptr %28, ptr %29, ptr %30, ptr %31, ptr %32, ptr %33, double %34, double %35, double %36, float %37, float %38, float %39, float %40, ptr %41) {
+; CHECK-LABEL: foo:
+; CHECK-LABEL: .LBB0_1
+; CHECK-NOT: buffer_load_dword {{v[0-9]+}}
+
+.lr.ph:
+ %.pre = load double, ptr null, align 8
+ br label %42
+
+42: ; preds = %42, %.lr.ph
+ %.0.i4402 = phi i32 [ 1, %.lr.ph ], [ 0, %42 ]
+ %43 = zext i32 %.0.i4402 to i64
+ %44 = load double, ptr %2, align 8
+ %45 = load double, ptr %4, align 8
+ %46 = load double, ptr %7, align 8
+ %47 = load double, ptr %13, align 8
+ %48 = load double, ptr %15, align 8
+ %49 = load double, ptr %17, align 8
+ %50 = load double, ptr %19, align 8
+ %51 = load double, ptr %18, align 8
+ %52 = load double, ptr %27, align 8
+ %53 = load double, ptr %23, align 8
+ %54 = load double, ptr %31, align 8
+ %55 = load double, ptr %33, align 8
+ %56 = load double, ptr %25, align 8
+ %57 = load double, ptr %16, align 8
+ %58 = fpext float %40 to double
+ %59 = fmul double %52, %58
+ %60 = fadd double %59, %51
+ %61 = fsub double %60, %48
+ %62 = fmul double 0.000000e+00, %36
+ %63 = fsub double %61, %62
+ %64 = fadd double %49, %63
+ %65 = fptrunc double %64 to float
+ %66 = fsub double 0.000000e+00, %34
+ %67 = fpext float %39 to double
+ %68 = fmul double %53, %67
+ %69 = fsub double %66, %68
+ %70 = fadd double %50, %69
+ %71 = fptrunc double %70 to float
+ store float 0.000000e+00, ptr %30, align 4
+ store float 0.000000e+00, ptr %26, align 4
+ %72 = getelementptr float, ptr %41, i64 %43
+ store float %38, ptr %72, align 4
+ store float %65, ptr %29, align 4
+ store float %71, ptr %14, align 4
+ store float %39, ptr %3, align 4
+ store float %39, ptr %11, align 4
+ %73 = fsub double %46, %44
+ %74 = fptrunc double %73 to float
+ %75 = fsub double %47, %45
+ %76 = fptrunc double %75 to float
+ %77 = fadd float %74, %76
+ %78 = fpext float %37 to double
+ %79 = fmul contract double %56, 0.000000e+00
+ %80 = fsub contract double %34, %79
+ %81 = fpext float %77 to double
+ %82 = fmul double %.pre, %81
+ %83 = fsub double %80, %82
+ %84 = fpext float %38 to double
+ %85 = fmul double %57, %84
+ %86 = fsub double %83, %85
+ %87 = fptrunc double %86 to float
+ %88 = fmul double %34, 0.000000e+00
+ %89 = fmul double %54, %78
+ %90 = fadd double %89, %88
+ %91 = fsub double %90, %55
+ %92 = fmul double 0.000000e+00, %35
+ %93 = fsub double %91, %92
+ %94 = fmul double %34, %34
+ %95 = fadd double %93, %94
+ %96 = fptrunc double %95 to float
+ store float %87, ptr %1, align 4
+ store float %37, ptr %21, align 4
+ store float %96, ptr %0, align 4
+ store float 0.000000e+00, ptr %9, align 4
+ store float 0.000000e+00, ptr %32, align 4
+ store float 0.000000e+00, ptr %20, align 4
+ store float 0.000000e+00, ptr %22, align 4
+ store float 0.000000e+00, ptr %5, align 4
+ store float 0.000000e+00, ptr %28, align 4
+ store float 0.000000e+00, ptr %12, align 4
+ store float 0.000000e+00, ptr %6, align 4
+ store float 0.000000e+00, ptr %8, align 4
+ store float 0.000000e+00, ptr %.sroa.1.0.copyload, align 4
+ store float %37, ptr %10, align 4
+ store float 0.000000e+00, ptr %24, align 4
+ br label %42
+}
diff --git a/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir b/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir
index ac595abf9269..7c21b3e08580 100644
--- a/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir
+++ b/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir
@@ -16,14 +16,17 @@ body: |
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec
+ ;
; GFX90A-LABEL: name: copy_v64_to_v64
; GFX90A: liveins: $vgpr2_vgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec
+ ;
; GFX940-LABEL: name: copy_v64_to_v64
; GFX940: liveins: $vgpr2_vgpr3
; GFX940-NEXT: {{ $}}
; GFX940-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $vgpr2_vgpr3, implicit $exec, implicit $exec
+ ;
; GFX10-LABEL: name: copy_v64_to_v64
; GFX10: liveins: $vgpr2_vgpr3
; GFX10-NEXT: {{ $}}
@@ -43,14 +46,17 @@ body: |
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $exec
+ ;
; GFX90A-LABEL: name: copy_s64_to_v64
; GFX90A: liveins: $sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr2_sgpr3, 12, $sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $exec
+ ;
; GFX940-LABEL: name: copy_s64_to_v64
; GFX940: liveins: $sgpr2_sgpr3
; GFX940-NEXT: {{ $}}
; GFX940-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $sgpr2_sgpr3, implicit $exec, implicit $exec
+ ;
; GFX10-LABEL: name: copy_s64_to_v64
; GFX10: liveins: $sgpr2_sgpr3
; GFX10-NEXT: {{ $}}
@@ -70,16 +76,19 @@ body: |
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3
; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec
+ ;
; GFX90A-LABEL: name: copy_a64_to_v64
; GFX90A: liveins: $agpr2_agpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3
; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec
+ ;
; GFX940-LABEL: name: copy_a64_to_v64
; GFX940: liveins: $agpr2_agpr3
; GFX940-NEXT: {{ $}}
; GFX940-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3
; GFX940-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec
+ ;
; GFX10-LABEL: name: copy_a64_to_v64
; GFX10: liveins: $agpr2_agpr3
; GFX10-NEXT: {{ $}}
@@ -101,16 +110,19 @@ body: |
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5
; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec
+ ;
; GFX90A-LABEL: name: copy_v128_to_v128_fwd
; GFX90A: liveins: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr4_vgpr5, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec
+ ;
; GFX940-LABEL: name: copy_v128_to_v128_fwd
; GFX940: liveins: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX940-NEXT: {{ $}}
; GFX940-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5
; GFX940-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr4_vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec
+ ;
; GFX10-LABEL: name: copy_v128_to_v128_fwd
; GFX10: liveins: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX10-NEXT: {{ $}}
@@ -134,16 +146,19 @@ body: |
; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
+ ;
; GFX90A-LABEL: name: copy_v128_to_v128_back
; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5
; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr0_vgpr1, 12, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
+ ;
; GFX940-LABEL: name: copy_v128_to_v128_back
; GFX940: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX940-NEXT: {{ $}}
; GFX940-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; GFX940-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr0_vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
+ ;
; GFX10-LABEL: name: copy_v128_to_v128_back
; GFX10: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX10-NEXT: {{ $}}
@@ -166,18 +181,21 @@ body: |
; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec
+ ;
; GFX90A-LABEL: name: copy_v96_to_v96
; GFX90A: liveins: $vgpr4_vgpr5_vgpr6
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6
; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6
; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec
+ ;
; GFX940-LABEL: name: copy_v96_to_v96
; GFX940: liveins: $vgpr4_vgpr5_vgpr6
; GFX940-NEXT: {{ $}}
; GFX940-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6
; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6
; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec
+ ;
; GFX10-LABEL: name: copy_v96_to_v96
; GFX10: liveins: $vgpr4_vgpr5_vgpr6
; GFX10-NEXT: {{ $}}
@@ -198,14 +216,17 @@ body: |
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec
+ ;
; GFX90A-LABEL: name: copy_v64_to_v64_undef_sub0
; GFX90A: liveins: $vgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec
+ ;
; GFX940-LABEL: name: copy_v64_to_v64_undef_sub0
; GFX940: liveins: $vgpr3
; GFX940-NEXT: {{ $}}
; GFX940-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $vgpr2_vgpr3, implicit $exec, implicit $exec
+ ;
; GFX10-LABEL: name: copy_v64_to_v64_undef_sub0
; GFX10: liveins: $vgpr3
; GFX10-NEXT: {{ $}}
@@ -225,14 +246,17 @@ body: |
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec
+ ;
; GFX90A-LABEL: name: copy_v64_to_v64_undef_sub1
; GFX90A: liveins: $vgpr2
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec
+ ;
; GFX940-LABEL: name: copy_v64_to_v64_undef_sub1
; GFX940: liveins: $vgpr2
; GFX940-NEXT: {{ $}}
; GFX940-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $vgpr2_vgpr3, implicit $exec, implicit $exec
+ ;
; GFX10-LABEL: name: copy_v64_to_v64_undef_sub1
; GFX10: liveins: $vgpr2
; GFX10-NEXT: {{ $}}
@@ -254,16 +278,19 @@ body: |
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7
; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7
+ ;
; GFX90A-LABEL: name: copy_s128_to_v128_killed
; GFX90A: liveins: $sgpr4_sgpr5_sgpr6_sgpr7
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr4_sgpr5, 12, $sgpr4_sgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3
; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $sgpr6_sgpr7, 12, $sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7
+ ;
; GFX940-LABEL: name: copy_s128_to_v128_killed
; GFX940: liveins: $sgpr4_sgpr5_sgpr6_sgpr7
; GFX940-NEXT: {{ $}}
; GFX940-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr4_sgpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr4_sgpr5_sgpr6_sgpr7
; GFX940-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $sgpr6_sgpr7, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7
+ ;
; GFX10-LABEL: name: copy_s128_to_v128_killed
; GFX10: liveins: $sgpr4_sgpr5_sgpr6_sgpr7
; GFX10-NEXT: {{ $}}
@@ -285,16 +312,19 @@ body: |
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec
+ ;
; GFX90A-LABEL: name: copy_v64_to_v64_unaligned
; GFX90A: liveins: $vgpr2_vgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3
; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec
+ ;
; GFX940-LABEL: name: copy_v64_to_v64_unaligned
; GFX940: liveins: $vgpr2_vgpr3
; GFX940-NEXT: {{ $}}
; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3
; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec
+ ;
; GFX10-LABEL: name: copy_v64_to_v64_unaligned
; GFX10: liveins: $vgpr2_vgpr3
; GFX10-NEXT: {{ $}}
@@ -314,16 +344,19 @@ body: |
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec
+ ;
; GFX90A-LABEL: name: copy_v64_unaligned_to_v64
; GFX90A: liveins: $vgpr3_vgpr4
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4
; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec
+ ;
; GFX940-LABEL: name: copy_v64_unaligned_to_v64
; GFX940: liveins: $vgpr3_vgpr4
; GFX940-NEXT: {{ $}}
; GFX940-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4
; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec
+ ;
; GFX10-LABEL: name: copy_v64_unaligned_to_v64
; GFX10: liveins: $vgpr3_vgpr4
; GFX10-NEXT: {{ $}}
@@ -345,6 +378,7 @@ body: |
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec
+ ;
; GFX90A-LABEL: name: copy_v128_to_v128_unaligned
; GFX90A: liveins: $vgpr8_vgpr9_vgpr10_vgpr11
; GFX90A-NEXT: {{ $}}
@@ -352,6 +386,7 @@ body: |
; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec
+ ;
; GFX940-LABEL: name: copy_v128_to_v128_unaligned
; GFX940: liveins: $vgpr8_vgpr9_vgpr10_vgpr11
; GFX940-NEXT: {{ $}}
@@ -359,6 +394,7 @@ body: |
; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
; GFX940-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11
; GFX940-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec
+ ;
; GFX10-LABEL: name: copy_v128_to_v128_unaligned
; GFX10: liveins: $vgpr8_vgpr9_vgpr10_vgpr11
; GFX10-NEXT: {{ $}}
@@ -382,6 +418,7 @@ body: |
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10
; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec
+ ;
; GFX90A-LABEL: name: copy_v128_unaligned_to_v128
; GFX90A: liveins: $vgpr7_vgpr8_vgpr9_vgpr10
; GFX90A-NEXT: {{ $}}
@@ -389,6 +426,7 @@ body: |
; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10
; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10
; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec
+ ;
; GFX940-LABEL: name: copy_v128_unaligned_to_v128
; GFX940: liveins: $vgpr7_vgpr8_vgpr9_vgpr10
; GFX940-NEXT: {{ $}}
@@ -396,6 +434,7 @@ body: |
; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10
; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10
; GFX940-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec
+ ;
; GFX10-LABEL: name: copy_v128_unaligned_to_v128
; GFX10: liveins: $vgpr7_vgpr8_vgpr9_vgpr10
; GFX10-NEXT: {{ $}}
@@ -417,16 +456,19 @@ body: |
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec
+ ;
; GFX90A-LABEL: name: copy_s64_to_v64_unaligned
; GFX90A: liveins: $sgpr8_sgpr9
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9
; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec
+ ;
; GFX940-LABEL: name: copy_s64_to_v64_unaligned
; GFX940: liveins: $sgpr8_sgpr9
; GFX940-NEXT: {{ $}}
; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9
; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec
+ ;
; GFX10-LABEL: name: copy_s64_to_v64_unaligned
; GFX10: liveins: $sgpr8_sgpr9
; GFX10-NEXT: {{ $}}
@@ -448,6 +490,7 @@ body: |
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec
+ ;
; GFX90A-LABEL: name: copy_s128_to_v128_unaligned
; GFX90A: liveins: $sgpr8_sgpr9_sgpr10_sgpr11
; GFX90A-NEXT: {{ $}}
@@ -455,6 +498,7 @@ body: |
; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec
+ ;
; GFX940-LABEL: name: copy_s128_to_v128_unaligned
; GFX940: liveins: $sgpr8_sgpr9_sgpr10_sgpr11
; GFX940-NEXT: {{ $}}
@@ -462,6 +506,7 @@ body: |
; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
; GFX940-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
; GFX940-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec
+ ;
; GFX10-LABEL: name: copy_s128_to_v128_unaligned
; GFX10: liveins: $sgpr8_sgpr9_sgpr10_sgpr11
; GFX10-NEXT: {{ $}}
@@ -484,18 +529,21 @@ body: |
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10
; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec
+ ;
; GFX90A-LABEL: name: copy_v96_to_v96_unaligned
; GFX90A: liveins: $vgpr8_vgpr9_vgpr10
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10
; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10
; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec
+ ;
; GFX940-LABEL: name: copy_v96_to_v96_unaligned
; GFX940: liveins: $vgpr8_vgpr9_vgpr10
; GFX940-NEXT: {{ $}}
; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10
; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10
; GFX940-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec
+ ;
; GFX10-LABEL: name: copy_v96_to_v96_unaligned
; GFX10: liveins: $vgpr8_vgpr9_vgpr10
; GFX10-NEXT: {{ $}}
@@ -517,18 +565,21 @@ body: |
; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec
+ ;
; GFX90A-LABEL: name: copy_v96_unaligned_to_v96
; GFX90A: liveins: $vgpr7_vgpr8_vgpr9
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9
; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9
; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec
+ ;
; GFX940-LABEL: name: copy_v96_unaligned_to_v96
; GFX940: liveins: $vgpr7_vgpr8_vgpr9
; GFX940-NEXT: {{ $}}
; GFX940-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9
; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9
; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec
+ ;
; GFX10-LABEL: name: copy_v96_unaligned_to_v96
; GFX10: liveins: $vgpr7_vgpr8_vgpr9
; GFX10-NEXT: {{ $}}
@@ -550,18 +601,21 @@ body: |
; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec
+ ;
; GFX90A-LABEL: name: copy_s96_to_v96
; GFX90A: liveins: $sgpr0_sgpr1_sgpr2
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2
; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec
+ ;
; GFX940-LABEL: name: copy_s96_to_v96
; GFX940: liveins: $sgpr0_sgpr1_sgpr2
; GFX940-NEXT: {{ $}}
; GFX940-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2
; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec
+ ;
; GFX10-LABEL: name: copy_s96_to_v96
; GFX10: liveins: $sgpr0_sgpr1_sgpr2
; GFX10-NEXT: {{ $}}
@@ -583,18 +637,21 @@ body: |
; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2
; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec
+ ;
; GFX90A-LABEL: name: copy_s96_to_v96_unaligned
; GFX90A: liveins: $sgpr0_sgpr1_sgpr2
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2
; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec
+ ;
; GFX940-LABEL: name: copy_s96_to_v96_unaligned
; GFX940: liveins: $sgpr0_sgpr1_sgpr2
; GFX940-NEXT: {{ $}}
; GFX940-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2
; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2
; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec
+ ;
; GFX10-LABEL: name: copy_s96_to_v96_unaligned
; GFX10: liveins: $sgpr0_sgpr1_sgpr2
; GFX10-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir b/llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir
index 4d06f4a19597..004abb4bb0cc 100644
--- a/llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir
+++ b/llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir
@@ -18,20 +18,20 @@ body: |
; GCN: bb.0:
; GCN-NEXT: successors: %bb.1(0x80000000)
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: %3:vgpr_32 = nofpexcept V_TRUNC_F32_e32 undef %4:vgpr_32, implicit $mode, implicit $exec
- ; GCN-NEXT: %5:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 %3, implicit $mode, implicit $exec
- ; GCN-NEXT: [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 4, %5, implicit $exec
- ; GCN-NEXT: undef %11.sub0:vreg_128 = V_MUL_LO_I32_e64 [[V_LSHRREV_B32_e32_]], 3, implicit $exec
- ; GCN-NEXT: %11.sub3:vreg_128 = COPY %11.sub0
+ ; GCN-NEXT: [[V_TRUNC_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_TRUNC_F32_e32 undef %4:vgpr_32, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_TRUNC_F32_e32_]], implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 4, [[V_CVT_U32_F32_e32_]], implicit $exec
+ ; GCN-NEXT: undef [[V_MUL_LO_I32_e64_:%[0-9]+]].sub0:vreg_128 = V_MUL_LO_I32_e64 [[V_LSHRREV_B32_e32_]], 3, implicit $exec
+ ; GCN-NEXT: [[V_MUL_LO_I32_e64_:%[0-9]+]].sub3:vreg_128 = COPY [[V_MUL_LO_I32_e64_]].sub0
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
; GCN-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %11
- ; GCN-NEXT: %11.sub3:vreg_128 = V_ADD_U32_e32 target-flags(amdgpu-rel32-lo) 1, [[COPY]].sub3, implicit $exec
- ; GCN-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xm0 = S_ADD_I32 [[S_ADD_I32_]], 1, implicit-def dead $scc
- ; GCN-NEXT: S_CMP_LT_U32 [[S_ADD_I32_]], 3, implicit-def $scc
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY [[V_MUL_LO_I32_e64_]]
+ ; GCN-NEXT: [[V_MUL_LO_I32_e64_:%[0-9]+]].sub3:vreg_128 = V_ADD_U32_e32 target-flags(amdgpu-rel32-lo) 1, [[COPY]].sub3, implicit $exec
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_ADD_I32 [[S_MOV_B32_]], 1, implicit-def dead $scc
+ ; GCN-NEXT: S_CMP_LT_U32 [[S_MOV_B32_]], 3, implicit-def $scc
; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit killed $scc
; GCN-NEXT: S_BRANCH %bb.2
; GCN-NEXT: {{ $}}
@@ -44,10 +44,10 @@ body: |
; GCN-NEXT: bb.3:
; GCN-NEXT: successors: %bb.4(0x80000000)
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: dead %16:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]].sub3, undef %17:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from constant-pool, align 1, addrspace 4)
- ; GCN-NEXT: dead %18:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+ ; GCN-NEXT: dead [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]].sub3, undef %17:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from constant-pool, align 1, addrspace 4)
+ ; GCN-NEXT: dead [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, -1, implicit-def dead $scc
- ; GCN-NEXT: dead %20:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: dead [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.4:
; GCN-NEXT: successors: %bb.4(0x7c000000), %bb.6(0x04000000)
@@ -57,13 +57,13 @@ body: |
; GCN-NEXT: S_BRANCH %bb.6
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.5:
- ; GCN-NEXT: %21:vgpr_32 = nofpexcept V_MUL_F32_e32 target-flags(amdgpu-gotprel) 0, %11.sub0, implicit $mode, implicit $exec
- ; GCN-NEXT: %22:vgpr_32 = nofpexcept V_MIN_F32_e32 1106771968, %21, implicit $mode, implicit $exec
- ; GCN-NEXT: %23:vgpr_32 = nnan arcp contract reassoc nofpexcept V_MAD_F32_e64 0, %22, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: %24:vgpr_32 = nnan arcp contract reassoc nofpexcept V_MAD_F32_e64 0, %23, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: %25:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %24, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: %26:vgpr_32 = nofpexcept V_CVT_PKRTZ_F16_F32_e64 0, %25, 0, undef %27:vgpr_32, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: EXP_DONE 0, %26, undef %28:vgpr_32, undef %29:vgpr_32, undef %30:vgpr_32, -1, -1, 15, implicit $exec
+ ; GCN-NEXT: [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 target-flags(amdgpu-gotprel) 0, [[V_MUL_LO_I32_e64_]].sub0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_MIN_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MIN_F32_e32 1106771968, [[V_MUL_F32_e32_]], implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_MAD_F32_e64_:%[0-9]+]]:vgpr_32 = nnan arcp contract reassoc nofpexcept V_MAD_F32_e64 0, [[V_MIN_F32_e32_]], 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_MAD_F32_e64_1:%[0-9]+]]:vgpr_32 = nnan arcp contract reassoc nofpexcept V_MAD_F32_e64 0, [[V_MAD_F32_e64_]], 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_MAD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_MAD_F32_e64_1]], 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_CVT_PKRTZ_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_PKRTZ_F16_F32_e64 0, [[V_MAD_F32_e64_2]], 0, undef %27:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: EXP_DONE 0, [[V_CVT_PKRTZ_F16_F32_e64_]], undef %28:vgpr_32, undef %29:vgpr_32, undef %30:vgpr_32, -1, -1, 15, implicit $exec
; GCN-NEXT: S_ENDPGM 0
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.6:
diff --git a/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir b/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir
index d284813c3684..00eb2b7e1aa8 100644
--- a/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir
+++ b/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir
@@ -57,37 +57,37 @@ body: |
; CHECK-NEXT: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
- ; CHECK-NEXT: undef %11.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %17.sub0:vreg_64, %18:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF4]].sub0, [[DEF6]].sub0, 0, implicit $exec
- ; CHECK-NEXT: dead undef %17.sub1:vreg_64, dead %19:sreg_64_xexec = V_ADDC_U32_e64 [[DEF4]].sub1, [[DEF6]].sub1, %18, 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF1]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub0:vreg_64, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF4]].sub0, [[DEF6]].sub0, 0, implicit $exec
+ ; CHECK-NEXT: dead undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub1:vreg_64, dead [[V_ADDC_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[DEF4]].sub1, [[DEF6]].sub1, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+ ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF1]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
; CHECK-NEXT: dead [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF3]]
; CHECK-NEXT: dead [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF5]].sub1
- ; CHECK-NEXT: dead [[COPY6:%[0-9]+]]:vgpr_32 = COPY %11.sub0
+ ; CHECK-NEXT: dead [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B32_e32_]].sub0
; CHECK-NEXT: dead [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 4, [[DEF7]], implicit $exec
; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF8]], 288, 0, implicit $exec :: (store (s64), addrspace 1)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
+ ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef [[DEF5]].sub1:vreg_64 = COPY [[COPY5]]
+ ; CHECK-NEXT: undef [[DEF5:%[0-9]+]].sub1:vreg_64 = COPY [[COPY5]]
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4:
; CHECK-NEXT: successors: %bb.5(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: dead [[COPY8:%[0-9]+]]:sreg_64 = COPY $exec
+ ; CHECK-NEXT: dead [[COPY7:%[0-9]+]]:sreg_64 = COPY $exec
; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY1]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
; CHECK-NEXT: DBG_VALUE [[GLOBAL_LOAD_DWORDX4_]], $noreg, <0x{{[0-9a-f]+}}>, !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), debug-location !DILocation(line: 0, scope: <0x{{[0-9a-f]+}}>)
; CHECK-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/dead_bundle.mir b/llvm/test/CodeGen/AMDGPU/dead_bundle.mir
index 19e50203c191..dd9d6a1c788e 100644
--- a/llvm/test/CodeGen/AMDGPU/dead_bundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/dead_bundle.mir
@@ -24,9 +24,9 @@ body: |
; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_BUFFER_LOAD_DWORDX8_IMM undef renamable $sgpr0_sgpr1_sgpr2_sgpr3, 416, 0 :: (dereferenceable invariant load (s256), align 4)
; CHECK-NEXT: renamable $sgpr3 = COPY killed renamable $sgpr7
; CHECK-NEXT: renamable $sgpr5 = COPY renamable $sgpr9
- ; CHECK-NEXT: dead undef %4.sub0:vreg_64 = COPY renamable $sgpr3
- ; CHECK-NEXT: dead undef %7.sub1:vreg_64 = COPY killed renamable $sgpr5
- ; CHECK-NEXT: dead [[IMAGE_SAMPLE_V1_V2_gfx11_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx11 undef %4, undef renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
+ ; CHECK-NEXT: dead undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY renamable $sgpr3
+ ; CHECK-NEXT: dead undef [[COPY1:%[0-9]+]].sub1:vreg_64 = COPY killed renamable $sgpr5
+ ; CHECK-NEXT: dead [[IMAGE_SAMPLE_V1_V2_gfx11_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx11 undef [[COPY]], undef renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
; CHECK-NEXT: S_ENDPGM 0
undef %8.sub3:sgpr_128 = IMPLICIT_DEF
undef %8.sub1:sgpr_128 = COPY undef $sgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir
index 66276c756db4..cdd4c72f3717 100644
--- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir
+++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir
@@ -61,8 +61,8 @@ body: |
; CHECK-NEXT: [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $mode, implicit $exec
; CHECK-NEXT: [[V_MUL_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec
- ; CHECK-NEXT: dead %23:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MUL_F32_e32_4]], [[DEF13]], implicit $mode, implicit $exec
- ; CHECK-NEXT: dead [[V_MOV_B32_e32_1]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[V_ADD_F32_e32_]], [[COPY]], [[V_MOV_B32_e32_1]], implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead [[V_MUL_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MUL_F32_e32_4]], [[DEF13]], implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[V_ADD_F32_e32_]], [[COPY]], [[V_MOV_B32_e32_1]], implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF14:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: $sgpr4 = IMPLICIT_DEF
; CHECK-NEXT: $vgpr0 = COPY [[DEF11]]
@@ -73,10 +73,10 @@ body: |
; CHECK-NEXT: $vgpr2 = COPY [[V_MUL_F32_e32_3]]
; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL [[DEF14]], @foo, csr_amdgpu, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $sgpr4, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit-def $vgpr0
; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MUL_F32_e32_]], [[DEF8]], implicit $mode, implicit $exec
- ; CHECK-NEXT: [[V_MAC_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[DEF12]], [[DEF9]], [[V_MAC_F32_e32_]], implicit $mode, implicit $exec
- ; CHECK-NEXT: dead %26:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_MAC_F32_e32_]], 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: dead %27:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_MAC_F32_e32_]], 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: dead %28:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_MAC_F32_e32_]], 0, [[DEF6]], 0, [[DEF3]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[DEF12]], [[DEF9]], [[V_ADD_F32_e32_1]], implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead [[V_MAD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead [[V_MAD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead [[V_MAD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF6]], 0, [[DEF3]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: GLOBAL_STORE_DWORD [[DEF]], [[DEF10]], 0, 0, implicit $exec
; CHECK-NEXT: S_ENDPGM 0
bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir b/llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir
index 41b85ce5c6e0..760ae6032230 100644
--- a/llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir
+++ b/llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir
@@ -29,33 +29,33 @@ body: |
; CHECK-NEXT: dead [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK-NEXT: S_NOP 0, implicit [[DEF1]]
; CHECK-NEXT: S_NOP 0, implicit [[DEF1]]
- ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vreg_1024_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_1024_align2 = IMPLICIT_DEF
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %6.sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:av_1024_align2 = COPY [[COPY]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 {
- ; CHECK-NEXT: internal %6.sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16:av_1024_align2 = COPY [[COPY]].sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16
- ; CHECK-NEXT: internal %6.sub29_sub30_sub31:av_1024_align2 = COPY [[COPY]].sub29_sub30_sub31
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:av_1024_align2 = COPY [[COPY]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 {
+ ; CHECK-NEXT: internal [[COPY1]].sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16:av_1024_align2 = COPY [[COPY]].sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16
+ ; CHECK-NEXT: internal [[COPY1]].sub29_sub30_sub31:av_1024_align2 = COPY [[COPY]].sub29_sub30_sub31
; CHECK-NEXT: }
- ; CHECK-NEXT: %6.sub0:av_1024_align2 = IMPLICIT_DEF
- ; CHECK-NEXT: S_NOP 0, implicit %6.sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub0:av_1024_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]].sub0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.4(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: S_NOP 0, implicit %6
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4:
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[DEF3:%[0-9]+]]:av_1024_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_1024_align2 = IMPLICIT_DEF
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit undef $vcc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.5:
- ; CHECK-NEXT: undef %4.sub0:vreg_1024_align2 = COPY [[DEF]]
- ; CHECK-NEXT: S_NOP 0, implicit %4
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:vreg_1024_align2 = COPY [[DEF]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
bb.0:
%0:vgpr_32 = IMPLICIT_DEF
%1:vreg_1024_align2 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll b/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll
index cf610f9436ac..2d6ae31f8e58 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll
@@ -11,24 +11,24 @@ define float @fdiv_f32(float %a, float %b) #0 {
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GCN-NEXT: %4:vgpr_32, %5:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: %6:vgpr_32, %7:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: %8:vgpr_32 = nofpexcept V_RCP_F32_e64 0, %6, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_DIV_SCALE_F32_e64_:%[0-9]+]]:vgpr_32, [[V_DIV_SCALE_F32_e64_1:%[0-9]+]]:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_DIV_SCALE_F32_e64_2:%[0-9]+]]:vgpr_32, [[V_DIV_SCALE_F32_e64_3:%[0-9]+]]:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, [[V_DIV_SCALE_F32_e64_2]], 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3
; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GCN-NEXT: S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode
- ; GCN-NEXT: %12:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %6, 0, %8, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: %13:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %12, 0, %8, 0, %8, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: %14:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %4, 0, %13, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: %15:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %6, 0, %14, 0, %4, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: %16:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %15, 0, %13, 0, %14, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: %17:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %6, 0, %16, 0, %4, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_FMA_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 1, [[V_DIV_SCALE_F32_e64_2]], 0, [[V_RCP_F32_e64_]], 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_FMA_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed [[V_FMA_F32_e64_]], 0, [[V_RCP_F32_e64_]], 0, [[V_RCP_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[V_DIV_SCALE_F32_e64_]], 0, [[V_FMA_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_FMA_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 1, [[V_DIV_SCALE_F32_e64_2]], 0, [[V_MUL_F32_e64_]], 0, [[V_DIV_SCALE_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_FMA_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed [[V_FMA_F32_e64_2]], 0, [[V_FMA_F32_e64_1]], 0, [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_FMA_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 1, [[V_DIV_SCALE_F32_e64_2]], 0, [[V_FMA_F32_e64_3]], 0, [[V_DIV_SCALE_F32_e64_]], 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_SETREG_B32_mode killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode
- ; GCN-NEXT: $vcc = COPY %5
- ; GCN-NEXT: %18:vgpr_32 = nofpexcept V_DIV_FMAS_F32_e64 0, killed %17, 0, %13, 0, %16, 0, 0, implicit $mode, implicit $vcc, implicit $exec
- ; GCN-NEXT: %19:vgpr_32 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed %18, 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr0 = COPY %19
+ ; GCN-NEXT: $vcc = COPY [[V_DIV_SCALE_F32_e64_1]]
+ ; GCN-NEXT: [[V_DIV_FMAS_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_DIV_FMAS_F32_e64 0, killed [[V_FMA_F32_e64_4]], 0, [[V_FMA_F32_e64_1]], 0, [[V_FMA_F32_e64_3]], 0, 0, implicit $mode, implicit $vcc, implicit $exec
+ ; GCN-NEXT: [[V_DIV_FIXUP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed [[V_DIV_FMAS_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr0 = COPY [[V_DIV_FIXUP_F32_e64_]]
; GCN-NEXT: SI_RETURN implicit $vgpr0
entry:
%fdiv = fdiv float %a, %b
@@ -42,24 +42,24 @@ define float @fdiv_nnan_f32(float %a, float %b) #0 {
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GCN-NEXT: %4:vgpr_32, %5:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: %6:vgpr_32, %7:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: %8:vgpr_32 = nnan nofpexcept V_RCP_F32_e64 0, %6, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_DIV_SCALE_F32_e64_:%[0-9]+]]:vgpr_32, [[V_DIV_SCALE_F32_e64_1:%[0-9]+]]:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_DIV_SCALE_F32_e64_2:%[0-9]+]]:vgpr_32, [[V_DIV_SCALE_F32_e64_3:%[0-9]+]]:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = nnan nofpexcept V_RCP_F32_e64 0, [[V_DIV_SCALE_F32_e64_2]], 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3
; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GCN-NEXT: S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode
- ; GCN-NEXT: %12:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %6, 0, %8, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: %13:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %12, 0, %8, 0, %8, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: %14:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, %4, 0, %13, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: %15:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %6, 0, %14, 0, %4, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: %16:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %15, 0, %13, 0, %14, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: %17:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %6, 0, %16, 0, %4, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_FMA_F32_e64_:%[0-9]+]]:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, [[V_DIV_SCALE_F32_e64_2]], 0, [[V_RCP_F32_e64_]], 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_FMA_F32_e64_1:%[0-9]+]]:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed [[V_FMA_F32_e64_]], 0, [[V_RCP_F32_e64_]], 0, [[V_RCP_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, [[V_DIV_SCALE_F32_e64_]], 0, [[V_FMA_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_FMA_F32_e64_2:%[0-9]+]]:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, [[V_DIV_SCALE_F32_e64_2]], 0, [[V_MUL_F32_e64_]], 0, [[V_DIV_SCALE_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_FMA_F32_e64_3:%[0-9]+]]:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed [[V_FMA_F32_e64_2]], 0, [[V_FMA_F32_e64_1]], 0, [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_FMA_F32_e64_4:%[0-9]+]]:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, [[V_DIV_SCALE_F32_e64_2]], 0, [[V_FMA_F32_e64_3]], 0, [[V_DIV_SCALE_F32_e64_]], 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_SETREG_B32_mode killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode
- ; GCN-NEXT: $vcc = COPY %5
- ; GCN-NEXT: %18:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32_e64 0, killed %17, 0, %13, 0, %16, 0, 0, implicit $mode, implicit $vcc, implicit $exec
- ; GCN-NEXT: %19:vgpr_32 = nnan nofpexcept V_DIV_FIXUP_F32_e64 0, killed %18, 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr0 = COPY %19
+ ; GCN-NEXT: $vcc = COPY [[V_DIV_SCALE_F32_e64_1]]
+ ; GCN-NEXT: [[V_DIV_FMAS_F32_e64_:%[0-9]+]]:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32_e64 0, killed [[V_FMA_F32_e64_4]], 0, [[V_FMA_F32_e64_1]], 0, [[V_FMA_F32_e64_3]], 0, 0, implicit $mode, implicit $vcc, implicit $exec
+ ; GCN-NEXT: [[V_DIV_FIXUP_F32_e64_:%[0-9]+]]:vgpr_32 = nnan nofpexcept V_DIV_FIXUP_F32_e64 0, killed [[V_DIV_FMAS_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr0 = COPY [[V_DIV_FIXUP_F32_e64_]]
; GCN-NEXT: SI_RETURN implicit $vgpr0
entry:
%fdiv = fdiv nnan float %a, %b
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll
index 42274e5420d0..ce4beb8789dc 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll
@@ -14,6 +14,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da
; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
; GFX940-NEXT: S_ENDPGM 0
+ ;
; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic
; GFX11: bb.0 (%ir-block.0):
; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -42,6 +43,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data
; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
; GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
+ ;
; GFX11-LABEL: name: flat_atomic_fadd_f32_rtn_intrinsic
; GFX11: bb.0 (%ir-block.0):
; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -70,6 +72,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(ptr %ptr, float %da
; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr)
; GFX940-NEXT: S_ENDPGM 0
+ ;
; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_atomicrmw
; GFX11: bb.0 (%ir-block.0):
; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -98,6 +101,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(ptr %ptr, float %data
; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr)
; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
; GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
+ ;
; GFX11-LABEL: name: flat_atomic_fadd_f32_rtn_atomicrmw
; GFX11: bb.0 (%ir-block.0):
; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 687d84565692..850be72f06c7 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -893,6 +893,7 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_store_b32 v0, v2, s32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS
@@ -964,6 +965,7 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, s32 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS
@@ -2137,6 +2139,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX12-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:256 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS
@@ -2221,6 +2224,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX12-PAL-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, s32 offset:256 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS
@@ -3382,6 +3386,7 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX12-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:16384 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS
@@ -3468,6 +3473,7 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX12-PAL-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, s32 offset:16384 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS
@@ -3714,6 +3720,7 @@ define void @store_load_large_imm_offset_foo() {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_store_b32 off, v0, s32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS
@@ -3789,6 +3796,7 @@ define void @store_load_large_imm_offset_foo() {
; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
; GFX12-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
+; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s32 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS
@@ -3998,6 +4006,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
@@ -4055,6 +4064,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
+; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
@@ -4107,6 +4117,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
@@ -4164,6 +4175,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
+; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
@@ -4220,6 +4232,7 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
; GFX12-NEXT: v_mov_b32_e32 v3, 3
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
@@ -4282,6 +4295,7 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
; GFX12-PAL-NEXT: v_mov_b32_e32 v3, 3
+; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
@@ -4340,6 +4354,7 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
; GFX12-NEXT: v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
@@ -4405,6 +4420,7 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
; GFX12-PAL-NEXT: v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
+; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
@@ -4456,6 +4472,7 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 1
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS
@@ -4523,6 +4540,7 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 1
+; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS
@@ -4576,6 +4594,7 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 1
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS
@@ -4644,6 +4663,7 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 1
+; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index b261b3129f3f..7c5f6d5e33ef 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -2532,58 +2532,16 @@ define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 31, v0
-; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 8
-; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; VI-NEXT: v_ffbh_i32_e32 v3, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v3
-; VI-NEXT: v_min_u32_e32 v2, v3, v2
-; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
-; VI-NEXT: v_min_u32_e32 v0, 1, v0
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_cvt_f32_i32_e32 v0, v0
-; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
-; VI-NEXT: v_ldexp_f32 v0, v0, v1
-; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -0.5
-; VI-NEXT: v_div_scale_f32 v2, vcc, -0.5, v0, -0.5
-; VI-NEXT: v_rcp_f32_e32 v3, v1
-; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
-; VI-NEXT: v_fma_f32 v3, v4, v3, v3
-; VI-NEXT: v_mul_f32_e32 v4, v2, v3
-; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
-; VI-NEXT: v_fma_f32 v4, v5, v3, v4
-; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
-; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
-; VI-NEXT: v_div_fixup_f32 v0, v1, v0, -0.5
+; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
+; VI-NEXT: v_sub_u32_e32 v0, vcc, 0xbd800000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 31, v0
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 8
-; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; GFX10-NEXT: v_ffbh_i32_e32 v3, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v2, 32, v2
-; GFX10-NEXT: v_add_nc_u32_e32 v3, -1, v3
-; GFX10-NEXT: v_min_u32_e32 v2, v3, v2
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
-; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2
-; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, -0.5
-; GFX10-NEXT: v_rcp_f32_e32 v2, v1
-; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2
-; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, -0.5, v0, -0.5
-; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2
-; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, -0.5
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 23, v0
+; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0xbd800000, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt:
@@ -2591,39 +2549,8 @@ define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 31, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 8
-; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cls_i32_e32 v3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 32, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, -1, v3
-; GFX11-NEXT: v_min_u32_e32 v2, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
-; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2
-; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, -0.5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_rcp_f32_e32 v2, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2
-; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, -0.5, v0, -0.5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2
-; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, -0.5
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 23, v0
+; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0xbd800000, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cnt = and i64 %cnt_in, 31
%shl = shl i64 8, %cnt
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll
index 08c58f21e76a..666971618a5c 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll
@@ -15,6 +15,7 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_intrinsic(ptr addrspace(1
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_intrinsic
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -42,6 +43,7 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic(ptr addrs
; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
@@ -69,6 +71,7 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat_intrinsic(ptr addrsp
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat_intrinsic
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -96,6 +99,7 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic(ptr
; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1)
; GFX908-NEXT: S_ENDPGM 0
+ ;
; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic
; GFX90A_GFX940: bb.0 (%ir-block.0):
; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir
index f59c42283e98..bdd89a907790 100644
--- a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir
@@ -85,7 +85,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75
; CHECK-NEXT: renamable $sgpr6 = S_LSHL_B32 renamable $sgpr67, 1, implicit-def dead $scc
- ; CHECK-NEXT: dead [[V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32_:%[0-9]+]]:vreg_1024 = V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 [[V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32_]], 0, killed $sgpr6, 3, implicit-def $m0, implicit $m0, implicit $exec
+ ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:vreg_1024 = V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 [[COPY]], 0, killed $sgpr6, 3, implicit-def $m0, implicit $m0, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.5(0x40000000), %bb.1(0x40000000)
@@ -114,7 +114,7 @@ body: |
; CHECK-NEXT: renamable $sgpr87 = COPY renamable $sgpr44
; CHECK-NEXT: renamable $sgpr88 = COPY renamable $sgpr44
; CHECK-NEXT: renamable $sgpr89 = COPY renamable $sgpr44
- ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99, implicit $exec
+ ; CHECK-NEXT: dead [[COPY1:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99, implicit $exec
; CHECK-NEXT: $exec = S_XOR_B64_term $exec, killed renamable $sgpr6_sgpr7, implicit-def $scc
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.1
@@ -125,7 +125,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr6_sgpr7, implicit-def $scc
; CHECK-NEXT: dead renamable $sgpr4 = S_LSHL_B32 killed renamable $sgpr66, 1, implicit-def dead $scc
- ; CHECK-NEXT: dead [[COPY1:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75
+ ; CHECK-NEXT: dead [[COPY2:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.5:
bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir b/llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir
index edc04f2ef39e..742498cdd8bd 100644
--- a/llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir
+++ b/llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir
@@ -23,12 +23,12 @@ body: |
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %3:vgpr_32, 4, 0, implicit $exec :: (load (s64), addrspace 1)
; CHECK-NEXT: SI_SPILL_V64_SAVE [[GLOBAL_LOAD_DWORDX2_SADDR1]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %5:vgpr_32, 8, 0, implicit $exec :: (load (s64), addrspace 1)
- ; CHECK-NEXT: undef %9.sub1:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1
- ; CHECK-NEXT: S_NOP 0, implicit %9.sub1
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY]].sub1
; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
; CHECK-NEXT: S_NOP 0, implicit [[SI_SPILL_V64_RESTORE]].sub0
- ; CHECK-NEXT: undef %7.sub1:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR2]].sub1
- ; CHECK-NEXT: S_NOP 0, implicit %7.sub1
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub1:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR2]].sub1
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]].sub1
; CHECK-NEXT: S_ENDPGM 0
%1:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %4:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 1)
%2:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %5:vgpr_32, 4, 0, implicit $exec :: (load (s64), addrspace 1)
@@ -66,18 +66,18 @@ body: |
; CHECK-NEXT: SI_SPILL_V64_SAVE [[GLOBAL_LOAD_DWORDX2_SADDR2]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
; CHECK-NEXT: S_NOP 0, implicit-def [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0
; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.1, align 4, addrspace 5)
- ; CHECK-NEXT: undef %13.sub0:vreg_64 = COPY [[SI_SPILL_V64_RESTORE]].sub0
- ; CHECK-NEXT: S_NOP 0, implicit-def %13.sub1
- ; CHECK-NEXT: undef %15.sub0:vreg_64 = COPY %13.sub0
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY [[SI_SPILL_V64_RESTORE]].sub0
+ ; CHECK-NEXT: S_NOP 0, implicit-def [[COPY]].sub1
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_64 = COPY [[COPY]].sub0
; CHECK-NEXT: [[SI_SPILL_V64_RESTORE1:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
- ; CHECK-NEXT: undef %7.sub1:vreg_64 = COPY [[SI_SPILL_V64_RESTORE1]].sub1
- ; CHECK-NEXT: S_NOP 0, implicit-def %7.sub0
- ; CHECK-NEXT: undef %9.sub1:vreg_64 = COPY %7.sub1
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_64 = COPY [[SI_SPILL_V64_RESTORE1]].sub1
+ ; CHECK-NEXT: S_NOP 0, implicit-def [[COPY2]].sub0
+ ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub1:vreg_64 = COPY [[COPY2]].sub1
; CHECK-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1
- ; CHECK-NEXT: undef %14.sub0:vreg_64 = COPY %15.sub0
- ; CHECK-NEXT: S_NOP 0, implicit %14.sub0
- ; CHECK-NEXT: undef %8.sub1:vreg_64 = COPY %9.sub1
- ; CHECK-NEXT: S_NOP 0, implicit %8.sub1
+ ; CHECK-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_64 = COPY [[COPY1]].sub0
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY4]].sub0
+ ; CHECK-NEXT: undef [[COPY5:%[0-9]+]].sub1:vreg_64 = COPY [[COPY3]].sub1
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY5]].sub1
; CHECK-NEXT: S_ENDPGM 0
%1:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %4:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 1)
%2:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %5:vgpr_32, 4, 0, implicit $exec :: (load (s64), addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/gws-hazards.mir b/llvm/test/CodeGen/AMDGPU/gws-hazards.mir
index 50cd4ba09184..1eeb0f453bb1 100644
--- a/llvm/test/CodeGen/AMDGPU/gws-hazards.mir
+++ b/llvm/test/CodeGen/AMDGPU/gws-hazards.mir
@@ -19,22 +19,26 @@ body: |
; GFX9-NEXT: $m0 = S_MOV_B32 -1
; GFX9-NEXT: S_NOP 0
; GFX9-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec
+ ;
; VI-LABEL: name: m0_gws_init0
; VI: liveins: $vgpr0
; VI-NEXT: {{ $}}
; VI-NEXT: $m0 = S_MOV_B32 -1
; VI-NEXT: S_NOP 0
; VI-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec
+ ;
; CI-LABEL: name: m0_gws_init0
; CI: liveins: $vgpr0
; CI-NEXT: {{ $}}
; CI-NEXT: $m0 = S_MOV_B32 -1
; CI-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec
+ ;
; SI-LABEL: name: m0_gws_init0
; SI: liveins: $vgpr0
; SI-NEXT: {{ $}}
; SI-NEXT: $m0 = S_MOV_B32 -1
; SI-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec
+ ;
; GFX10-LABEL: name: m0_gws_init0
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -56,19 +60,23 @@ body: |
; GFX9-NEXT: $m0 = S_MOV_B32 -1
; GFX9-NEXT: S_NOP 0
; GFX9-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec
+ ;
; VI-LABEL: name: m0_gws_init1
; VI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; VI-NEXT: $m0 = S_MOV_B32 -1
; VI-NEXT: S_NOP 0
; VI-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec
+ ;
; CI-LABEL: name: m0_gws_init1
; CI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CI-NEXT: $m0 = S_MOV_B32 -1
; CI-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec
+ ;
; SI-LABEL: name: m0_gws_init1
; SI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; SI-NEXT: $m0 = S_MOV_B32 -1
; SI-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec
+ ;
; GFX10-LABEL: name: m0_gws_init1
; GFX10: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX10-NEXT: $m0 = S_MOV_B32 -1
@@ -96,6 +104,7 @@ body: |
; GFX9-NEXT: $m0 = S_MOV_B32 $sgpr0
; GFX9-NEXT: S_NOP 0
; GFX9-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec
+ ;
; VI-LABEL: name: m0_gws_readlane
; VI: liveins: $vgpr0, $vgpr1
; VI-NEXT: {{ $}}
@@ -103,18 +112,21 @@ body: |
; VI-NEXT: $m0 = S_MOV_B32 $sgpr0
; VI-NEXT: S_NOP 0
; VI-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec
+ ;
; CI-LABEL: name: m0_gws_readlane
; CI: liveins: $vgpr0, $vgpr1
; CI-NEXT: {{ $}}
; CI-NEXT: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
; CI-NEXT: $m0 = S_MOV_B32 $sgpr0
; CI-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec
+ ;
; SI-LABEL: name: m0_gws_readlane
; SI: liveins: $vgpr0, $vgpr1
; SI-NEXT: {{ $}}
; SI-NEXT: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
; SI-NEXT: $m0 = S_MOV_B32 $sgpr0
; SI-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec
+ ;
; GFX10-LABEL: name: m0_gws_readlane
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/high-bits-zeroed-16-bit-ops.mir b/llvm/test/CodeGen/AMDGPU/high-bits-zeroed-16-bit-ops.mir
index 6556f9b46707..5596eceb95d0 100644
--- a/llvm/test/CodeGen/AMDGPU/high-bits-zeroed-16-bit-ops.mir
+++ b/llvm/test/CodeGen/AMDGPU/high-bits-zeroed-16-bit-ops.mir
@@ -20,6 +20,7 @@ body: |
; GFX8-NEXT: $vgpr0 = COPY %op
; GFX8-NEXT: $vgpr1 = COPY %op
; GFX8-NEXT: $vgpr2 = COPY %op
+ ;
; GFX9-LABEL: name: v_cvt_f16_f32_altmask
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -30,6 +31,7 @@ body: |
; GFX9-NEXT: $vgpr0 = COPY %op
; GFX9-NEXT: $vgpr1 = COPY %op
; GFX9-NEXT: $vgpr2 = COPY %op
+ ;
; GFX10-LABEL: name: v_cvt_f16_f32_altmask
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -69,6 +71,7 @@ body: |
; GFX8-NEXT: %mask:sreg_32 = S_MOV_B32 65534
; GFX8-NEXT: %and:vgpr_32 = V_AND_B32_e64 %mask, %op, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %and
+ ;
; GFX9-LABEL: name: wrong_mask_value
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -77,6 +80,7 @@ body: |
; GFX9-NEXT: %mask:sreg_32 = S_MOV_B32 65534
; GFX9-NEXT: %and:vgpr_32 = V_AND_B32_e64 %mask, %op, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %and
+ ;
; GFX10-LABEL: name: wrong_mask_value
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -106,6 +110,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 [[COPY]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_cvt_f16_f32
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -114,6 +119,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 [[COPY]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_cvt_f16_f32
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -148,6 +154,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_CVT_F16_U16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_cvt_f16_u16
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -156,6 +163,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_CVT_F16_U16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_cvt_f16_u16
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -190,6 +198,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_CVT_F16_I16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_cvt_f16_i16
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -198,6 +207,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_CVT_F16_I16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_cvt_f16_i16
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -232,6 +242,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_RCP_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_rcp_f16
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -240,6 +251,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_RCP_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_rcp_f16
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -273,6 +285,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_RSQ_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_rsq_f16
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -281,6 +294,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_RSQ_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_rsq_f16
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -314,6 +328,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_SQRT_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_sqrt_f16
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -322,6 +337,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_SQRT_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_sqrt_f16
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -356,6 +372,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_LOG_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_log_f16
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -364,6 +381,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_LOG_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_log_f16
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -398,6 +416,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_EXP_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_exp_f16
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -406,6 +425,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_EXP_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_exp_f16
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -440,6 +460,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_SIN_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_sin_f16
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -448,6 +469,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_SIN_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_sin_f16
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -482,6 +504,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_COS_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_cos_f16
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -490,6 +513,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_COS_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_cos_f16
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -524,6 +548,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_FLOOR_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_floor_f16
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -532,6 +557,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_FLOOR_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_floor_f16
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -566,6 +592,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_CEIL_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_ceil_f16
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -574,6 +601,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_CEIL_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_ceil_f16
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -608,6 +636,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_TRUNC_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_trunc_f16
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -616,6 +645,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_TRUNC_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_trunc_f16
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -650,6 +680,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_RNDNE_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_rndne_f16
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -658,6 +689,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_RNDNE_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_rndne_f16
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -692,6 +724,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_FRACT_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_fract_f16
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -700,6 +733,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_FRACT_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_fract_f16
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -734,6 +768,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_FREXP_MANT_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_frexp_mant_f16
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -742,6 +777,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_FREXP_MANT_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_frexp_mant_f16
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -776,6 +812,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_FREXP_EXP_I16_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_frexp_exp_f16
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -784,6 +821,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_FREXP_EXP_I16_F16_e32 [[COPY]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_frexp_exp_f16
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -819,6 +857,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_LDEXP_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_ldexp_f16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -828,6 +867,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_LDEXP_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_ldexp_f16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -865,6 +905,7 @@ body: |
; GFX8-NEXT: %op_vop2:vgpr_32 = nofpexcept V_LSHLREV_B16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop2
+ ;
; GFX9-LABEL: name: v_lshlrev_b16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -874,6 +915,7 @@ body: |
; GFX9-NEXT: %op_vop2:vgpr_32 = nofpexcept V_LSHLREV_B16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop2
+ ;
; GFX10-LABEL: name: v_lshlrev_b16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -911,6 +953,7 @@ body: |
; GFX8-NEXT: %op_vop2:vgpr_32 = nofpexcept V_LSHRREV_B16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop2
+ ;
; GFX9-LABEL: name: v_lshrrev_b16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -920,6 +963,7 @@ body: |
; GFX9-NEXT: %op_vop2:vgpr_32 = nofpexcept V_LSHRREV_B16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop2
+ ;
; GFX10-LABEL: name: v_lshrrev_b16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -957,6 +1001,7 @@ body: |
; GFX8-NEXT: %op_vop2:vgpr_32 = nofpexcept V_ASHRREV_I16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop2
+ ;
; GFX9-LABEL: name: v_ashrrev_i16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -966,6 +1011,7 @@ body: |
; GFX9-NEXT: %op_vop2:vgpr_32 = nofpexcept V_ASHRREV_I16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop2
+ ;
; GFX10-LABEL: name: v_ashrrev_i16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -1003,6 +1049,7 @@ body: |
; GFX8-NEXT: %op_vop2:vgpr_32 = nofpexcept V_ADD_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop2
+ ;
; GFX9-LABEL: name: v_add_u16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -1012,6 +1059,7 @@ body: |
; GFX9-NEXT: %op_vop2:vgpr_32 = nofpexcept V_ADD_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop2
+ ;
; GFX10-LABEL: name: v_add_u16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -1049,6 +1097,7 @@ body: |
; GFX8-NEXT: %op_vop2:vgpr_32 = nofpexcept V_SUB_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop2
+ ;
; GFX9-LABEL: name: v_sub_u16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -1058,6 +1107,7 @@ body: |
; GFX9-NEXT: %op_vop2:vgpr_32 = nofpexcept V_SUB_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop2
+ ;
; GFX10-LABEL: name: v_sub_u16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -1095,6 +1145,7 @@ body: |
; GFX8-NEXT: %op_vop2:vgpr_32 = nofpexcept V_SUBREV_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop2
+ ;
; GFX9-LABEL: name: v_subrev_u16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -1104,6 +1155,7 @@ body: |
; GFX9-NEXT: %op_vop2:vgpr_32 = nofpexcept V_SUBREV_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop2
+ ;
; GFX10-LABEL: name: v_subrev_u16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -1141,6 +1193,7 @@ body: |
; GFX8-NEXT: %op_vop2:vgpr_32 = nofpexcept V_MUL_LO_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop2
+ ;
; GFX9-LABEL: name: v_mul_lo_u16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -1150,6 +1203,7 @@ body: |
; GFX9-NEXT: %op_vop2:vgpr_32 = nofpexcept V_MUL_LO_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop2
+ ;
; GFX10-LABEL: name: v_mul_lo_u16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -1187,6 +1241,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_add_f16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -1196,6 +1251,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_add_f16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -1233,6 +1289,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_sub_f16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -1242,6 +1299,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_sub_f16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -1279,6 +1337,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_subrev_f16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -1288,6 +1347,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_subrev_f16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -1325,6 +1385,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_mul_f16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -1334,6 +1395,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_mul_f16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -1371,6 +1433,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_max_f16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -1380,6 +1443,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_max_f16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -1417,6 +1481,7 @@ body: |
; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX9-LABEL: name: v_min_f16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -1426,6 +1491,7 @@ body: |
; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop1
+ ;
; GFX10-LABEL: name: v_min_f16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -1463,6 +1529,7 @@ body: |
; GFX8-NEXT: %op_vop2:vgpr_32 = nofpexcept V_MAX_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop2
+ ;
; GFX9-LABEL: name: v_max_u16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -1472,6 +1539,7 @@ body: |
; GFX9-NEXT: %op_vop2:vgpr_32 = nofpexcept V_MAX_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop2
+ ;
; GFX10-LABEL: name: v_max_u16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -1508,6 +1576,7 @@ body: |
; GFX8-NEXT: %op_vop2:vgpr_32 = nofpexcept V_MIN_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop2
+ ;
; GFX9-LABEL: name: v_min_u16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -1517,6 +1586,7 @@ body: |
; GFX9-NEXT: %op_vop2:vgpr_32 = nofpexcept V_MIN_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop2
+ ;
; GFX10-LABEL: name: v_min_u16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -1554,6 +1624,7 @@ body: |
; GFX8-NEXT: %op_vop2:vgpr_32 = nofpexcept V_MAX_I16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop2
+ ;
; GFX9-LABEL: name: v_max_i16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -1563,6 +1634,7 @@ body: |
; GFX9-NEXT: %op_vop2:vgpr_32 = nofpexcept V_MAX_I16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop2
+ ;
; GFX10-LABEL: name: v_max_i16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -1599,6 +1671,7 @@ body: |
; GFX8-NEXT: %op_vop2:vgpr_32 = nofpexcept V_MIN_I16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
; GFX8-NEXT: $vgpr1 = COPY %op_vop2
+ ;
; GFX9-LABEL: name: v_min_i16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -1608,6 +1681,7 @@ body: |
; GFX9-NEXT: %op_vop2:vgpr_32 = nofpexcept V_MIN_I16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op_vop3
; GFX9-NEXT: $vgpr1 = COPY %op_vop2
+ ;
; GFX10-LABEL: name: v_min_i16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -1644,6 +1718,7 @@ body: |
; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX8-NEXT: %op:vgpr_32 = nofpexcept V_MAD_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op
+ ;
; GFX9-LABEL: name: v_mad_f16
; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-NEXT: {{ $}}
@@ -1652,6 +1727,7 @@ body: |
; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX9-NEXT: %op:vgpr_32 = nofpexcept V_MAD_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op
+ ;
; GFX10-LABEL: name: v_mad_f16
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -1684,6 +1760,7 @@ body: |
; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX8-NEXT: %op:vgpr_32 = nofpexcept V_FMA_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op
+ ;
; GFX9-LABEL: name: v_fma_f16
; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-NEXT: {{ $}}
@@ -1692,6 +1769,7 @@ body: |
; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX9-NEXT: %op:vgpr_32 = nofpexcept V_FMA_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op
+ ;
; GFX10-LABEL: name: v_fma_f16
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -1724,6 +1802,7 @@ body: |
; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX8-NEXT: %op:vgpr_32 = nofpexcept V_DIV_FIXUP_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op
+ ;
; GFX9-LABEL: name: v_div_fixup_f16
; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-NEXT: {{ $}}
@@ -1732,6 +1811,7 @@ body: |
; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX9-NEXT: %op:vgpr_32 = nofpexcept V_DIV_FIXUP_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %op
+ ;
; GFX10-LABEL: name: v_div_fixup_f16
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -1763,6 +1843,7 @@ body: |
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX8-NEXT: %op:vgpr_32 = nofpexcept V_MADAK_F16 [[COPY]], [[COPY1]], 1234, implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op
+ ;
; GFX9-LABEL: name: v_madak_f16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -1771,6 +1852,7 @@ body: |
; GFX9-NEXT: %op:vgpr_32 = nofpexcept V_MADAK_F16 [[COPY]], [[COPY1]], 1234, implicit $mode, implicit $exec
; GFX9-NEXT: %and:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %and
+ ;
; GFX10-LABEL: name: v_madak_f16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -1800,6 +1882,7 @@ body: |
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX8-NEXT: %op:vgpr_32 = nofpexcept V_MADMK_F16 [[COPY]], 1234, [[COPY1]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op
+ ;
; GFX9-LABEL: name: v_madmk_f16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -1808,6 +1891,7 @@ body: |
; GFX9-NEXT: %op:vgpr_32 = nofpexcept V_MADMK_F16 [[COPY]], 1234, [[COPY1]], implicit $mode, implicit $exec
; GFX9-NEXT: %and:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %and
+ ;
; GFX10-LABEL: name: v_madmk_f16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -1837,6 +1921,7 @@ body: |
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX8-NEXT: %op:vgpr_32 = nofpexcept V_FMAAK_F16 [[COPY]], [[COPY1]], 1234, implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op
+ ;
; GFX9-LABEL: name: v_fmaak_f16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -1845,6 +1930,7 @@ body: |
; GFX9-NEXT: %op:vgpr_32 = nofpexcept V_FMAAK_F16 [[COPY]], [[COPY1]], 1234, implicit $mode, implicit $exec
; GFX9-NEXT: %and:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %and
+ ;
; GFX10-LABEL: name: v_fmaak_f16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -1874,6 +1960,7 @@ body: |
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX8-NEXT: %op:vgpr_32 = nofpexcept V_FMAMK_F16 [[COPY]], 1234, [[COPY1]], implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op
+ ;
; GFX9-LABEL: name: v_fmamk_f16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -1882,6 +1969,7 @@ body: |
; GFX9-NEXT: %op:vgpr_32 = nofpexcept V_FMAMK_F16 [[COPY]], 1234, [[COPY1]], implicit $mode, implicit $exec
; GFX9-NEXT: %and:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %and
+ ;
; GFX10-LABEL: name: v_fmamk_f16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -1914,6 +2002,7 @@ body: |
; GFX8-NEXT: %op_vop3:vgpr_32 = nofpexcept V_MAC_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop2
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
+ ;
; GFX9-LABEL: name: v_mac_f16
; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-NEXT: {{ $}}
@@ -1926,6 +2015,7 @@ body: |
; GFX9-NEXT: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %and_vop2
; GFX9-NEXT: $vgpr0 = COPY %and_vop3
+ ;
; GFX10-LABEL: name: v_mac_f16
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -1966,6 +2056,7 @@ body: |
; GFX8-NEXT: %op_vop3:vgpr_32 = nofpexcept V_FMAC_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %op_vop2
; GFX8-NEXT: $vgpr0 = COPY %op_vop3
+ ;
; GFX9-LABEL: name: v_fmac_f16
; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-NEXT: {{ $}}
@@ -1978,6 +2069,7 @@ body: |
; GFX9-NEXT: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %and_vop2
; GFX9-NEXT: $vgpr0 = COPY %and_vop3
+ ;
; GFX10-LABEL: name: v_fmac_f16
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -2018,6 +2110,7 @@ body: |
; GFX8-NEXT: %op:vgpr_32 = nofpexcept V_MAD_MIXLO_F16 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
; GFX8-NEXT: %and:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %and
+ ;
; GFX9-LABEL: name: no_fold_v_mad_mixlo_f16
; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX9-NEXT: {{ $}}
@@ -2028,6 +2121,7 @@ body: |
; GFX9-NEXT: %op:vgpr_32 = nofpexcept V_MAD_MIXLO_F16 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
; GFX9-NEXT: %and:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %and
+ ;
; GFX10-LABEL: name: no_fold_v_mad_mixlo_f16
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -2064,6 +2158,7 @@ body: |
; GFX8-NEXT: %op:vgpr_32 = nofpexcept V_MAD_MIXHI_F16 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
; GFX8-NEXT: %and:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY %and
+ ;
; GFX9-LABEL: name: no_fold_v_mad_mixhi_f16
; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX9-NEXT: {{ $}}
@@ -2074,6 +2169,7 @@ body: |
; GFX9-NEXT: %op:vgpr_32 = nofpexcept V_MAD_MIXHI_F16 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
; GFX9-NEXT: %and:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec
; GFX9-NEXT: $vgpr0 = COPY %and
+ ;
; GFX10-LABEL: name: no_fold_v_mad_mixhi_f16
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll b/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll
new file mode 100644
index 000000000000..1113acb3c030
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -O3 < %s | FileCheck %s
+
+; Test should not result in build failure
+; CHECK-LABEL: shouldNotReApply
+
+define amdgpu_kernel void @shouldNotReApply() {
+entry:
+ tail call void @llvm.amdgcn.sched.barrier(i32 0)
+ store <4 x i32> zeroinitializer, ptr addrspace(3) null, align 2147483648
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.sched.barrier(i32 0)
+ store i32 0, ptr addrspace(5) null, align 2147483648
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 0, i32 0, i32 0)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/liveout-implicit-def-subreg-redef-blender-verifier-error.mir b/llvm/test/CodeGen/AMDGPU/liveout-implicit-def-subreg-redef-blender-verifier-error.mir
index c39fa08194f9..9eff5ac8a2a3 100644
--- a/llvm/test/CodeGen/AMDGPU/liveout-implicit-def-subreg-redef-blender-verifier-error.mir
+++ b/llvm/test/CodeGen/AMDGPU/liveout-implicit-def-subreg-redef-blender-verifier-error.mir
@@ -22,17 +22,17 @@ body: |
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub0:sgpr_128 = S_MOV_B32 0
+ ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 0
; CHECK-NEXT: S_BRANCH %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub0:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = IMPLICIT_DEF
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
- ; CHECK-NEXT: S_NOP 0, implicit %0
- ; CHECK-NEXT: S_NOP 0, implicit %0.sub0
+ ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]].sub0
; CHECK-NEXT: S_ENDPGM 0
bb.0:
S_CBRANCH_SCC0 %bb.2, implicit undef $scc
@@ -68,17 +68,17 @@ body: |
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub0:sgpr_128 = S_MOV_B32 0
+ ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 0
; CHECK-NEXT: S_BRANCH %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub0:sgpr_128 = S_MOV_B32 9
+ ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 9
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
- ; CHECK-NEXT: S_NOP 0, implicit %0
- ; CHECK-NEXT: S_NOP 0, implicit %0.sub0
+ ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]].sub0
; CHECK-NEXT: S_ENDPGM 0
bb.0:
S_CBRANCH_SCC0 %bb.2, implicit undef $scc
@@ -116,17 +116,17 @@ body: |
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_NOP 0, implicit-def undef %0.sub1_sub2_sub3
- ; CHECK-NEXT: %0.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub0:vreg_128 = V_MOV_B32_e32 9, implicit $exec
+ ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_128 = V_MOV_B32_e32 9, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
- ; CHECK-NEXT: S_NOP 0, implicit %0
- ; CHECK-NEXT: S_NOP 0, implicit %0.sub0
+ ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub0
; CHECK-NEXT: S_ENDPGM 0
bb.0:
S_CBRANCH_SCC0 %bb.2, implicit undef $scc
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll
index 4756571aaa59..04ac24948b8b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN %s
declare i64 @llvm.amdgcn.dispatch.id() #1
diff --git a/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir b/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir
index 041771029702..efa24a9bee7d 100644
--- a/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir
+++ b/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir
@@ -43,6 +43,7 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.7:
; GFX10-NEXT: S_ENDPGM 0
+ ;
; GFX11-LABEL: name: loop_header_nopred
; GFX11: bb.0:
; GFX11-NEXT: successors: %bb.2(0x80000000)
diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir
index 1679773c945b..9eeec4fa3a93 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir
+++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir
@@ -93,7 +93,7 @@ body: |
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY1]]
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, [[V_CMP_GT_I32_e64_]], implicit-def $scc
; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[COPY2]], implicit-def $scc
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_OR_B32_]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_OR_B32_]]
; CHECK-NEXT: $exec_lo = S_ANDN2_B32_term $exec_lo, [[S_OR_B32_]], implicit-def $scc
; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.2
@@ -246,8 +246,8 @@ body: |
; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 1, [[S_FF1_I32_B32_]], implicit-def dead $scc
; CHECK-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[COPY8]], [[S_LSHL_B32_]], implicit-def dead $scc
; CHECK-NEXT: S_CMP_LG_U32 [[S_ANDN2_B32_]], 0, implicit-def $scc
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_ADD_I32_]]
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_ANDN2_B32_]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_ADD_I32_]]
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_ANDN2_B32_]]
; CHECK-NEXT: S_CBRANCH_SCC1 %bb.6, implicit killed $scc
; CHECK-NEXT: S_BRANCH %bb.7
; CHECK-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir
index f04f66cfbba1..02e3d7e81fd4 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir
+++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir
@@ -40,21 +40,21 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY1]]
; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e32 1, killed [[COPY5]], implicit $exec
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_U32_e32_]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_U32_e32_]]
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY66:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %10:vreg_64, [[COPY66]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1)
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[COPY66]]
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]]
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %10:vreg_64, [[COPY6]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1)
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]]
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY8]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
; CHECK-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY8]], implicit-def dead $scc
; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]]
- ; CHECK-NEXT: [[S_MOV_B64_term1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[S_XOR_B64_1]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[S_XOR_B64_1]], implicit $exec
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.2
bb.0:
@@ -127,21 +127,21 @@ body: |
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e32 1, killed [[COPY4]], implicit $exec
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_U32_e32_]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_U32_e32_]]
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY77:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %11:vreg_64, [[COPY77]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1)
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY77]]
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]]
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %11:vreg_64, [[COPY7]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1)
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]]
; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY9]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
; CHECK-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY9]], implicit-def dead $scc
; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]]
- ; CHECK-NEXT: [[S_MOV_B64_term1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[S_XOR_B64_1]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[S_XOR_B64_1]], implicit $exec
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.3
bb.0:
@@ -216,21 +216,21 @@ body: |
; CHECK-NEXT: S_NOP 0, implicit killed [[S_MOV_B64_]]
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY1]]
; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e32 1, killed [[COPY5]], implicit $exec
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_U32_e32_]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_U32_e32_]]
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY66:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %11:vreg_64, [[COPY66]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1)
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[COPY66]]
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]]
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %11:vreg_64, [[COPY6]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1)
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]]
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY8]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
; CHECK-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY8]], implicit-def dead $scc
; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]]
- ; CHECK-NEXT: [[S_MOV_B64_term1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[S_XOR_B64_1]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[S_XOR_B64_1]], implicit $exec
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.2
bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir
index 0c00c9af25a7..914cc8ae8844 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir
+++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir
@@ -222,25 +222,25 @@ body: |
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term1]]
- ; CHECK-NEXT: dead %7:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, implicit $exec :: (volatile load (s32), addrspace 1)
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY [[COPY3]]
+ ; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, implicit $exec :: (volatile load (s32), addrspace 1)
+ ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = COPY [[COPY3]]
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_64_xexec = COPY [[COPY4]]
- ; CHECK-NEXT: $exec = S_OR_B64_term $exec, killed [[COPY5]], implicit-def $scc
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]]
+ ; CHECK-NEXT: $exec = S_OR_B64_term $exec, killed [[COPY4]], implicit-def $scc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_SLEEP 1
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
- ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY6]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; CHECK-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY6]], implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
+ ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY5]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; CHECK-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY5]], implicit-def dead $scc
; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]]
+ ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_1]], implicit $exec
; CHECK-NEXT: [[S_MOV_B64_term1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_1]], implicit $exec
- ; CHECK-NEXT: [[S_MOV_B64_term2:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_1]], implicit $exec
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.2
bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
index afa914c8375f..9547f08d3eba 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
@@ -269,6 +269,7 @@ define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1)
; GFX12-NEXT: v_dual_mov_b32 v6, ttmp9 :: v_dual_mov_b32 v7, s0
; GFX12-NEXT: s_lshr_b32 s1, ttmp7, 16
; GFX12-NEXT: v_mov_b32_e32 v8, s1
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_store_b32 v[0:1], v6, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_store_b32 v[2:3], v7, off scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
index fc3ed8703cec..018da7f81e3d 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
@@ -744,8 +744,8 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
- ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0
- ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -766,8 +766,8 @@ body: |
; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000)
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
- ; GFX908-NEXT: undef %4.sub0:sreg_64 = S_ADD_I32 %4.sub0, -1, implicit-def dead $scc
- ; GFX908-NEXT: S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc
+ ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc
; GFX908-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.4:
@@ -898,13 +898,13 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: undef %23.sub0:vreg_64 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
+ ; GFX908-NEXT: undef [[V_CVT_I32_F64_e32_23:%[0-9]+]].sub0:vreg_64 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x80000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: %23.sub1:vreg_64 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: S_NOP 0, implicit %23
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]].sub1:vreg_64 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.2:
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
@@ -1003,15 +1003,15 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: undef %21.sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode
+ ; GFX908-NEXT: undef [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x80000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: %21.sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: %21.sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: %21.sub3:vreg_128 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: S_NOP 0, implicit %21
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub3:vreg_128 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_21]]
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.2:
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
@@ -1091,10 +1091,10 @@ body: |
; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; GFX908-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
- ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
- ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
@@ -1122,87 +1122,87 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
- ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
- ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1
- ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 2
- ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 3
- ; GFX908-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 4
- ; GFX908-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 5
- ; GFX908-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 6
- ; GFX908-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sgpr_32 = S_MOV_B32 7
- ; GFX908-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sgpr_32 = S_MOV_B32 8
- ; GFX908-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sgpr_32 = S_MOV_B32 9
- ; GFX908-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sgpr_32 = S_MOV_B32 10
- ; GFX908-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sgpr_32 = S_MOV_B32 11
- ; GFX908-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sgpr_32 = S_MOV_B32 12
- ; GFX908-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sgpr_32 = S_MOV_B32 13
- ; GFX908-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sgpr_32 = S_MOV_B32 14
- ; GFX908-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sgpr_32 = S_MOV_B32 15
- ; GFX908-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sgpr_32 = S_MOV_B32 16
- ; GFX908-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sgpr_32 = S_MOV_B32 17
- ; GFX908-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sgpr_32 = S_MOV_B32 18
- ; GFX908-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sgpr_32 = S_MOV_B32 19
- ; GFX908-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sgpr_32 = S_MOV_B32 20
- ; GFX908-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sgpr_32 = S_MOV_B32 21
- ; GFX908-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sgpr_32 = S_MOV_B32 22
- ; GFX908-NEXT: [[S_MOV_B32_23:%[0-9]+]]:sgpr_32 = S_MOV_B32 23
- ; GFX908-NEXT: [[S_MOV_B32_24:%[0-9]+]]:sgpr_32 = S_MOV_B32 24
- ; GFX908-NEXT: [[S_MOV_B32_25:%[0-9]+]]:sgpr_32 = S_MOV_B32 25
- ; GFX908-NEXT: [[S_MOV_B32_26:%[0-9]+]]:sgpr_32 = S_MOV_B32 26
- ; GFX908-NEXT: [[S_MOV_B32_27:%[0-9]+]]:sgpr_32 = S_MOV_B32 27
- ; GFX908-NEXT: [[S_MOV_B32_28:%[0-9]+]]:sgpr_32 = S_MOV_B32 28
- ; GFX908-NEXT: [[S_MOV_B32_29:%[0-9]+]]:sgpr_32 = S_MOV_B32 29
- ; GFX908-NEXT: [[S_MOV_B32_30:%[0-9]+]]:sgpr_32 = S_MOV_B32 30
- ; GFX908-NEXT: [[S_MOV_B32_31:%[0-9]+]]:sgpr_32 = S_MOV_B32 31
- ; GFX908-NEXT: [[S_MOV_B32_32:%[0-9]+]]:sgpr_32 = S_MOV_B32 32
- ; GFX908-NEXT: [[S_MOV_B32_33:%[0-9]+]]:sgpr_32 = S_MOV_B32 33
- ; GFX908-NEXT: [[S_MOV_B32_34:%[0-9]+]]:sgpr_32 = S_MOV_B32 34
- ; GFX908-NEXT: [[S_MOV_B32_35:%[0-9]+]]:sgpr_32 = S_MOV_B32 35
- ; GFX908-NEXT: [[S_MOV_B32_36:%[0-9]+]]:sgpr_32 = S_MOV_B32 36
- ; GFX908-NEXT: [[S_MOV_B32_37:%[0-9]+]]:sgpr_32 = S_MOV_B32 37
- ; GFX908-NEXT: [[S_MOV_B32_38:%[0-9]+]]:sgpr_32 = S_MOV_B32 38
- ; GFX908-NEXT: [[S_MOV_B32_39:%[0-9]+]]:sgpr_32 = S_MOV_B32 39
- ; GFX908-NEXT: [[S_MOV_B32_40:%[0-9]+]]:sgpr_32 = S_MOV_B32 40
- ; GFX908-NEXT: [[S_MOV_B32_41:%[0-9]+]]:sgpr_32 = S_MOV_B32 41
- ; GFX908-NEXT: [[S_MOV_B32_42:%[0-9]+]]:sgpr_32 = S_MOV_B32 42
- ; GFX908-NEXT: [[S_MOV_B32_43:%[0-9]+]]:sgpr_32 = S_MOV_B32 43
- ; GFX908-NEXT: [[S_MOV_B32_44:%[0-9]+]]:sgpr_32 = S_MOV_B32 44
- ; GFX908-NEXT: [[S_MOV_B32_45:%[0-9]+]]:sgpr_32 = S_MOV_B32 45
- ; GFX908-NEXT: [[S_MOV_B32_46:%[0-9]+]]:sgpr_32 = S_MOV_B32 46
- ; GFX908-NEXT: [[S_MOV_B32_47:%[0-9]+]]:sgpr_32 = S_MOV_B32 47
- ; GFX908-NEXT: [[S_MOV_B32_48:%[0-9]+]]:sgpr_32 = S_MOV_B32 48
- ; GFX908-NEXT: [[S_MOV_B32_49:%[0-9]+]]:sgpr_32 = S_MOV_B32 49
- ; GFX908-NEXT: [[S_MOV_B32_50:%[0-9]+]]:sgpr_32 = S_MOV_B32 50
- ; GFX908-NEXT: [[S_MOV_B32_51:%[0-9]+]]:sgpr_32 = S_MOV_B32 51
- ; GFX908-NEXT: [[S_MOV_B32_52:%[0-9]+]]:sgpr_32 = S_MOV_B32 52
- ; GFX908-NEXT: [[S_MOV_B32_53:%[0-9]+]]:sgpr_32 = S_MOV_B32 53
- ; GFX908-NEXT: [[S_MOV_B32_54:%[0-9]+]]:sgpr_32 = S_MOV_B32 54
- ; GFX908-NEXT: [[S_MOV_B32_55:%[0-9]+]]:sgpr_32 = S_MOV_B32 55
- ; GFX908-NEXT: [[S_MOV_B32_56:%[0-9]+]]:sgpr_32 = S_MOV_B32 56
- ; GFX908-NEXT: [[S_MOV_B32_57:%[0-9]+]]:sgpr_32 = S_MOV_B32 57
- ; GFX908-NEXT: [[S_MOV_B32_58:%[0-9]+]]:sgpr_32 = S_MOV_B32 58
- ; GFX908-NEXT: [[S_MOV_B32_59:%[0-9]+]]:sgpr_32 = S_MOV_B32 59
- ; GFX908-NEXT: [[S_MOV_B32_60:%[0-9]+]]:sgpr_32 = S_MOV_B32 60
- ; GFX908-NEXT: [[S_MOV_B32_61:%[0-9]+]]:sgpr_32 = S_MOV_B32 61
- ; GFX908-NEXT: [[S_MOV_B32_62:%[0-9]+]]:sgpr_32 = S_MOV_B32 62
- ; GFX908-NEXT: [[S_MOV_B32_63:%[0-9]+]]:sgpr_32 = S_MOV_B32 63
- ; GFX908-NEXT: [[S_MOV_B32_64:%[0-9]+]]:sgpr_32 = S_MOV_B32 64
- ; GFX908-NEXT: [[S_MOV_B32_65:%[0-9]+]]:sgpr_32 = S_MOV_B32 65
- ; GFX908-NEXT: [[S_MOV_B32_66:%[0-9]+]]:sgpr_32 = S_MOV_B32 66
- ; GFX908-NEXT: [[S_MOV_B32_67:%[0-9]+]]:sgpr_32 = S_MOV_B32 67
- ; GFX908-NEXT: [[S_MOV_B32_68:%[0-9]+]]:sgpr_32 = S_MOV_B32 68
- ; GFX908-NEXT: [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 69
- ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 70
- ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 71
- ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 72
- ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 73
- ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 74
- ; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 75
- ; GFX908-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 76
- ; GFX908-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 77
- ; GFX908-NEXT: [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 78
- ; GFX908-NEXT: [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 79
- ; GFX908-NEXT: [[S_MOV_B32_80:%[0-9]+]]:sgpr_32 = S_MOV_B32 80
+ ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+ ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1
+ ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 2
+ ; GFX908-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 3
+ ; GFX908-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 4
+ ; GFX908-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 5
+ ; GFX908-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sgpr_32 = S_MOV_B32 6
+ ; GFX908-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sgpr_32 = S_MOV_B32 7
+ ; GFX908-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sgpr_32 = S_MOV_B32 8
+ ; GFX908-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sgpr_32 = S_MOV_B32 9
+ ; GFX908-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sgpr_32 = S_MOV_B32 10
+ ; GFX908-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sgpr_32 = S_MOV_B32 11
+ ; GFX908-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sgpr_32 = S_MOV_B32 12
+ ; GFX908-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sgpr_32 = S_MOV_B32 13
+ ; GFX908-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sgpr_32 = S_MOV_B32 14
+ ; GFX908-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sgpr_32 = S_MOV_B32 15
+ ; GFX908-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sgpr_32 = S_MOV_B32 16
+ ; GFX908-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sgpr_32 = S_MOV_B32 17
+ ; GFX908-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sgpr_32 = S_MOV_B32 18
+ ; GFX908-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sgpr_32 = S_MOV_B32 19
+ ; GFX908-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sgpr_32 = S_MOV_B32 20
+ ; GFX908-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sgpr_32 = S_MOV_B32 21
+ ; GFX908-NEXT: [[S_MOV_B32_23:%[0-9]+]]:sgpr_32 = S_MOV_B32 22
+ ; GFX908-NEXT: [[S_MOV_B32_24:%[0-9]+]]:sgpr_32 = S_MOV_B32 23
+ ; GFX908-NEXT: [[S_MOV_B32_25:%[0-9]+]]:sgpr_32 = S_MOV_B32 24
+ ; GFX908-NEXT: [[S_MOV_B32_26:%[0-9]+]]:sgpr_32 = S_MOV_B32 25
+ ; GFX908-NEXT: [[S_MOV_B32_27:%[0-9]+]]:sgpr_32 = S_MOV_B32 26
+ ; GFX908-NEXT: [[S_MOV_B32_28:%[0-9]+]]:sgpr_32 = S_MOV_B32 27
+ ; GFX908-NEXT: [[S_MOV_B32_29:%[0-9]+]]:sgpr_32 = S_MOV_B32 28
+ ; GFX908-NEXT: [[S_MOV_B32_30:%[0-9]+]]:sgpr_32 = S_MOV_B32 29
+ ; GFX908-NEXT: [[S_MOV_B32_31:%[0-9]+]]:sgpr_32 = S_MOV_B32 30
+ ; GFX908-NEXT: [[S_MOV_B32_32:%[0-9]+]]:sgpr_32 = S_MOV_B32 31
+ ; GFX908-NEXT: [[S_MOV_B32_33:%[0-9]+]]:sgpr_32 = S_MOV_B32 32
+ ; GFX908-NEXT: [[S_MOV_B32_34:%[0-9]+]]:sgpr_32 = S_MOV_B32 33
+ ; GFX908-NEXT: [[S_MOV_B32_35:%[0-9]+]]:sgpr_32 = S_MOV_B32 34
+ ; GFX908-NEXT: [[S_MOV_B32_36:%[0-9]+]]:sgpr_32 = S_MOV_B32 35
+ ; GFX908-NEXT: [[S_MOV_B32_37:%[0-9]+]]:sgpr_32 = S_MOV_B32 36
+ ; GFX908-NEXT: [[S_MOV_B32_38:%[0-9]+]]:sgpr_32 = S_MOV_B32 37
+ ; GFX908-NEXT: [[S_MOV_B32_39:%[0-9]+]]:sgpr_32 = S_MOV_B32 38
+ ; GFX908-NEXT: [[S_MOV_B32_40:%[0-9]+]]:sgpr_32 = S_MOV_B32 39
+ ; GFX908-NEXT: [[S_MOV_B32_41:%[0-9]+]]:sgpr_32 = S_MOV_B32 40
+ ; GFX908-NEXT: [[S_MOV_B32_42:%[0-9]+]]:sgpr_32 = S_MOV_B32 41
+ ; GFX908-NEXT: [[S_MOV_B32_43:%[0-9]+]]:sgpr_32 = S_MOV_B32 42
+ ; GFX908-NEXT: [[S_MOV_B32_44:%[0-9]+]]:sgpr_32 = S_MOV_B32 43
+ ; GFX908-NEXT: [[S_MOV_B32_45:%[0-9]+]]:sgpr_32 = S_MOV_B32 44
+ ; GFX908-NEXT: [[S_MOV_B32_46:%[0-9]+]]:sgpr_32 = S_MOV_B32 45
+ ; GFX908-NEXT: [[S_MOV_B32_47:%[0-9]+]]:sgpr_32 = S_MOV_B32 46
+ ; GFX908-NEXT: [[S_MOV_B32_48:%[0-9]+]]:sgpr_32 = S_MOV_B32 47
+ ; GFX908-NEXT: [[S_MOV_B32_49:%[0-9]+]]:sgpr_32 = S_MOV_B32 48
+ ; GFX908-NEXT: [[S_MOV_B32_50:%[0-9]+]]:sgpr_32 = S_MOV_B32 49
+ ; GFX908-NEXT: [[S_MOV_B32_51:%[0-9]+]]:sgpr_32 = S_MOV_B32 50
+ ; GFX908-NEXT: [[S_MOV_B32_52:%[0-9]+]]:sgpr_32 = S_MOV_B32 51
+ ; GFX908-NEXT: [[S_MOV_B32_53:%[0-9]+]]:sgpr_32 = S_MOV_B32 52
+ ; GFX908-NEXT: [[S_MOV_B32_54:%[0-9]+]]:sgpr_32 = S_MOV_B32 53
+ ; GFX908-NEXT: [[S_MOV_B32_55:%[0-9]+]]:sgpr_32 = S_MOV_B32 54
+ ; GFX908-NEXT: [[S_MOV_B32_56:%[0-9]+]]:sgpr_32 = S_MOV_B32 55
+ ; GFX908-NEXT: [[S_MOV_B32_57:%[0-9]+]]:sgpr_32 = S_MOV_B32 56
+ ; GFX908-NEXT: [[S_MOV_B32_58:%[0-9]+]]:sgpr_32 = S_MOV_B32 57
+ ; GFX908-NEXT: [[S_MOV_B32_59:%[0-9]+]]:sgpr_32 = S_MOV_B32 58
+ ; GFX908-NEXT: [[S_MOV_B32_60:%[0-9]+]]:sgpr_32 = S_MOV_B32 59
+ ; GFX908-NEXT: [[S_MOV_B32_61:%[0-9]+]]:sgpr_32 = S_MOV_B32 60
+ ; GFX908-NEXT: [[S_MOV_B32_62:%[0-9]+]]:sgpr_32 = S_MOV_B32 61
+ ; GFX908-NEXT: [[S_MOV_B32_63:%[0-9]+]]:sgpr_32 = S_MOV_B32 62
+ ; GFX908-NEXT: [[S_MOV_B32_64:%[0-9]+]]:sgpr_32 = S_MOV_B32 63
+ ; GFX908-NEXT: [[S_MOV_B32_65:%[0-9]+]]:sgpr_32 = S_MOV_B32 64
+ ; GFX908-NEXT: [[S_MOV_B32_66:%[0-9]+]]:sgpr_32 = S_MOV_B32 65
+ ; GFX908-NEXT: [[S_MOV_B32_67:%[0-9]+]]:sgpr_32 = S_MOV_B32 66
+ ; GFX908-NEXT: [[S_MOV_B32_68:%[0-9]+]]:sgpr_32 = S_MOV_B32 67
+ ; GFX908-NEXT: [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 68
+ ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 69
+ ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 70
+ ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 71
+ ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 72
+ ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 73
+ ; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 74
+ ; GFX908-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 75
+ ; GFX908-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 76
+ ; GFX908-NEXT: [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 77
+ ; GFX908-NEXT: [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 78
+ ; GFX908-NEXT: [[S_MOV_B32_80:%[0-9]+]]:sgpr_32 = S_MOV_B32 79
+ ; GFX908-NEXT: [[S_MOV_B32_81:%[0-9]+]]:sgpr_32 = S_MOV_B32 80
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -1224,8 +1224,8 @@ body: |
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]]
; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
- ; GFX908-NEXT: undef %4.sub0:sreg_64 = S_ADD_I32 %4.sub0, -1, implicit-def dead $scc
- ; GFX908-NEXT: S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc
+ ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc
; GFX908-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.4:
@@ -1248,47 +1248,47 @@ body: |
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]], implicit [[V_CVT_I32_F64_e32_25]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_26]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_4]], implicit [[S_MOV_B32_5]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_6]], implicit [[S_MOV_B32_7]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_8]], implicit [[S_MOV_B32_9]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_10]], implicit [[S_MOV_B32_11]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_12]], implicit [[S_MOV_B32_13]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_14]], implicit [[S_MOV_B32_15]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_16]], implicit [[S_MOV_B32_17]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_18]], implicit [[S_MOV_B32_19]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_20]], implicit [[S_MOV_B32_21]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_22]], implicit [[S_MOV_B32_23]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_24]], implicit [[S_MOV_B32_25]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_26]], implicit [[S_MOV_B32_27]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_28]], implicit [[S_MOV_B32_29]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_30]], implicit [[S_MOV_B32_31]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_32]], implicit [[S_MOV_B32_33]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_34]], implicit [[S_MOV_B32_35]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_36]], implicit [[S_MOV_B32_37]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_38]], implicit [[S_MOV_B32_39]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_40]], implicit [[S_MOV_B32_41]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_42]], implicit [[S_MOV_B32_43]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_44]], implicit [[S_MOV_B32_45]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_46]], implicit [[S_MOV_B32_47]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_48]], implicit [[S_MOV_B32_49]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_50]], implicit [[S_MOV_B32_51]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_52]], implicit [[S_MOV_B32_53]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_54]], implicit [[S_MOV_B32_55]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_56]], implicit [[S_MOV_B32_57]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_58]], implicit [[S_MOV_B32_59]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_60]], implicit [[S_MOV_B32_61]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_62]], implicit [[S_MOV_B32_63]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_64]], implicit [[S_MOV_B32_65]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_66]], implicit [[S_MOV_B32_67]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_68]], implicit [[S_MOV_B32_69]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_70]], implicit [[S_MOV_B32_71]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_72]], implicit [[S_MOV_B32_73]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_74]], implicit [[S_MOV_B32_75]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_76]], implicit [[S_MOV_B32_77]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_78]], implicit [[S_MOV_B32_79]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_80]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_5]], implicit [[S_MOV_B32_6]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_7]], implicit [[S_MOV_B32_8]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_9]], implicit [[S_MOV_B32_10]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_11]], implicit [[S_MOV_B32_12]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_13]], implicit [[S_MOV_B32_14]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_15]], implicit [[S_MOV_B32_16]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_17]], implicit [[S_MOV_B32_18]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_19]], implicit [[S_MOV_B32_20]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_21]], implicit [[S_MOV_B32_22]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_23]], implicit [[S_MOV_B32_24]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_25]], implicit [[S_MOV_B32_26]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_27]], implicit [[S_MOV_B32_28]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_29]], implicit [[S_MOV_B32_30]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_31]], implicit [[S_MOV_B32_32]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_33]], implicit [[S_MOV_B32_34]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_35]], implicit [[S_MOV_B32_36]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_37]], implicit [[S_MOV_B32_38]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_39]], implicit [[S_MOV_B32_40]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_41]], implicit [[S_MOV_B32_42]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_43]], implicit [[S_MOV_B32_44]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_45]], implicit [[S_MOV_B32_46]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_47]], implicit [[S_MOV_B32_48]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_49]], implicit [[S_MOV_B32_50]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_51]], implicit [[S_MOV_B32_52]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_53]], implicit [[S_MOV_B32_54]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_55]], implicit [[S_MOV_B32_56]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_57]], implicit [[S_MOV_B32_58]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_59]], implicit [[S_MOV_B32_60]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_61]], implicit [[S_MOV_B32_62]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_63]], implicit [[S_MOV_B32_64]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_65]], implicit [[S_MOV_B32_66]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_67]], implicit [[S_MOV_B32_68]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_69]], implicit [[S_MOV_B32_70]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_71]], implicit [[S_MOV_B32_72]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_73]], implicit [[S_MOV_B32_74]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_75]], implicit [[S_MOV_B32_76]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_77]], implicit [[S_MOV_B32_78]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_79]], implicit [[S_MOV_B32_80]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_81]]
; GFX908-NEXT: S_ENDPGM 0
bb.0:
liveins: $vgpr0, $sgpr0_sgpr1
@@ -1620,10 +1620,10 @@ body: |
; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; GFX908-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
- ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
- ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
@@ -1647,87 +1647,87 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
- ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
- ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1
- ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 2
- ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 3
- ; GFX908-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 4
- ; GFX908-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 5
- ; GFX908-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 6
- ; GFX908-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sgpr_32 = S_MOV_B32 7
- ; GFX908-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sgpr_32 = S_MOV_B32 8
- ; GFX908-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sgpr_32 = S_MOV_B32 9
- ; GFX908-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sgpr_32 = S_MOV_B32 10
- ; GFX908-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sgpr_32 = S_MOV_B32 11
- ; GFX908-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sgpr_32 = S_MOV_B32 12
- ; GFX908-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sgpr_32 = S_MOV_B32 13
- ; GFX908-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sgpr_32 = S_MOV_B32 14
- ; GFX908-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sgpr_32 = S_MOV_B32 15
- ; GFX908-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sgpr_32 = S_MOV_B32 16
- ; GFX908-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sgpr_32 = S_MOV_B32 17
- ; GFX908-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sgpr_32 = S_MOV_B32 18
- ; GFX908-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sgpr_32 = S_MOV_B32 19
- ; GFX908-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sgpr_32 = S_MOV_B32 20
- ; GFX908-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sgpr_32 = S_MOV_B32 21
- ; GFX908-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sgpr_32 = S_MOV_B32 22
- ; GFX908-NEXT: [[S_MOV_B32_23:%[0-9]+]]:sgpr_32 = S_MOV_B32 23
- ; GFX908-NEXT: [[S_MOV_B32_24:%[0-9]+]]:sgpr_32 = S_MOV_B32 24
- ; GFX908-NEXT: [[S_MOV_B32_25:%[0-9]+]]:sgpr_32 = S_MOV_B32 25
- ; GFX908-NEXT: [[S_MOV_B32_26:%[0-9]+]]:sgpr_32 = S_MOV_B32 26
- ; GFX908-NEXT: [[S_MOV_B32_27:%[0-9]+]]:sgpr_32 = S_MOV_B32 27
- ; GFX908-NEXT: [[S_MOV_B32_28:%[0-9]+]]:sgpr_32 = S_MOV_B32 28
- ; GFX908-NEXT: [[S_MOV_B32_29:%[0-9]+]]:sgpr_32 = S_MOV_B32 29
- ; GFX908-NEXT: [[S_MOV_B32_30:%[0-9]+]]:sgpr_32 = S_MOV_B32 30
- ; GFX908-NEXT: [[S_MOV_B32_31:%[0-9]+]]:sgpr_32 = S_MOV_B32 31
- ; GFX908-NEXT: [[S_MOV_B32_32:%[0-9]+]]:sgpr_32 = S_MOV_B32 32
- ; GFX908-NEXT: [[S_MOV_B32_33:%[0-9]+]]:sgpr_32 = S_MOV_B32 33
- ; GFX908-NEXT: [[S_MOV_B32_34:%[0-9]+]]:sgpr_32 = S_MOV_B32 34
- ; GFX908-NEXT: [[S_MOV_B32_35:%[0-9]+]]:sgpr_32 = S_MOV_B32 35
- ; GFX908-NEXT: [[S_MOV_B32_36:%[0-9]+]]:sgpr_32 = S_MOV_B32 36
- ; GFX908-NEXT: [[S_MOV_B32_37:%[0-9]+]]:sgpr_32 = S_MOV_B32 37
- ; GFX908-NEXT: [[S_MOV_B32_38:%[0-9]+]]:sgpr_32 = S_MOV_B32 38
- ; GFX908-NEXT: [[S_MOV_B32_39:%[0-9]+]]:sgpr_32 = S_MOV_B32 39
- ; GFX908-NEXT: [[S_MOV_B32_40:%[0-9]+]]:sgpr_32 = S_MOV_B32 40
- ; GFX908-NEXT: [[S_MOV_B32_41:%[0-9]+]]:sgpr_32 = S_MOV_B32 41
- ; GFX908-NEXT: [[S_MOV_B32_42:%[0-9]+]]:sgpr_32 = S_MOV_B32 42
- ; GFX908-NEXT: [[S_MOV_B32_43:%[0-9]+]]:sgpr_32 = S_MOV_B32 43
- ; GFX908-NEXT: [[S_MOV_B32_44:%[0-9]+]]:sgpr_32 = S_MOV_B32 44
- ; GFX908-NEXT: [[S_MOV_B32_45:%[0-9]+]]:sgpr_32 = S_MOV_B32 45
- ; GFX908-NEXT: [[S_MOV_B32_46:%[0-9]+]]:sgpr_32 = S_MOV_B32 46
- ; GFX908-NEXT: [[S_MOV_B32_47:%[0-9]+]]:sgpr_32 = S_MOV_B32 47
- ; GFX908-NEXT: [[S_MOV_B32_48:%[0-9]+]]:sgpr_32 = S_MOV_B32 48
- ; GFX908-NEXT: [[S_MOV_B32_49:%[0-9]+]]:sgpr_32 = S_MOV_B32 49
- ; GFX908-NEXT: [[S_MOV_B32_50:%[0-9]+]]:sgpr_32 = S_MOV_B32 50
- ; GFX908-NEXT: [[S_MOV_B32_51:%[0-9]+]]:sgpr_32 = S_MOV_B32 51
- ; GFX908-NEXT: [[S_MOV_B32_52:%[0-9]+]]:sgpr_32 = S_MOV_B32 52
- ; GFX908-NEXT: [[S_MOV_B32_53:%[0-9]+]]:sgpr_32 = S_MOV_B32 53
- ; GFX908-NEXT: [[S_MOV_B32_54:%[0-9]+]]:sgpr_32 = S_MOV_B32 54
- ; GFX908-NEXT: [[S_MOV_B32_55:%[0-9]+]]:sgpr_32 = S_MOV_B32 55
- ; GFX908-NEXT: [[S_MOV_B32_56:%[0-9]+]]:sgpr_32 = S_MOV_B32 56
- ; GFX908-NEXT: [[S_MOV_B32_57:%[0-9]+]]:sgpr_32 = S_MOV_B32 57
- ; GFX908-NEXT: [[S_MOV_B32_58:%[0-9]+]]:sgpr_32 = S_MOV_B32 58
- ; GFX908-NEXT: [[S_MOV_B32_59:%[0-9]+]]:sgpr_32 = S_MOV_B32 59
- ; GFX908-NEXT: [[S_MOV_B32_60:%[0-9]+]]:sgpr_32 = S_MOV_B32 60
- ; GFX908-NEXT: [[S_MOV_B32_61:%[0-9]+]]:sgpr_32 = S_MOV_B32 61
- ; GFX908-NEXT: [[S_MOV_B32_62:%[0-9]+]]:sgpr_32 = S_MOV_B32 62
- ; GFX908-NEXT: [[S_MOV_B32_63:%[0-9]+]]:sgpr_32 = S_MOV_B32 63
- ; GFX908-NEXT: [[S_MOV_B32_64:%[0-9]+]]:sgpr_32 = S_MOV_B32 64
- ; GFX908-NEXT: [[S_MOV_B32_65:%[0-9]+]]:sgpr_32 = S_MOV_B32 65
- ; GFX908-NEXT: [[S_MOV_B32_66:%[0-9]+]]:sgpr_32 = S_MOV_B32 66
- ; GFX908-NEXT: [[S_MOV_B32_67:%[0-9]+]]:sgpr_32 = S_MOV_B32 67
- ; GFX908-NEXT: [[S_MOV_B32_68:%[0-9]+]]:sgpr_32 = S_MOV_B32 68
- ; GFX908-NEXT: [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 69
- ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 70
- ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 71
- ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 72
- ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 73
- ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 74
- ; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 75
- ; GFX908-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 76
- ; GFX908-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 77
- ; GFX908-NEXT: [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 78
- ; GFX908-NEXT: [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 79
- ; GFX908-NEXT: [[S_MOV_B32_80:%[0-9]+]]:sgpr_32 = S_MOV_B32 80
+ ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+ ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1
+ ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 2
+ ; GFX908-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 3
+ ; GFX908-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 4
+ ; GFX908-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 5
+ ; GFX908-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sgpr_32 = S_MOV_B32 6
+ ; GFX908-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sgpr_32 = S_MOV_B32 7
+ ; GFX908-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sgpr_32 = S_MOV_B32 8
+ ; GFX908-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sgpr_32 = S_MOV_B32 9
+ ; GFX908-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sgpr_32 = S_MOV_B32 10
+ ; GFX908-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sgpr_32 = S_MOV_B32 11
+ ; GFX908-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sgpr_32 = S_MOV_B32 12
+ ; GFX908-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sgpr_32 = S_MOV_B32 13
+ ; GFX908-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sgpr_32 = S_MOV_B32 14
+ ; GFX908-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sgpr_32 = S_MOV_B32 15
+ ; GFX908-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sgpr_32 = S_MOV_B32 16
+ ; GFX908-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sgpr_32 = S_MOV_B32 17
+ ; GFX908-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sgpr_32 = S_MOV_B32 18
+ ; GFX908-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sgpr_32 = S_MOV_B32 19
+ ; GFX908-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sgpr_32 = S_MOV_B32 20
+ ; GFX908-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sgpr_32 = S_MOV_B32 21
+ ; GFX908-NEXT: [[S_MOV_B32_23:%[0-9]+]]:sgpr_32 = S_MOV_B32 22
+ ; GFX908-NEXT: [[S_MOV_B32_24:%[0-9]+]]:sgpr_32 = S_MOV_B32 23
+ ; GFX908-NEXT: [[S_MOV_B32_25:%[0-9]+]]:sgpr_32 = S_MOV_B32 24
+ ; GFX908-NEXT: [[S_MOV_B32_26:%[0-9]+]]:sgpr_32 = S_MOV_B32 25
+ ; GFX908-NEXT: [[S_MOV_B32_27:%[0-9]+]]:sgpr_32 = S_MOV_B32 26
+ ; GFX908-NEXT: [[S_MOV_B32_28:%[0-9]+]]:sgpr_32 = S_MOV_B32 27
+ ; GFX908-NEXT: [[S_MOV_B32_29:%[0-9]+]]:sgpr_32 = S_MOV_B32 28
+ ; GFX908-NEXT: [[S_MOV_B32_30:%[0-9]+]]:sgpr_32 = S_MOV_B32 29
+ ; GFX908-NEXT: [[S_MOV_B32_31:%[0-9]+]]:sgpr_32 = S_MOV_B32 30
+ ; GFX908-NEXT: [[S_MOV_B32_32:%[0-9]+]]:sgpr_32 = S_MOV_B32 31
+ ; GFX908-NEXT: [[S_MOV_B32_33:%[0-9]+]]:sgpr_32 = S_MOV_B32 32
+ ; GFX908-NEXT: [[S_MOV_B32_34:%[0-9]+]]:sgpr_32 = S_MOV_B32 33
+ ; GFX908-NEXT: [[S_MOV_B32_35:%[0-9]+]]:sgpr_32 = S_MOV_B32 34
+ ; GFX908-NEXT: [[S_MOV_B32_36:%[0-9]+]]:sgpr_32 = S_MOV_B32 35
+ ; GFX908-NEXT: [[S_MOV_B32_37:%[0-9]+]]:sgpr_32 = S_MOV_B32 36
+ ; GFX908-NEXT: [[S_MOV_B32_38:%[0-9]+]]:sgpr_32 = S_MOV_B32 37
+ ; GFX908-NEXT: [[S_MOV_B32_39:%[0-9]+]]:sgpr_32 = S_MOV_B32 38
+ ; GFX908-NEXT: [[S_MOV_B32_40:%[0-9]+]]:sgpr_32 = S_MOV_B32 39
+ ; GFX908-NEXT: [[S_MOV_B32_41:%[0-9]+]]:sgpr_32 = S_MOV_B32 40
+ ; GFX908-NEXT: [[S_MOV_B32_42:%[0-9]+]]:sgpr_32 = S_MOV_B32 41
+ ; GFX908-NEXT: [[S_MOV_B32_43:%[0-9]+]]:sgpr_32 = S_MOV_B32 42
+ ; GFX908-NEXT: [[S_MOV_B32_44:%[0-9]+]]:sgpr_32 = S_MOV_B32 43
+ ; GFX908-NEXT: [[S_MOV_B32_45:%[0-9]+]]:sgpr_32 = S_MOV_B32 44
+ ; GFX908-NEXT: [[S_MOV_B32_46:%[0-9]+]]:sgpr_32 = S_MOV_B32 45
+ ; GFX908-NEXT: [[S_MOV_B32_47:%[0-9]+]]:sgpr_32 = S_MOV_B32 46
+ ; GFX908-NEXT: [[S_MOV_B32_48:%[0-9]+]]:sgpr_32 = S_MOV_B32 47
+ ; GFX908-NEXT: [[S_MOV_B32_49:%[0-9]+]]:sgpr_32 = S_MOV_B32 48
+ ; GFX908-NEXT: [[S_MOV_B32_50:%[0-9]+]]:sgpr_32 = S_MOV_B32 49
+ ; GFX908-NEXT: [[S_MOV_B32_51:%[0-9]+]]:sgpr_32 = S_MOV_B32 50
+ ; GFX908-NEXT: [[S_MOV_B32_52:%[0-9]+]]:sgpr_32 = S_MOV_B32 51
+ ; GFX908-NEXT: [[S_MOV_B32_53:%[0-9]+]]:sgpr_32 = S_MOV_B32 52
+ ; GFX908-NEXT: [[S_MOV_B32_54:%[0-9]+]]:sgpr_32 = S_MOV_B32 53
+ ; GFX908-NEXT: [[S_MOV_B32_55:%[0-9]+]]:sgpr_32 = S_MOV_B32 54
+ ; GFX908-NEXT: [[S_MOV_B32_56:%[0-9]+]]:sgpr_32 = S_MOV_B32 55
+ ; GFX908-NEXT: [[S_MOV_B32_57:%[0-9]+]]:sgpr_32 = S_MOV_B32 56
+ ; GFX908-NEXT: [[S_MOV_B32_58:%[0-9]+]]:sgpr_32 = S_MOV_B32 57
+ ; GFX908-NEXT: [[S_MOV_B32_59:%[0-9]+]]:sgpr_32 = S_MOV_B32 58
+ ; GFX908-NEXT: [[S_MOV_B32_60:%[0-9]+]]:sgpr_32 = S_MOV_B32 59
+ ; GFX908-NEXT: [[S_MOV_B32_61:%[0-9]+]]:sgpr_32 = S_MOV_B32 60
+ ; GFX908-NEXT: [[S_MOV_B32_62:%[0-9]+]]:sgpr_32 = S_MOV_B32 61
+ ; GFX908-NEXT: [[S_MOV_B32_63:%[0-9]+]]:sgpr_32 = S_MOV_B32 62
+ ; GFX908-NEXT: [[S_MOV_B32_64:%[0-9]+]]:sgpr_32 = S_MOV_B32 63
+ ; GFX908-NEXT: [[S_MOV_B32_65:%[0-9]+]]:sgpr_32 = S_MOV_B32 64
+ ; GFX908-NEXT: [[S_MOV_B32_66:%[0-9]+]]:sgpr_32 = S_MOV_B32 65
+ ; GFX908-NEXT: [[S_MOV_B32_67:%[0-9]+]]:sgpr_32 = S_MOV_B32 66
+ ; GFX908-NEXT: [[S_MOV_B32_68:%[0-9]+]]:sgpr_32 = S_MOV_B32 67
+ ; GFX908-NEXT: [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 68
+ ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 69
+ ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 70
+ ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 71
+ ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 72
+ ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 73
+ ; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 74
+ ; GFX908-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 75
+ ; GFX908-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 76
+ ; GFX908-NEXT: [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 77
+ ; GFX908-NEXT: [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 78
+ ; GFX908-NEXT: [[S_MOV_B32_80:%[0-9]+]]:sgpr_32 = S_MOV_B32 79
+ ; GFX908-NEXT: [[S_MOV_B32_81:%[0-9]+]]:sgpr_32 = S_MOV_B32 80
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -1749,8 +1749,8 @@ body: |
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
- ; GFX908-NEXT: undef %4.sub0:sreg_64 = S_ADD_I32 %4.sub0, -1, implicit-def dead $scc
- ; GFX908-NEXT: S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc
+ ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc
; GFX908-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.4:
@@ -1771,47 +1771,47 @@ body: |
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_9]], implicit [[V_CVT_I32_F64_e32_19]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_4]], implicit [[S_MOV_B32_5]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_6]], implicit [[S_MOV_B32_7]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_8]], implicit [[S_MOV_B32_9]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_10]], implicit [[S_MOV_B32_11]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_12]], implicit [[S_MOV_B32_13]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_14]], implicit [[S_MOV_B32_15]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_16]], implicit [[S_MOV_B32_17]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_18]], implicit [[S_MOV_B32_19]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_20]], implicit [[S_MOV_B32_21]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_22]], implicit [[S_MOV_B32_23]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_24]], implicit [[S_MOV_B32_25]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_26]], implicit [[S_MOV_B32_27]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_28]], implicit [[S_MOV_B32_29]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_30]], implicit [[S_MOV_B32_31]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_32]], implicit [[S_MOV_B32_33]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_34]], implicit [[S_MOV_B32_35]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_36]], implicit [[S_MOV_B32_37]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_38]], implicit [[S_MOV_B32_39]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_40]], implicit [[S_MOV_B32_41]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_42]], implicit [[S_MOV_B32_43]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_44]], implicit [[S_MOV_B32_45]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_46]], implicit [[S_MOV_B32_47]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_48]], implicit [[S_MOV_B32_49]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_50]], implicit [[S_MOV_B32_51]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_52]], implicit [[S_MOV_B32_53]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_54]], implicit [[S_MOV_B32_55]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_56]], implicit [[S_MOV_B32_57]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_58]], implicit [[S_MOV_B32_59]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_60]], implicit [[S_MOV_B32_61]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_62]], implicit [[S_MOV_B32_63]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_64]], implicit [[S_MOV_B32_65]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_66]], implicit [[S_MOV_B32_67]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_68]], implicit [[S_MOV_B32_69]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_70]], implicit [[S_MOV_B32_71]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_72]], implicit [[S_MOV_B32_73]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_74]], implicit [[S_MOV_B32_75]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_76]], implicit [[S_MOV_B32_77]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_78]], implicit [[S_MOV_B32_79]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_80]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_5]], implicit [[S_MOV_B32_6]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_7]], implicit [[S_MOV_B32_8]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_9]], implicit [[S_MOV_B32_10]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_11]], implicit [[S_MOV_B32_12]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_13]], implicit [[S_MOV_B32_14]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_15]], implicit [[S_MOV_B32_16]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_17]], implicit [[S_MOV_B32_18]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_19]], implicit [[S_MOV_B32_20]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_21]], implicit [[S_MOV_B32_22]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_23]], implicit [[S_MOV_B32_24]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_25]], implicit [[S_MOV_B32_26]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_27]], implicit [[S_MOV_B32_28]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_29]], implicit [[S_MOV_B32_30]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_31]], implicit [[S_MOV_B32_32]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_33]], implicit [[S_MOV_B32_34]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_35]], implicit [[S_MOV_B32_36]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_37]], implicit [[S_MOV_B32_38]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_39]], implicit [[S_MOV_B32_40]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_41]], implicit [[S_MOV_B32_42]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_43]], implicit [[S_MOV_B32_44]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_45]], implicit [[S_MOV_B32_46]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_47]], implicit [[S_MOV_B32_48]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_49]], implicit [[S_MOV_B32_50]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_51]], implicit [[S_MOV_B32_52]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_53]], implicit [[S_MOV_B32_54]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_55]], implicit [[S_MOV_B32_56]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_57]], implicit [[S_MOV_B32_58]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_59]], implicit [[S_MOV_B32_60]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_61]], implicit [[S_MOV_B32_62]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_63]], implicit [[S_MOV_B32_64]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_65]], implicit [[S_MOV_B32_66]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_67]], implicit [[S_MOV_B32_68]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_69]], implicit [[S_MOV_B32_70]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_71]], implicit [[S_MOV_B32_72]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_73]], implicit [[S_MOV_B32_74]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_75]], implicit [[S_MOV_B32_76]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_77]], implicit [[S_MOV_B32_78]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_79]], implicit [[S_MOV_B32_80]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_81]]
; GFX908-NEXT: S_ENDPGM 0
bb.0:
liveins: $vgpr0, $sgpr0_sgpr1
@@ -2026,10 +2026,10 @@ body: |
; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; GFX908-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
- ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
- ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
@@ -2053,138 +2053,138 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
- ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
- ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1
- ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 2
- ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 3
- ; GFX908-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 4
- ; GFX908-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 5
- ; GFX908-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 6
- ; GFX908-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sgpr_32 = S_MOV_B32 7
- ; GFX908-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sgpr_32 = S_MOV_B32 8
- ; GFX908-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sgpr_32 = S_MOV_B32 9
- ; GFX908-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sgpr_32 = S_MOV_B32 10
- ; GFX908-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sgpr_32 = S_MOV_B32 11
- ; GFX908-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sgpr_32 = S_MOV_B32 12
- ; GFX908-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sgpr_32 = S_MOV_B32 13
- ; GFX908-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sgpr_32 = S_MOV_B32 14
- ; GFX908-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sgpr_32 = S_MOV_B32 15
- ; GFX908-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sgpr_32 = S_MOV_B32 16
- ; GFX908-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sgpr_32 = S_MOV_B32 17
- ; GFX908-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sgpr_32 = S_MOV_B32 18
- ; GFX908-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sgpr_32 = S_MOV_B32 19
- ; GFX908-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sgpr_32 = S_MOV_B32 20
- ; GFX908-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sgpr_32 = S_MOV_B32 21
- ; GFX908-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sgpr_32 = S_MOV_B32 22
- ; GFX908-NEXT: [[S_MOV_B32_23:%[0-9]+]]:sgpr_32 = S_MOV_B32 23
- ; GFX908-NEXT: [[S_MOV_B32_24:%[0-9]+]]:sgpr_32 = S_MOV_B32 24
- ; GFX908-NEXT: [[S_MOV_B32_25:%[0-9]+]]:sgpr_32 = S_MOV_B32 25
- ; GFX908-NEXT: [[S_MOV_B32_26:%[0-9]+]]:sgpr_32 = S_MOV_B32 26
- ; GFX908-NEXT: [[S_MOV_B32_27:%[0-9]+]]:sgpr_32 = S_MOV_B32 27
- ; GFX908-NEXT: [[S_MOV_B32_28:%[0-9]+]]:sgpr_32 = S_MOV_B32 28
- ; GFX908-NEXT: [[S_MOV_B32_29:%[0-9]+]]:sgpr_32 = S_MOV_B32 29
- ; GFX908-NEXT: [[S_MOV_B32_30:%[0-9]+]]:sgpr_32 = S_MOV_B32 30
- ; GFX908-NEXT: [[S_MOV_B32_31:%[0-9]+]]:sgpr_32 = S_MOV_B32 31
- ; GFX908-NEXT: [[S_MOV_B32_32:%[0-9]+]]:sgpr_32 = S_MOV_B32 32
- ; GFX908-NEXT: [[S_MOV_B32_33:%[0-9]+]]:sgpr_32 = S_MOV_B32 33
- ; GFX908-NEXT: [[S_MOV_B32_34:%[0-9]+]]:sgpr_32 = S_MOV_B32 34
- ; GFX908-NEXT: [[S_MOV_B32_35:%[0-9]+]]:sgpr_32 = S_MOV_B32 35
- ; GFX908-NEXT: [[S_MOV_B32_36:%[0-9]+]]:sgpr_32 = S_MOV_B32 36
- ; GFX908-NEXT: [[S_MOV_B32_37:%[0-9]+]]:sgpr_32 = S_MOV_B32 37
- ; GFX908-NEXT: [[S_MOV_B32_38:%[0-9]+]]:sgpr_32 = S_MOV_B32 38
- ; GFX908-NEXT: [[S_MOV_B32_39:%[0-9]+]]:sgpr_32 = S_MOV_B32 39
- ; GFX908-NEXT: [[S_MOV_B32_40:%[0-9]+]]:sgpr_32 = S_MOV_B32 40
- ; GFX908-NEXT: [[S_MOV_B32_41:%[0-9]+]]:sgpr_32 = S_MOV_B32 41
- ; GFX908-NEXT: [[S_MOV_B32_42:%[0-9]+]]:sgpr_32 = S_MOV_B32 42
- ; GFX908-NEXT: [[S_MOV_B32_43:%[0-9]+]]:sgpr_32 = S_MOV_B32 43
- ; GFX908-NEXT: [[S_MOV_B32_44:%[0-9]+]]:sgpr_32 = S_MOV_B32 44
- ; GFX908-NEXT: [[S_MOV_B32_45:%[0-9]+]]:sgpr_32 = S_MOV_B32 45
- ; GFX908-NEXT: [[S_MOV_B32_46:%[0-9]+]]:sgpr_32 = S_MOV_B32 46
- ; GFX908-NEXT: [[S_MOV_B32_47:%[0-9]+]]:sgpr_32 = S_MOV_B32 47
- ; GFX908-NEXT: [[S_MOV_B32_48:%[0-9]+]]:sgpr_32 = S_MOV_B32 48
- ; GFX908-NEXT: [[S_MOV_B32_49:%[0-9]+]]:sgpr_32 = S_MOV_B32 49
- ; GFX908-NEXT: [[S_MOV_B32_50:%[0-9]+]]:sgpr_32 = S_MOV_B32 50
- ; GFX908-NEXT: [[S_MOV_B32_51:%[0-9]+]]:sgpr_32 = S_MOV_B32 51
- ; GFX908-NEXT: [[S_MOV_B32_52:%[0-9]+]]:sgpr_32 = S_MOV_B32 52
- ; GFX908-NEXT: [[S_MOV_B32_53:%[0-9]+]]:sgpr_32 = S_MOV_B32 53
- ; GFX908-NEXT: [[S_MOV_B32_54:%[0-9]+]]:sgpr_32 = S_MOV_B32 54
- ; GFX908-NEXT: [[S_MOV_B32_55:%[0-9]+]]:sgpr_32 = S_MOV_B32 55
- ; GFX908-NEXT: [[S_MOV_B32_56:%[0-9]+]]:sgpr_32 = S_MOV_B32 56
- ; GFX908-NEXT: [[S_MOV_B32_57:%[0-9]+]]:sgpr_32 = S_MOV_B32 57
- ; GFX908-NEXT: [[S_MOV_B32_58:%[0-9]+]]:sgpr_32 = S_MOV_B32 58
- ; GFX908-NEXT: [[S_MOV_B32_59:%[0-9]+]]:sgpr_32 = S_MOV_B32 59
- ; GFX908-NEXT: [[S_MOV_B32_60:%[0-9]+]]:sgpr_32 = S_MOV_B32 60
- ; GFX908-NEXT: [[S_MOV_B32_61:%[0-9]+]]:sgpr_32 = S_MOV_B32 61
- ; GFX908-NEXT: [[S_MOV_B32_62:%[0-9]+]]:sgpr_32 = S_MOV_B32 62
- ; GFX908-NEXT: [[S_MOV_B32_63:%[0-9]+]]:sgpr_32 = S_MOV_B32 63
- ; GFX908-NEXT: [[S_MOV_B32_64:%[0-9]+]]:sgpr_32 = S_MOV_B32 64
- ; GFX908-NEXT: [[S_MOV_B32_65:%[0-9]+]]:sgpr_32 = S_MOV_B32 65
- ; GFX908-NEXT: [[S_MOV_B32_66:%[0-9]+]]:sgpr_32 = S_MOV_B32 66
- ; GFX908-NEXT: [[S_MOV_B32_67:%[0-9]+]]:sgpr_32 = S_MOV_B32 67
- ; GFX908-NEXT: [[S_MOV_B32_68:%[0-9]+]]:sgpr_32 = S_MOV_B32 68
- ; GFX908-NEXT: [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 69
- ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 70
- ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 71
- ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 72
- ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 73
- ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 74
- ; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 75
- ; GFX908-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 76
- ; GFX908-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 77
- ; GFX908-NEXT: [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 78
- ; GFX908-NEXT: [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 79
- ; GFX908-NEXT: [[S_MOV_B32_80:%[0-9]+]]:sgpr_32 = S_MOV_B32 80
- ; GFX908-NEXT: [[S_MOV_B32_81:%[0-9]+]]:sgpr_32 = S_MOV_B32 81
- ; GFX908-NEXT: [[S_MOV_B32_82:%[0-9]+]]:sgpr_32 = S_MOV_B32 82
- ; GFX908-NEXT: [[S_MOV_B32_83:%[0-9]+]]:sgpr_32 = S_MOV_B32 83
- ; GFX908-NEXT: [[S_MOV_B32_84:%[0-9]+]]:sgpr_32 = S_MOV_B32 84
+ ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+ ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1
+ ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 2
+ ; GFX908-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 3
+ ; GFX908-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 4
+ ; GFX908-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 5
+ ; GFX908-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sgpr_32 = S_MOV_B32 6
+ ; GFX908-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sgpr_32 = S_MOV_B32 7
+ ; GFX908-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sgpr_32 = S_MOV_B32 8
+ ; GFX908-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sgpr_32 = S_MOV_B32 9
+ ; GFX908-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sgpr_32 = S_MOV_B32 10
+ ; GFX908-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sgpr_32 = S_MOV_B32 11
+ ; GFX908-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sgpr_32 = S_MOV_B32 12
+ ; GFX908-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sgpr_32 = S_MOV_B32 13
+ ; GFX908-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sgpr_32 = S_MOV_B32 14
+ ; GFX908-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sgpr_32 = S_MOV_B32 15
+ ; GFX908-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sgpr_32 = S_MOV_B32 16
+ ; GFX908-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sgpr_32 = S_MOV_B32 17
+ ; GFX908-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sgpr_32 = S_MOV_B32 18
+ ; GFX908-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sgpr_32 = S_MOV_B32 19
+ ; GFX908-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sgpr_32 = S_MOV_B32 20
+ ; GFX908-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sgpr_32 = S_MOV_B32 21
+ ; GFX908-NEXT: [[S_MOV_B32_23:%[0-9]+]]:sgpr_32 = S_MOV_B32 22
+ ; GFX908-NEXT: [[S_MOV_B32_24:%[0-9]+]]:sgpr_32 = S_MOV_B32 23
+ ; GFX908-NEXT: [[S_MOV_B32_25:%[0-9]+]]:sgpr_32 = S_MOV_B32 24
+ ; GFX908-NEXT: [[S_MOV_B32_26:%[0-9]+]]:sgpr_32 = S_MOV_B32 25
+ ; GFX908-NEXT: [[S_MOV_B32_27:%[0-9]+]]:sgpr_32 = S_MOV_B32 26
+ ; GFX908-NEXT: [[S_MOV_B32_28:%[0-9]+]]:sgpr_32 = S_MOV_B32 27
+ ; GFX908-NEXT: [[S_MOV_B32_29:%[0-9]+]]:sgpr_32 = S_MOV_B32 28
+ ; GFX908-NEXT: [[S_MOV_B32_30:%[0-9]+]]:sgpr_32 = S_MOV_B32 29
+ ; GFX908-NEXT: [[S_MOV_B32_31:%[0-9]+]]:sgpr_32 = S_MOV_B32 30
+ ; GFX908-NEXT: [[S_MOV_B32_32:%[0-9]+]]:sgpr_32 = S_MOV_B32 31
+ ; GFX908-NEXT: [[S_MOV_B32_33:%[0-9]+]]:sgpr_32 = S_MOV_B32 32
+ ; GFX908-NEXT: [[S_MOV_B32_34:%[0-9]+]]:sgpr_32 = S_MOV_B32 33
+ ; GFX908-NEXT: [[S_MOV_B32_35:%[0-9]+]]:sgpr_32 = S_MOV_B32 34
+ ; GFX908-NEXT: [[S_MOV_B32_36:%[0-9]+]]:sgpr_32 = S_MOV_B32 35
+ ; GFX908-NEXT: [[S_MOV_B32_37:%[0-9]+]]:sgpr_32 = S_MOV_B32 36
+ ; GFX908-NEXT: [[S_MOV_B32_38:%[0-9]+]]:sgpr_32 = S_MOV_B32 37
+ ; GFX908-NEXT: [[S_MOV_B32_39:%[0-9]+]]:sgpr_32 = S_MOV_B32 38
+ ; GFX908-NEXT: [[S_MOV_B32_40:%[0-9]+]]:sgpr_32 = S_MOV_B32 39
+ ; GFX908-NEXT: [[S_MOV_B32_41:%[0-9]+]]:sgpr_32 = S_MOV_B32 40
+ ; GFX908-NEXT: [[S_MOV_B32_42:%[0-9]+]]:sgpr_32 = S_MOV_B32 41
+ ; GFX908-NEXT: [[S_MOV_B32_43:%[0-9]+]]:sgpr_32 = S_MOV_B32 42
+ ; GFX908-NEXT: [[S_MOV_B32_44:%[0-9]+]]:sgpr_32 = S_MOV_B32 43
+ ; GFX908-NEXT: [[S_MOV_B32_45:%[0-9]+]]:sgpr_32 = S_MOV_B32 44
+ ; GFX908-NEXT: [[S_MOV_B32_46:%[0-9]+]]:sgpr_32 = S_MOV_B32 45
+ ; GFX908-NEXT: [[S_MOV_B32_47:%[0-9]+]]:sgpr_32 = S_MOV_B32 46
+ ; GFX908-NEXT: [[S_MOV_B32_48:%[0-9]+]]:sgpr_32 = S_MOV_B32 47
+ ; GFX908-NEXT: [[S_MOV_B32_49:%[0-9]+]]:sgpr_32 = S_MOV_B32 48
+ ; GFX908-NEXT: [[S_MOV_B32_50:%[0-9]+]]:sgpr_32 = S_MOV_B32 49
+ ; GFX908-NEXT: [[S_MOV_B32_51:%[0-9]+]]:sgpr_32 = S_MOV_B32 50
+ ; GFX908-NEXT: [[S_MOV_B32_52:%[0-9]+]]:sgpr_32 = S_MOV_B32 51
+ ; GFX908-NEXT: [[S_MOV_B32_53:%[0-9]+]]:sgpr_32 = S_MOV_B32 52
+ ; GFX908-NEXT: [[S_MOV_B32_54:%[0-9]+]]:sgpr_32 = S_MOV_B32 53
+ ; GFX908-NEXT: [[S_MOV_B32_55:%[0-9]+]]:sgpr_32 = S_MOV_B32 54
+ ; GFX908-NEXT: [[S_MOV_B32_56:%[0-9]+]]:sgpr_32 = S_MOV_B32 55
+ ; GFX908-NEXT: [[S_MOV_B32_57:%[0-9]+]]:sgpr_32 = S_MOV_B32 56
+ ; GFX908-NEXT: [[S_MOV_B32_58:%[0-9]+]]:sgpr_32 = S_MOV_B32 57
+ ; GFX908-NEXT: [[S_MOV_B32_59:%[0-9]+]]:sgpr_32 = S_MOV_B32 58
+ ; GFX908-NEXT: [[S_MOV_B32_60:%[0-9]+]]:sgpr_32 = S_MOV_B32 59
+ ; GFX908-NEXT: [[S_MOV_B32_61:%[0-9]+]]:sgpr_32 = S_MOV_B32 60
+ ; GFX908-NEXT: [[S_MOV_B32_62:%[0-9]+]]:sgpr_32 = S_MOV_B32 61
+ ; GFX908-NEXT: [[S_MOV_B32_63:%[0-9]+]]:sgpr_32 = S_MOV_B32 62
+ ; GFX908-NEXT: [[S_MOV_B32_64:%[0-9]+]]:sgpr_32 = S_MOV_B32 63
+ ; GFX908-NEXT: [[S_MOV_B32_65:%[0-9]+]]:sgpr_32 = S_MOV_B32 64
+ ; GFX908-NEXT: [[S_MOV_B32_66:%[0-9]+]]:sgpr_32 = S_MOV_B32 65
+ ; GFX908-NEXT: [[S_MOV_B32_67:%[0-9]+]]:sgpr_32 = S_MOV_B32 66
+ ; GFX908-NEXT: [[S_MOV_B32_68:%[0-9]+]]:sgpr_32 = S_MOV_B32 67
+ ; GFX908-NEXT: [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 68
+ ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 69
+ ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 70
+ ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 71
+ ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 72
+ ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 73
+ ; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 74
+ ; GFX908-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 75
+ ; GFX908-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 76
+ ; GFX908-NEXT: [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 77
+ ; GFX908-NEXT: [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 78
+ ; GFX908-NEXT: [[S_MOV_B32_80:%[0-9]+]]:sgpr_32 = S_MOV_B32 79
+ ; GFX908-NEXT: [[S_MOV_B32_81:%[0-9]+]]:sgpr_32 = S_MOV_B32 80
+ ; GFX908-NEXT: [[S_MOV_B32_82:%[0-9]+]]:sgpr_32 = S_MOV_B32 81
+ ; GFX908-NEXT: [[S_MOV_B32_83:%[0-9]+]]:sgpr_32 = S_MOV_B32 82
+ ; GFX908-NEXT: [[S_MOV_B32_84:%[0-9]+]]:sgpr_32 = S_MOV_B32 83
+ ; GFX908-NEXT: [[S_MOV_B32_85:%[0-9]+]]:sgpr_32 = S_MOV_B32 84
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_4]], implicit [[S_MOV_B32_5]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_6]], implicit [[S_MOV_B32_7]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_8]], implicit [[S_MOV_B32_9]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_10]], implicit [[S_MOV_B32_11]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_12]], implicit [[S_MOV_B32_13]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_14]], implicit [[S_MOV_B32_15]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_16]], implicit [[S_MOV_B32_17]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_18]], implicit [[S_MOV_B32_19]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_20]], implicit [[S_MOV_B32_21]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_22]], implicit [[S_MOV_B32_23]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_24]], implicit [[S_MOV_B32_25]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_26]], implicit [[S_MOV_B32_27]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_28]], implicit [[S_MOV_B32_29]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_30]], implicit [[S_MOV_B32_31]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_32]], implicit [[S_MOV_B32_33]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_34]], implicit [[S_MOV_B32_35]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_36]], implicit [[S_MOV_B32_37]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_38]], implicit [[S_MOV_B32_39]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_40]], implicit [[S_MOV_B32_41]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_42]], implicit [[S_MOV_B32_43]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_44]], implicit [[S_MOV_B32_45]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_46]], implicit [[S_MOV_B32_47]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_48]], implicit [[S_MOV_B32_49]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_50]], implicit [[S_MOV_B32_51]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_52]], implicit [[S_MOV_B32_53]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_54]], implicit [[S_MOV_B32_55]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_56]], implicit [[S_MOV_B32_57]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_58]], implicit [[S_MOV_B32_59]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_60]], implicit [[S_MOV_B32_61]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_62]], implicit [[S_MOV_B32_63]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_64]], implicit [[S_MOV_B32_65]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_66]], implicit [[S_MOV_B32_67]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_68]], implicit [[S_MOV_B32_69]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_70]], implicit [[S_MOV_B32_71]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_72]], implicit [[S_MOV_B32_73]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_74]], implicit [[S_MOV_B32_75]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_76]], implicit [[S_MOV_B32_77]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_78]], implicit [[S_MOV_B32_79]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_80]], implicit [[S_MOV_B32_81]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_82]], implicit [[S_MOV_B32_83]]
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_84]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_5]], implicit [[S_MOV_B32_6]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_7]], implicit [[S_MOV_B32_8]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_9]], implicit [[S_MOV_B32_10]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_11]], implicit [[S_MOV_B32_12]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_13]], implicit [[S_MOV_B32_14]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_15]], implicit [[S_MOV_B32_16]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_17]], implicit [[S_MOV_B32_18]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_19]], implicit [[S_MOV_B32_20]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_21]], implicit [[S_MOV_B32_22]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_23]], implicit [[S_MOV_B32_24]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_25]], implicit [[S_MOV_B32_26]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_27]], implicit [[S_MOV_B32_28]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_29]], implicit [[S_MOV_B32_30]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_31]], implicit [[S_MOV_B32_32]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_33]], implicit [[S_MOV_B32_34]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_35]], implicit [[S_MOV_B32_36]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_37]], implicit [[S_MOV_B32_38]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_39]], implicit [[S_MOV_B32_40]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_41]], implicit [[S_MOV_B32_42]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_43]], implicit [[S_MOV_B32_44]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_45]], implicit [[S_MOV_B32_46]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_47]], implicit [[S_MOV_B32_48]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_49]], implicit [[S_MOV_B32_50]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_51]], implicit [[S_MOV_B32_52]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_53]], implicit [[S_MOV_B32_54]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_55]], implicit [[S_MOV_B32_56]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_57]], implicit [[S_MOV_B32_58]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_59]], implicit [[S_MOV_B32_60]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_61]], implicit [[S_MOV_B32_62]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_63]], implicit [[S_MOV_B32_64]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_65]], implicit [[S_MOV_B32_66]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_67]], implicit [[S_MOV_B32_68]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_69]], implicit [[S_MOV_B32_70]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_71]], implicit [[S_MOV_B32_72]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_73]], implicit [[S_MOV_B32_74]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_75]], implicit [[S_MOV_B32_76]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_77]], implicit [[S_MOV_B32_78]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_79]], implicit [[S_MOV_B32_80]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_81]], implicit [[S_MOV_B32_82]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_83]], implicit [[S_MOV_B32_84]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_85]]
; GFX908-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_GT_U32_e64_]], implicit-def dead $scc
; GFX908-NEXT: $exec = S_MOV_B64_term [[S_AND_B64_]]
@@ -2202,8 +2202,8 @@ body: |
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
- ; GFX908-NEXT: undef %4.sub0:sreg_64 = S_ADD_I32 %4.sub0, -1, implicit-def dead $scc
- ; GFX908-NEXT: S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc
+ ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc
; GFX908-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.4:
@@ -2599,15 +2599,15 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: undef %21.sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode
- ; GFX908-NEXT: %21.sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: undef [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x80000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: %21.sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: %21.sub3:vreg_128 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: S_NOP 0, implicit %21
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub3:vreg_128 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_21]]
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.2:
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
@@ -2823,8 +2823,8 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
- ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0
- ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -2846,14 +2846,14 @@ body: |
; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000)
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
- ; GFX908-NEXT: undef %4.sub0:sreg_64 = S_ADD_I32 %4.sub0, -1, implicit-def dead $scc
- ; GFX908-NEXT: S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc
+ ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc
; GFX908-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.4:
; GFX908-NEXT: successors: %bb.1(0x80000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode
; GFX908-NEXT: S_BRANCH %bb.1
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.5:
@@ -2989,8 +2989,8 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
- ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0
- ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -3012,8 +3012,8 @@ body: |
; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000)
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
- ; GFX908-NEXT: undef %4.sub0:sreg_64 = S_ADD_I32 %4.sub0, -1, implicit-def dead $scc
- ; GFX908-NEXT: S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc
+ ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc
; GFX908-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.4:
@@ -3166,8 +3166,8 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0
- ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -3190,8 +3190,8 @@ body: |
; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000)
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
- ; GFX908-NEXT: undef %4.sub0:sreg_64 = S_ADD_I32 %4.sub0, -1, implicit-def dead $scc
- ; GFX908-NEXT: S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc
+ ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc
; GFX908-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.4:
@@ -3356,8 +3356,8 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_33:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_34:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 34, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0
- ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -3380,8 +3380,8 @@ body: |
; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000)
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
- ; GFX908-NEXT: undef %4.sub0:sreg_64 = S_ADD_I32 %4.sub0, -1, implicit-def dead $scc
- ; GFX908-NEXT: S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc
+ ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc
; GFX908-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.4:
@@ -3864,10 +3864,10 @@ body: |
; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; GFX908-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
- ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
- ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
@@ -3952,8 +3952,8 @@ body: |
; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000)
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
- ; GFX908-NEXT: undef %4.sub0:sreg_64 = S_ADD_I32 %4.sub0, -1, implicit-def dead $scc
- ; GFX908-NEXT: S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc
+ ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc
; GFX908-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.4:
@@ -4146,10 +4146,10 @@ body: |
; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; GFX908-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
- ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
- ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
@@ -4254,8 +4254,8 @@ body: |
; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000)
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
- ; GFX908-NEXT: undef %4.sub0:sreg_64 = S_ADD_I32 %4.sub0, -1, implicit-def dead $scc
- ; GFX908-NEXT: S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc
+ ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc
; GFX908-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.4:
@@ -4488,10 +4488,10 @@ body: |
; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; GFX908-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
- ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec
- ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
@@ -4640,8 +4640,8 @@ body: |
; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000)
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
- ; GFX908-NEXT: undef %4.sub0:sreg_64 = S_ADD_I32 %4.sub0, -1, implicit-def dead $scc
- ; GFX908-NEXT: S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc
+ ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc
; GFX908-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.4:
@@ -4979,15 +4979,15 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: undef %21.sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode
- ; GFX908-NEXT: %21.sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode
+ ; GFX908-NEXT: undef [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x80000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: %21.sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: %21.sub3:vreg_128 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: S_NOP 0, implicit %21
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub3:vreg_128 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_21]]
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.2:
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
@@ -5617,7 +5617,7 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: undef %23.sub1:vreg_64_align2 = V_MOV_B32_e32 23, implicit $exec
+ ; GFX908-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 23, implicit $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x80000000)
@@ -5626,7 +5626,7 @@ body: |
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.2:
- ; GFX908-NEXT: S_NOP 0, implicit %23.sub1
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]]
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
index 3bd113988af6..efa21052e3ae 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
@@ -32,12 +32,12 @@ body: |
; GFX9-NEXT: bb.1:
; GFX9-NEXT: successors: %bb.2(0x80000000)
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: %9:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: %10:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %9, 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: %12:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: %13:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %12, 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %9, %10, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %12, %13, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_2]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: bb.2:
; GFX9-NEXT: successors: %bb.3(0x80000000)
@@ -104,11 +104,11 @@ body: |
; GFX9-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
; GFX9-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]]
; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
- ; GFX9-NEXT: %9:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: %10:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %9, 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
; GFX9-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
- ; GFX9-NEXT: %12:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: %13:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %12, 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_2]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; GFX9-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
@@ -118,8 +118,8 @@ body: |
; GFX9-NEXT: bb.1:
; GFX9-NEXT: successors: %bb.2(0x80000000)
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %9, %10, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %12, %13, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: bb.2:
; GFX9-NEXT: successors: %bb.3(0x80000000)
@@ -129,7 +129,7 @@ body: |
; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: bb.3:
- ; GFX9-NEXT: [[V_ADD_F32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %13, %10, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_ADD_F32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_3]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]]
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
@@ -189,11 +189,11 @@ body: |
; GFX9-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
; GFX9-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]]
; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
- ; GFX9-NEXT: %9:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: %10:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %9, 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
; GFX9-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
- ; GFX9-NEXT: %12:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: %13:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %12, 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_2]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; GFX9-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
@@ -203,8 +203,8 @@ body: |
; GFX9-NEXT: bb.1:
; GFX9-NEXT: successors: %bb.2(0x80000000)
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %9, %10, implicit $mode, implicit $exec
- ; GFX9-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %12, %13, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: bb.2:
; GFX9-NEXT: successors: %bb.3(0x80000000)
@@ -268,8 +268,8 @@ body: |
; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
; GFX9-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
- ; GFX9-NEXT: %5:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: early-clobber %6:vgpr_32 = STRICT_WWM %5, implicit $exec
+ ; GFX9-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: early-clobber %6:vgpr_32 = STRICT_WWM [[V_FMAC_F32_e64_]], implicit $exec
; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; GFX9-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY3]](s32), [[S_MOV_B32_]], implicit $exec
@@ -282,7 +282,7 @@ body: |
; GFX9-NEXT: bb.2:
; GFX9-NEXT: successors: %bb.3(0x80000000)
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: S_NOP 0, implicit %5
+ ; GFX9-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]]
; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: bb.3:
@@ -353,9 +353,9 @@ body: |
; GFX9-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000)
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: %6:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: %8:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
- ; GFX9-NEXT: S_NOP 0, implicit %6, implicit %8
+ ; GFX9-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]], implicit [[V_FMAC_F32_e64_1]]
; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX9-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec
; GFX9-NEXT: {{ $}}
@@ -461,15 +461,15 @@ body: |
; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
; GFX9-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
- ; GFX9-NEXT: %6:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
; GFX9-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
- ; GFX9-NEXT: %8:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: bb.1:
; GFX9-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: S_NOP 0, implicit %6, implicit %8
+ ; GFX9-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]], implicit [[V_FMAC_F32_e64_1]]
; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; GFX9-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
@@ -591,9 +591,9 @@ body: |
; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
; GFX9-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]]
; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
- ; GFX9-NEXT: %6:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
; GFX9-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
- ; GFX9-NEXT: %8:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX9-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
; GFX9-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GFX9-NEXT: S_BRANCH %bb.1
; GFX9-NEXT: {{ $}}
@@ -608,7 +608,7 @@ body: |
; GFX9-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: S_NOP 0, implicit %6, implicit %8
+ ; GFX9-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]], implicit [[V_FMAC_F32_e64_1]]
; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; GFX9-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir
index 027b94e2f820..b08da2e1848f 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir
@@ -52,6 +52,7 @@ body: |
; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32))
+ ;
; GFX12-LABEL: name: merge_s_load_x1_x1_x1
; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
; GFX12-NEXT: [[S_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_LOAD_DWORDX3_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 4)
@@ -78,6 +79,7 @@ body: |
; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub1
; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY1]].sub0
; GFX11-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY1]].sub1
+ ;
; GFX12-LABEL: name: merge_s_load_x1_x1_x1_x1
; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
; GFX12-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4)
@@ -115,6 +117,7 @@ body: |
; GFX11-NEXT: [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY8]].sub1
; GFX11-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY9]].sub0
; GFX11-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY9]].sub1
+ ;
; GFX12-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1
; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
; GFX12-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4)
@@ -151,6 +154,7 @@ body: |
; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sgpr_64 = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64))
; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32))
+ ;
; GFX12-LABEL: name: merge_s_load_x2_x1
; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
; GFX12-NEXT: [[S_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_LOAD_DWORDX3_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 8)
diff --git a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir
index 7dc78101f44b..c86b5adec372 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir
@@ -32,6 +32,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_xyz_x
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -40,6 +41,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_xyz_x
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -70,6 +72,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub2_sub3
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -78,6 +81,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -108,6 +112,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub1_sub2
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_x_xy
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -116,6 +121,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_x_xy
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -146,6 +152,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_xy_x
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -154,6 +161,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_xy_x
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -185,6 +193,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_x_x
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -193,6 +202,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_x_x
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -223,6 +233,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_format_32_32_32_32
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -231,6 +242,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 126, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_format_32_32_32_32
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -273,6 +285,7 @@ body: |
; GFX9-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
; GFX9-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
; GFX9-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_float_32
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -288,6 +301,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_float_32
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -344,6 +358,7 @@ body: |
; GFX9-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
; GFX9-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
; GFX9-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_sint_32
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -359,6 +374,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_sint_32
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -415,6 +431,7 @@ body: |
; GFX9-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
; GFX9-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
; GFX9-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_uint_32
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -430,6 +447,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_uint_32
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -480,6 +498,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_not_merged_data_format_mismatch
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -495,6 +514,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_not_merged_data_format_mismatch
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -545,6 +565,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_not_merged_num_format_mismatch
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -560,6 +581,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_not_merged_num_format_mismatch
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -611,6 +633,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[REG_SEQUENCE1]], %subreg.sub1_sub2_sub3
; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_store_x_xyz
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -626,6 +649,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_store_x_xyz
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX11-NEXT: {{ $}}
@@ -675,6 +699,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1_sub2, [[COPY]], %subreg.sub3
; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_store_xyz_x
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -690,6 +715,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_store_xyz_x
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX11-NEXT: {{ $}}
@@ -740,6 +766,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; GFX9-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE3]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_store_xy_xy
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -756,6 +783,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_store_xy_xy
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX11-NEXT: {{ $}}
@@ -807,6 +835,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, %10:vreg_64, %subreg.sub1_sub2
; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_store_x_xy
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -822,6 +851,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact %10:vreg_64, [[REG_SEQUENCE]], 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_store_x_xy
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX11-NEXT: {{ $}}
@@ -872,6 +902,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_store_xy_x
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -887,6 +918,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_store_xy_x
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX11-NEXT: {{ $}}
@@ -935,6 +967,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_store_x_x
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -949,6 +982,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_store_x_x
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX11-NEXT: {{ $}}
@@ -995,6 +1029,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_store_x_x_format_32_32_32_32
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -1009,6 +1044,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 8, 126, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_store_x_x_format_32_32_32_32
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX11-NEXT: {{ $}}
@@ -1067,6 +1103,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX9-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_store_float32
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-NEXT: {{ $}}
@@ -1093,6 +1130,7 @@ body: |
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_store_float32
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX11-NEXT: {{ $}}
@@ -1175,6 +1213,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX9-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 93, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_store_sint32
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-NEXT: {{ $}}
@@ -1201,6 +1240,7 @@ body: |
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_store_sint32
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX11-NEXT: {{ $}}
@@ -1283,6 +1323,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX9-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 77, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_store_uint32
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-NEXT: {{ $}}
@@ -1309,6 +1350,7 @@ body: |
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_store_uint32
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX11-NEXT: {{ $}}
@@ -1391,6 +1433,7 @@ body: |
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_store_not_merged_data_format_mismatch
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-NEXT: {{ $}}
@@ -1417,6 +1460,7 @@ body: |
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_store_not_merged_data_format_mismatch
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX11-NEXT: {{ $}}
@@ -1499,6 +1543,7 @@ body: |
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_store_not_merged_num_format_mismatch
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-NEXT: {{ $}}
@@ -1525,6 +1570,7 @@ body: |
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_store_not_merged_num_format_mismatch
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX11-NEXT: {{ $}}
@@ -1588,6 +1634,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_not_merged_swizzled_0
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -1596,6 +1643,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_not_merged_swizzled_0
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -1625,6 +1673,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_not_merged_swizzled_1
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -1633,6 +1682,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_not_merged_swizzled_1
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -1664,6 +1714,7 @@ body: |
; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_merge_across_swizzle
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -1673,6 +1724,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_merge_across_swizzle
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -1706,6 +1758,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 6, 116, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_merge_across_swizzled_store
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -1716,6 +1769,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 6, 116, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_merge_across_swizzled_store
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -1751,6 +1805,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub0
; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub1
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_idxen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -1760,6 +1815,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_idxen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -1793,6 +1849,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 125, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub0
; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub1_sub2
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_x_xy_idxen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -1802,6 +1859,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_x_xy_idxen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -1835,6 +1893,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0_sub1
; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub2_sub3
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -1844,6 +1903,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -1877,6 +1937,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0
; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub1_sub2_sub3
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_x_xyz_idxen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -1886,6 +1947,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_x_xyz_idxen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -1919,6 +1981,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub0
; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub1
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_bothen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -1928,6 +1991,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_bothen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -1961,6 +2025,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 125, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub0
; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub1_sub2
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_x_xy_bothen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -1970,6 +2035,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_x_xy_bothen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2003,6 +2069,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0_sub1
; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub2_sub3
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2012,6 +2079,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2045,6 +2113,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0
; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1_sub2_sub3
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_x_xyz_bothen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2054,6 +2123,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_x_xyz_bothen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2087,6 +2157,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0
; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_idxen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2096,6 +2167,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_idxen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2129,6 +2201,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 125, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub0
; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub1_sub2
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_x_xy_idxen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2138,6 +2211,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_x_xy_idxen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2171,6 +2245,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub0_sub1
; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub2_sub3
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2180,6 +2255,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2213,6 +2289,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub0
; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub1_sub2_sub3
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_x_xyz_idxen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2222,6 +2299,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_x_xyz_idxen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2257,6 +2335,7 @@ body: |
; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub2
; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0
; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_x_idxen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2267,6 +2346,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_x_idxen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2303,6 +2383,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0
; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_x_idxen_exact_swizzled_0
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2313,6 +2394,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_x_idxen_exact_swizzled_0
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2348,6 +2430,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0
; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_bothen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2357,6 +2440,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_bothen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2390,6 +2474,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 125, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub0
; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub1_sub2
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_x_xy_bothen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2399,6 +2484,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_x_xy_bothen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2433,6 +2519,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub0_sub1
; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub2_sub3
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2442,6 +2529,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2476,6 +2564,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub0
; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub1_sub2_sub3
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_x_xyz_bothen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2485,6 +2574,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_x_xyz_bothen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2520,6 +2610,7 @@ body: |
; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub2
; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0
; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_x_bothen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2530,6 +2621,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_x_bothen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2566,6 +2658,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0
; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_x_bothen_exact_swizzled_0
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2576,6 +2669,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_x_bothen_exact_swizzled_0
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2611,6 +2705,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen_exact_diff_vaddr
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2621,6 +2716,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen_exact_diff_vaddr
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2657,6 +2753,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen_exact_diff_srsrc
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2668,6 +2765,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen_exact_diff_srsrc
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2705,6 +2803,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2715,6 +2814,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2751,6 +2851,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2762,6 +2863,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2801,6 +2903,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_x_xyz
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2810,6 +2913,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub1_sub2_sub3
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_x_xyz
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2839,6 +2943,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_xyz_x
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2848,6 +2953,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_xyz_x
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2877,6 +2983,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2886,6 +2993,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub2_sub3
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2915,6 +3023,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_x_xy
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2924,6 +3033,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub1_sub2
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_x_xy
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2953,6 +3063,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_xy_x
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2962,6 +3073,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_xy_x
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -2991,6 +3103,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_x_x
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -3000,6 +3113,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_x_x
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -3031,6 +3145,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 75, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_format_32_32_32_32
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -3040,6 +3155,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_format_32_32_32_32
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -3076,6 +3192,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_float_32
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -3097,6 +3214,7 @@ body: |
; GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
; GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
; GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_float_32
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -3153,6 +3271,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_sint_32
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -3174,6 +3293,7 @@ body: |
; GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
; GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
; GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_sint_32
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -3230,6 +3350,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_uint_32
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -3251,6 +3372,7 @@ body: |
; GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
; GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
; GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_uint_32
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -3307,6 +3429,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_data_format_mismatch
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -3322,6 +3445,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_not_merged_data_format_mismatch
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -3372,6 +3496,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_num_format_mismatch
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -3387,6 +3512,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_not_merged_num_format_mismatch
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -3438,6 +3564,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_store_x_xyz
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -3453,6 +3580,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[REG_SEQUENCE1]], %subreg.sub1_sub2_sub3
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_store_x_xyz
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX11-NEXT: {{ $}}
@@ -3502,6 +3630,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_store_xyz_x
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -3517,6 +3646,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1_sub2, [[COPY]], %subreg.sub3
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_store_xyz_x
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX11-NEXT: {{ $}}
@@ -3567,6 +3697,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_store_xy_xy
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -3583,6 +3714,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; GFX10-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE3]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_store_xy_xy
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX11-NEXT: {{ $}}
@@ -3634,6 +3766,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact %10:vreg_64, [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_store_x_xy
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -3649,6 +3782,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, %10:vreg_64, %subreg.sub1_sub2
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_store_x_xy
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX11-NEXT: {{ $}}
@@ -3699,6 +3833,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_store_xy_x
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -3714,6 +3849,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_store_xy_x
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX11-NEXT: {{ $}}
@@ -3762,6 +3898,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_store_x_x
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -3776,6 +3913,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_store_x_x
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX11-NEXT: {{ $}}
@@ -3822,6 +3960,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 75, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_store_x_x_format_32_32_32_32
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -3836,6 +3975,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_store_x_x_format_32_32_32_32
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX11-NEXT: {{ $}}
@@ -3894,6 +4034,7 @@ body: |
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_store_float32
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-NEXT: {{ $}}
@@ -3920,6 +4061,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX10-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_store_float32
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX11-NEXT: {{ $}}
@@ -4002,6 +4144,7 @@ body: |
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_store_sint32
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-NEXT: {{ $}}
@@ -4028,6 +4171,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX10-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 73, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_store_sint32
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX11-NEXT: {{ $}}
@@ -4110,6 +4254,7 @@ body: |
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_store_uint32
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-NEXT: {{ $}}
@@ -4136,6 +4281,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX10-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 72, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_store_uint32
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX11-NEXT: {{ $}}
@@ -4218,6 +4364,7 @@ body: |
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_store_not_merged_data_format_mismatch
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-NEXT: {{ $}}
@@ -4244,6 +4391,7 @@ body: |
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_store_not_merged_data_format_mismatch
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX11-NEXT: {{ $}}
@@ -4326,6 +4474,7 @@ body: |
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_store_not_merged_num_format_mismatch
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-NEXT: {{ $}}
@@ -4352,6 +4501,7 @@ body: |
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_store_not_merged_num_format_mismatch
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX11-NEXT: {{ $}}
@@ -4415,6 +4565,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_swizzled_0
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4423,6 +4574,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_not_merged_swizzled_0
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4452,6 +4604,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_swizzled_1
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4460,6 +4613,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_not_merged_swizzled_1
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4490,6 +4644,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_merge_across_swizzle
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4500,6 +4655,7 @@ body: |
; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_merge_across_swizzle
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4533,6 +4689,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_idxen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4543,6 +4700,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub0
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub1
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_idxen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4576,6 +4734,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_x_xy_idxen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4586,6 +4745,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 74, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub0
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub1_sub2
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_x_xy_idxen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4618,6 +4778,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4628,6 +4789,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0_sub1
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub2_sub3
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4660,6 +4822,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_x_xyz_idxen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4670,6 +4833,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub1_sub2_sub3
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_x_xyz_idxen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4703,6 +4867,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_x_idxen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4715,6 +4880,7 @@ body: |
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub2
; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0
; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_x_idxen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4752,6 +4918,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_x_idxen_exact_swizzled_0
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4763,6 +4930,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_x_idxen_exact_swizzled_0
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4798,6 +4966,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_bothen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4808,6 +4977,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub0
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub1
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_bothen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4841,6 +5011,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_x_xy_bothen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4851,6 +5022,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 74, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub0
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub1_sub2
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_x_xy_bothen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4883,6 +5055,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4893,6 +5066,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0_sub1
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub2_sub3
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4925,6 +5099,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_x_xyz_bothen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4935,6 +5110,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1_sub2_sub3
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_x_xyz_bothen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4967,6 +5143,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_idxen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -4977,6 +5154,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_idxen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5010,6 +5188,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_x_xy_idxen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5020,6 +5199,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 74, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub0
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub1_sub2
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_x_xy_idxen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5052,6 +5232,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5062,6 +5243,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub0_sub1
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub2_sub3
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5094,6 +5276,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_x_xyz_idxen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5104,6 +5287,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub0
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub1_sub2_sub3
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_x_xyz_idxen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5136,6 +5320,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_bothen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5146,6 +5331,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_bothen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5179,6 +5365,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_x_xy_bothen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5189,6 +5376,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 74, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub0
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub1_sub2
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_x_xy_bothen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5222,6 +5410,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5232,6 +5421,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub0_sub1
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub2_sub3
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5265,6 +5455,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_x_xyz_bothen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5275,6 +5466,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub0
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub1_sub2_sub3
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_x_xyz_bothen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5308,6 +5500,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_x_bothen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5320,6 +5513,7 @@ body: |
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub2
; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0
; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_x_bothen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5357,6 +5551,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_x_bothen_exact_swizzled_0
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5368,6 +5563,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_x_bothen_exact_swizzled_0
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5404,6 +5600,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact_diff_vaddr
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5414,6 +5611,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact_diff_vaddr
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5450,6 +5648,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact_diff_srsrc
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5461,6 +5660,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact_diff_srsrc
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5498,6 +5698,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5508,6 +5709,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5544,6 +5746,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5555,6 +5758,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5595,6 +5799,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_x_xyz
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5603,6 +5808,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_x_xyz
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5633,6 +5839,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_xyz_x
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5641,6 +5848,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_xyz_x
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5671,6 +5879,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5679,6 +5888,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5709,6 +5919,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_x_xy
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5717,6 +5928,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_x_xy
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5747,6 +5959,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_xy_x
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5755,6 +5968,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_xy_x
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5785,6 +5999,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_x_x
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5794,6 +6009,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_x_x
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5824,6 +6040,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 63, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_format_32_32_32_32
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5833,6 +6050,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_format_32_32_32_32
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5870,6 +6088,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_float_32
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5891,6 +6110,7 @@ body: |
; GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
; GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
; GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_float_32
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5947,6 +6167,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_sint_32
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -5968,6 +6189,7 @@ body: |
; GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
; GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
; GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_sint_32
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -6024,6 +6246,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_uint_32
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -6045,6 +6268,7 @@ body: |
; GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2
; GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
; GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_uint_32
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -6101,6 +6325,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_not_merged_data_format_mismatch
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -6116,6 +6341,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_not_merged_data_format_mismatch
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -6166,6 +6392,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_not_merged_num_format_mismatch
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -6181,6 +6408,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_not_merged_num_format_mismatch
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -6232,6 +6460,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_store_x_xyz
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -6247,6 +6476,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_store_x_xyz
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX11-NEXT: {{ $}}
@@ -6296,6 +6526,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_store_xyz_x
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -6311,6 +6542,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_store_xyz_x
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX11-NEXT: {{ $}}
@@ -6361,6 +6593,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_store_xy_xy
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -6377,6 +6610,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_store_xy_xy
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX11-NEXT: {{ $}}
@@ -6428,6 +6662,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact %10:vreg_64, [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_store_x_xy
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -6443,6 +6678,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact %10:vreg_64, [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_store_x_xy
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX11-NEXT: {{ $}}
@@ -6493,6 +6729,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_store_xy_x
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -6508,6 +6745,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_store_xy_x
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX11-NEXT: {{ $}}
@@ -6556,6 +6794,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_store_x_x
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -6570,6 +6809,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_store_x_x
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX11-NEXT: {{ $}}
@@ -6616,6 +6856,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 8, 63, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_store_x_x_format_32_32_32_32
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -6630,6 +6871,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_store_x_x_format_32_32_32_32
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX11-NEXT: {{ $}}
@@ -6688,6 +6930,7 @@ body: |
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_store_float32
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-NEXT: {{ $}}
@@ -6714,6 +6957,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX10-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_store_float32
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX11-NEXT: {{ $}}
@@ -6796,6 +7040,7 @@ body: |
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_store_sint32
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-NEXT: {{ $}}
@@ -6822,6 +7067,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX10-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 73, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_store_sint32
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX11-NEXT: {{ $}}
@@ -6904,6 +7150,7 @@ body: |
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_store_uint32
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-NEXT: {{ $}}
@@ -6930,6 +7177,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX10-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 72, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_store_uint32
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX11-NEXT: {{ $}}
@@ -7012,6 +7260,7 @@ body: |
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 30, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 32, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_store_not_merged_data_format_mismatch
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-NEXT: {{ $}}
@@ -7038,6 +7287,7 @@ body: |
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 30, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 32, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_store_not_merged_data_format_mismatch
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX11-NEXT: {{ $}}
@@ -7120,6 +7370,7 @@ body: |
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 30, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 32, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_store_not_merged_num_format_mismatch
; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-NEXT: {{ $}}
@@ -7146,6 +7397,7 @@ body: |
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 30, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 32, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_store_not_merged_num_format_mismatch
; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX11-NEXT: {{ $}}
@@ -7209,6 +7461,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_not_merged_swizzled_0
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7217,6 +7470,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_not_merged_swizzled_0
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7246,6 +7500,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_not_merged_swizzled_1
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7254,6 +7509,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_not_merged_swizzled_1
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7284,6 +7540,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_merge_across_swizzle
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7294,6 +7551,7 @@ body: |
; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_merge_across_swizzle
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7327,6 +7585,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_idxen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7337,6 +7596,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub0
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub1
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_idxen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7370,6 +7630,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_x_xy_idxen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7379,6 +7640,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_x_xy_idxen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7412,6 +7674,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7421,6 +7684,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7454,6 +7718,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_x_xyz_idxen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7463,6 +7728,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_x_xyz_idxen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7496,6 +7762,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_bothen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7506,6 +7773,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub0
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub1
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_bothen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7539,6 +7807,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_x_xy_bothen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7548,6 +7817,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_x_xy_bothen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7581,6 +7851,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7590,6 +7861,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7623,6 +7895,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_x_xyz_bothen
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7632,6 +7905,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_x_xyz_bothen
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7665,6 +7939,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_idxen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7675,6 +7950,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_idxen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7708,6 +7984,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_x_xy_idxen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7717,6 +7994,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_x_xy_idxen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7750,6 +8028,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7759,6 +8038,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7792,6 +8072,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_x_xyz_idxen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7801,6 +8082,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_x_xyz_idxen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7835,6 +8117,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_x_idxen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7847,6 +8130,7 @@ body: |
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub2
; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0
; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_x_idxen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7884,6 +8168,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_x_idxen_exact_swizzled_0
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7895,6 +8180,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_x_idxen_exact_swizzled_0
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7930,6 +8216,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_bothen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7940,6 +8227,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_bothen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7973,6 +8261,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_x_xy_bothen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -7982,6 +8271,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_x_xy_bothen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -8016,6 +8306,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -8025,6 +8316,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -8059,6 +8351,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_x_xyz_bothen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -8068,6 +8361,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_x_xyz_bothen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -8102,6 +8396,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact_diff_vaddr
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -8112,6 +8407,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact_diff_vaddr
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -8149,6 +8445,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact_diff_srsrc
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -8160,6 +8457,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact_diff_srsrc
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -8197,6 +8495,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -8207,6 +8506,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact_diff_vaddr
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -8243,6 +8543,7 @@ body: |
; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -8254,6 +8555,7 @@ body: |
; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact_diff_srsrc
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -8291,6 +8593,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_x_bothen_exact
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -8303,6 +8606,7 @@ body: |
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub2
; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0
; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_x_bothen_exact
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -8340,6 +8644,7 @@ body: |
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ;
; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_x_bothen_exact_swizzled_0
; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
@@ -8351,6 +8656,7 @@ body: |
; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0
; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1
+ ;
; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_x_bothen_exact_swizzled_0
; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/mode-register-fptrunc.mir b/llvm/test/CodeGen/AMDGPU/mode-register-fptrunc.mir
index d967d76008c9..fda33f52bf06 100644
--- a/llvm/test/CodeGen/AMDGPU/mode-register-fptrunc.mir
+++ b/llvm/test/CodeGen/AMDGPU/mode-register-fptrunc.mir
@@ -16,6 +16,7 @@ body: |
; CHECK-NEXT: S_SETREG_IMM32_B32 1, 129, implicit-def $mode, implicit $mode
; CHECK-NEXT: $vgpr1 = V_CVT_F16_F32_e32 $vgpr0, implicit $mode, implicit $exec
; CHECK-NEXT: S_ENDPGM 0
+ ;
; GFX11-LABEL: name: ftrunc_upward
; GFX11: liveins: $sgpr0
; GFX11-NEXT: {{ $}}
@@ -40,6 +41,7 @@ body: |
; CHECK-NEXT: S_SETREG_IMM32_B32 1, 193, implicit-def $mode, implicit $mode
; CHECK-NEXT: $vgpr0 = V_CVT_F16_F32_e32 $vgpr1, implicit $mode, implicit $exec
; CHECK-NEXT: S_ENDPGM 0
+ ;
; GFX11-LABEL: name: ftrunc_downward
; GFX11: liveins: $sgpr0
; GFX11-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-lshlrev.mir b/llvm/test/CodeGen/AMDGPU/move-to-valu-lshlrev.mir
index 5c45cca40b0e..d19318ceb55c 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-lshlrev.mir
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-lshlrev.mir
@@ -14,6 +14,7 @@ body: |
; GFX8-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; GFX8-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX8-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64_e64 [[DEF4]], [[V_LSHL_ADD_U64_e64_]], implicit $exec
+ ;
; GFX12-LABEL: name: lshlrev_b64
; GFX12: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; GFX12-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll
index 5385d63ece45..003c3ea7fce1 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll
@@ -67,6 +67,7 @@ define amdgpu_ps float @vimage_move_to_valu(<8 x i32> %rsrc) {
; GFX11-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_1]]
; GFX11-NEXT: $vgpr0 = COPY [[IMAGE_LOAD_V1_V2_gfx11_]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0
+ ;
; GFX12-LABEL: name: vimage_move_to_valu
; GFX12: bb.0.bb:
; GFX12-NEXT: successors: %bb.1(0x80000000)
@@ -200,6 +201,7 @@ define amdgpu_ps float @vsample_move_to_valu_rsrc(<8 x i32> %rsrc, <4 x i32> inr
; GFX11-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
; GFX11-NEXT: $vgpr0 = COPY [[IMAGE_SAMPLE_V1_V1_gfx11_]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0
+ ;
; GFX12-LABEL: name: vsample_move_to_valu_rsrc
; GFX12: bb.0.main_body:
; GFX12-NEXT: successors: %bb.1(0x80000000)
@@ -324,6 +326,7 @@ define amdgpu_ps float @vsample_move_to_valu_samp(<8 x i32> inreg %rsrc, <4 x i3
; GFX11-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
; GFX11-NEXT: $vgpr0 = COPY [[IMAGE_SAMPLE_V1_V1_gfx11_]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0
+ ;
; GFX12-LABEL: name: vsample_move_to_valu_samp
; GFX12: bb.0.main_body:
; GFX12-NEXT: successors: %bb.1(0x80000000)
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
index c059b6daf0d5..ece2e1b653d3 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
@@ -70,6 +70,7 @@ body: |
; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
; W64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_IDXEN]]
; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; W32-LABEL: name: idxen
; W32: successors: %bb.1(0x80000000)
; W32-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
@@ -184,6 +185,7 @@ body: |
; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
; W64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]]
; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; W32-LABEL: name: offen
; W32: successors: %bb.1(0x80000000)
; W32-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
@@ -298,6 +300,7 @@ body: |
; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
; W64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]]
; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; W32-LABEL: name: bothen
; W32: successors: %bb.1(0x80000000)
; W32-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
@@ -394,6 +397,7 @@ body: |
; ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
; ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]]
; ADDR64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; W32-LABEL: name: addr64
; W32: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
; W32-NEXT: {{ $}}
@@ -471,6 +475,7 @@ body: |
; ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
; ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]]
; ADDR64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; W64-NO-ADDR64-LABEL: name: offset
; W64-NO-ADDR64: successors: %bb.1(0x80000000)
; W64-NO-ADDR64-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
@@ -515,6 +520,7 @@ body: |
; W64-NO-ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
; W64-NO-ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFSET]]
; W64-NO-ADDR64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; W32-LABEL: name: offset
; W32: successors: %bb.1(0x80000000)
; W32-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
diff --git a/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir b/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir
index 091cf2ba44b5..3de258bb52a5 100644
--- a/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir
+++ b/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir
@@ -12,17 +12,21 @@ body: |
; gfx908-DEFAULT-LABEL: name: mfma_padding_2_pass
; gfx908-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD25-LABEL: name: mfma_padding_2_pass
; gfx908-PAD25: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD25-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD50-LABEL: name: mfma_padding_2_pass
; gfx908-PAD50: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD50-NEXT: S_NOP 0
; gfx908-PAD50-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD75-LABEL: name: mfma_padding_2_pass
; gfx908-PAD75: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD75-NEXT: S_NOP 0
; gfx908-PAD75-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD100-LABEL: name: mfma_padding_2_pass
; gfx908-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD100-NEXT: S_NOP 1
@@ -39,18 +43,22 @@ body: |
; gfx908-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; gfx908-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD25-LABEL: name: mfma_padding_2_pass_1_intervening_valu
; gfx908-PAD25: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD25-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD25-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD50-LABEL: name: mfma_padding_2_pass_1_intervening_valu
; gfx908-PAD50: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD50-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD75-LABEL: name: mfma_padding_2_pass_1_intervening_valu
; gfx908-PAD75: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD75-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD75-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD100-LABEL: name: mfma_padding_2_pass_1_intervening_valu
; gfx908-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
@@ -69,20 +77,24 @@ body: |
; gfx908-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-DEFAULT-NEXT: DBG_VALUE
; gfx908-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD25-LABEL: name: mfma_padding_2_pass_dbg
; gfx908-PAD25: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD25-NEXT: DBG_VALUE
; gfx908-PAD25-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD50-LABEL: name: mfma_padding_2_pass_dbg
; gfx908-PAD50: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD50-NEXT: DBG_VALUE
; gfx908-PAD50-NEXT: S_NOP 0
; gfx908-PAD50-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD75-LABEL: name: mfma_padding_2_pass_dbg
; gfx908-PAD75: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD75-NEXT: DBG_VALUE
; gfx908-PAD75-NEXT: S_NOP 0
; gfx908-PAD75-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD100-LABEL: name: mfma_padding_2_pass_dbg
; gfx908-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD100-NEXT: DBG_VALUE
@@ -100,18 +112,22 @@ body: |
; gfx908-DEFAULT-LABEL: name: mfma_padding_8_pass
; gfx908-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD25-LABEL: name: mfma_padding_8_pass
; gfx908-PAD25: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD25-NEXT: S_NOP 1
; gfx908-PAD25-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD50-LABEL: name: mfma_padding_8_pass
; gfx908-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD50-NEXT: S_NOP 3
; gfx908-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD75-LABEL: name: mfma_padding_8_pass
; gfx908-PAD75: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD75-NEXT: S_NOP 5
; gfx908-PAD75-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD100-LABEL: name: mfma_padding_8_pass
; gfx908-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD100-NEXT: S_NOP 7
@@ -129,23 +145,27 @@ body: |
; gfx908-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; gfx908-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; gfx908-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD25-LABEL: name: mfma_padding_8_pass_2_intervening_valu
; gfx908-PAD25: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD25-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD25-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD25-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD50-LABEL: name: mfma_padding_8_pass_2_intervening_valu
; gfx908-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD50-NEXT: S_NOP 1
; gfx908-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD75-LABEL: name: mfma_padding_8_pass_2_intervening_valu
; gfx908-PAD75: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD75-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD75-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD75-NEXT: S_NOP 3
; gfx908-PAD75-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD100-LABEL: name: mfma_padding_8_pass_2_intervening_valu
; gfx908-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
@@ -165,19 +185,23 @@ body: |
; gfx908-DEFAULT-LABEL: name: mfma_padding_16_pass
; gfx908-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD25-LABEL: name: mfma_padding_16_pass
; gfx908-PAD25: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD25-NEXT: S_NOP 3
; gfx908-PAD25-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD50-LABEL: name: mfma_padding_16_pass
; gfx908-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD50-NEXT: S_NOP 7
; gfx908-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD75-LABEL: name: mfma_padding_16_pass
; gfx908-PAD75: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD75-NEXT: S_NOP 7
; gfx908-PAD75-NEXT: S_NOP 3
; gfx908-PAD75-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD100-LABEL: name: mfma_padding_16_pass
; gfx908-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD100-NEXT: S_NOP 7
@@ -198,6 +222,7 @@ body: |
; gfx908-DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; gfx908-DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; gfx908-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD25-LABEL: name: mfma_padding_16_pass_4_intervening_valu
; gfx908-PAD25: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD25-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
@@ -205,6 +230,7 @@ body: |
; gfx908-PAD25-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD25-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD25-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD50-LABEL: name: mfma_padding_16_pass_4_intervening_valu
; gfx908-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
@@ -213,6 +239,7 @@ body: |
; gfx908-PAD50-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD50-NEXT: S_NOP 3
; gfx908-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD75-LABEL: name: mfma_padding_16_pass_4_intervening_valu
; gfx908-PAD75: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD75-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
@@ -221,6 +248,7 @@ body: |
; gfx908-PAD75-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD75-NEXT: S_NOP 7
; gfx908-PAD75-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD100-LABEL: name: mfma_padding_16_pass_4_intervening_valu
; gfx908-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
@@ -261,6 +289,7 @@ body: |
; gfx908-DEFAULT-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec
; gfx908-DEFAULT-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec
; gfx908-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD25-LABEL: name: mfma_padding_16_pass_16_intervening_valu
; gfx908-PAD25: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD25-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
@@ -280,6 +309,7 @@ body: |
; gfx908-PAD25-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD25-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD25-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD50-LABEL: name: mfma_padding_16_pass_16_intervening_valu
; gfx908-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
@@ -299,6 +329,7 @@ body: |
; gfx908-PAD50-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD50-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD75-LABEL: name: mfma_padding_16_pass_16_intervening_valu
; gfx908-PAD75: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD75-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
@@ -318,6 +349,7 @@ body: |
; gfx908-PAD75-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD75-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD75-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD100-LABEL: name: mfma_padding_16_pass_16_intervening_valu
; gfx908-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
@@ -366,15 +398,19 @@ body: |
; gfx908-DEFAULT-LABEL: name: mfma_padding_16_pass_occ_1
; gfx908-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD25-LABEL: name: mfma_padding_16_pass_occ_1
; gfx908-PAD25: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD25-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD50-LABEL: name: mfma_padding_16_pass_occ_1
; gfx908-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD75-LABEL: name: mfma_padding_16_pass_occ_1
; gfx908-PAD75: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD75-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD100-LABEL: name: mfma_padding_16_pass_occ_1
; gfx908-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
@@ -400,6 +436,7 @@ body: |
; gfx908-DEFAULT-NEXT: bb.2:
; gfx908-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; gfx908-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD25-LABEL: name: mfma_padding_16_pass_2_preds
; gfx908-PAD25: bb.0:
; gfx908-PAD25-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -416,6 +453,7 @@ body: |
; gfx908-PAD25-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD25-NEXT: S_NOP 1
; gfx908-PAD25-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD50-LABEL: name: mfma_padding_16_pass_2_preds
; gfx908-PAD50: bb.0:
; gfx908-PAD50-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -432,6 +470,7 @@ body: |
; gfx908-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD50-NEXT: S_NOP 5
; gfx908-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD75-LABEL: name: mfma_padding_16_pass_2_preds
; gfx908-PAD75: bb.0:
; gfx908-PAD75-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -449,6 +488,7 @@ body: |
; gfx908-PAD75-NEXT: S_NOP 7
; gfx908-PAD75-NEXT: S_NOP 1
; gfx908-PAD75-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
; gfx908-PAD100-LABEL: name: mfma_padding_16_pass_2_preds
; gfx908-PAD100: bb.0:
; gfx908-PAD100-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
diff --git a/llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir b/llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir
index 64bba249db07..b501fd037574 100644
--- a/llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir
+++ b/llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir
@@ -34,22 +34,22 @@ body: |
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr16
- ; GFX9-NEXT: undef %18.sub15:vreg_512 = COPY $vgpr15
- ; GFX9-NEXT: %18.sub14:vreg_512 = COPY $vgpr14
- ; GFX9-NEXT: %18.sub13:vreg_512 = COPY $vgpr13
- ; GFX9-NEXT: %18.sub12:vreg_512 = COPY $vgpr12
- ; GFX9-NEXT: %18.sub11:vreg_512 = COPY $vgpr11
- ; GFX9-NEXT: %18.sub10:vreg_512 = COPY $vgpr10
- ; GFX9-NEXT: %18.sub9:vreg_512 = COPY $vgpr9
- ; GFX9-NEXT: %18.sub8:vreg_512 = COPY $vgpr8
- ; GFX9-NEXT: %18.sub7:vreg_512 = COPY $vgpr7
- ; GFX9-NEXT: %18.sub6:vreg_512 = COPY $vgpr6
- ; GFX9-NEXT: %18.sub5:vreg_512 = COPY $vgpr5
- ; GFX9-NEXT: %18.sub4:vreg_512 = COPY $vgpr4
- ; GFX9-NEXT: %18.sub3:vreg_512 = COPY $vgpr3
- ; GFX9-NEXT: %18.sub2:vreg_512 = COPY $vgpr2
- ; GFX9-NEXT: %18.sub1:vreg_512 = COPY $vgpr1
- ; GFX9-NEXT: %18.sub0:vreg_512 = COPY $vgpr0
+ ; GFX9-NEXT: undef [[COPY2:%[0-9]+]].sub15:vreg_512 = COPY $vgpr15
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub14:vreg_512 = COPY $vgpr14
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub13:vreg_512 = COPY $vgpr13
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub12:vreg_512 = COPY $vgpr12
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub11:vreg_512 = COPY $vgpr11
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub10:vreg_512 = COPY $vgpr10
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub9:vreg_512 = COPY $vgpr9
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub8:vreg_512 = COPY $vgpr8
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub7:vreg_512 = COPY $vgpr7
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub6:vreg_512 = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub5:vreg_512 = COPY $vgpr5
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub4:vreg_512 = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub3:vreg_512 = COPY $vgpr3
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub2:vreg_512 = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub1:vreg_512 = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub0:vreg_512 = COPY $vgpr0
; GFX9-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 15, [[COPY1]], implicit $exec
; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX9-NEXT: {{ $}}
@@ -60,7 +60,7 @@ body: |
; GFX9-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[V_AND_B32_e32_]], implicit $exec
; GFX9-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def dead $scc, implicit $exec
; GFX9-NEXT: S_SET_GPR_IDX_ON [[V_READFIRSTLANE_B32_]], 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 undef %18.sub0, implicit $exec, implicit %18, implicit $m0
+ ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 undef [[COPY2]].sub0, implicit $exec, implicit [[COPY2]], implicit $m0
; GFX9-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
; GFX9-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def dead $scc
; GFX9-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/omod.ll b/llvm/test/CodeGen/AMDGPU/omod.ll
index fa1ca66ef415..769d035858ca 100644
--- a/llvm/test/CodeGen/AMDGPU/omod.ll
+++ b/llvm/test/CodeGen/AMDGPU/omod.ll
@@ -651,8 +651,8 @@ define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 {
; GFX12-NEXT: v_add_f32_e32 v0, 1.0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mul_f32_e32 v1, 4.0, v0
-; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b32 v[0:1], v1, off
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/opt-exec-masking-pre-ra-update-liveness-wave32.mir b/llvm/test/CodeGen/AMDGPU/opt-exec-masking-pre-ra-update-liveness-wave32.mir
index be1e36c05a30..564c018e9a95 100644
--- a/llvm/test/CodeGen/AMDGPU/opt-exec-masking-pre-ra-update-liveness-wave32.mir
+++ b/llvm/test/CodeGen/AMDGPU/opt-exec-masking-pre-ra-update-liveness-wave32.mir
@@ -19,13 +19,13 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (s128), align 8, addrspace 1)
- ; CHECK-NEXT: undef %2.sub1:sgpr_128 = S_MOV_B32 -1
- ; CHECK-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, undef %2.sub0, implicit-def dead $scc
- ; CHECK-NEXT: %2.sub1:sgpr_128 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+ ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 -1
+ ; CHECK-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, undef [[S_MOV_B32_]].sub0, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: S_NOP 0, implicit %2.sub1
+ ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]].sub1
bb.0:
liveins: $sgpr0_sgpr1
%0:sgpr_64 = COPY $sgpr0_sgpr1
@@ -52,9 +52,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (s128), align 8, addrspace 1)
- ; CHECK-NEXT: undef %2.sub0:sreg_64_xexec = S_MOV_B32 -1
- ; CHECK-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, %2.sub0, implicit-def dead $scc
- ; CHECK-NEXT: dead %2.sub1:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+ ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64_xexec = S_MOV_B32 -1
+ ; CHECK-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, [[S_MOV_B32_]].sub0, implicit-def dead $scc
+ ; CHECK-NEXT: dead [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
@@ -83,13 +83,13 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (s128), align 8, addrspace 1)
- ; CHECK-NEXT: undef %2.sub0:sreg_64_xexec = S_MOV_B32 -1
- ; CHECK-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, %2.sub0, implicit-def dead $scc
- ; CHECK-NEXT: %2.sub1:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+ ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64_xexec = S_MOV_B32 -1
+ ; CHECK-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, [[S_MOV_B32_]].sub0, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: S_NOP 0, implicit %2.sub1
+ ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]].sub1
bb.0:
liveins: $sgpr0_sgpr1
%0:sgpr_64 = COPY $sgpr0_sgpr1
@@ -116,13 +116,13 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (s128), align 8, addrspace 1)
- ; CHECK-NEXT: undef %2.sub0:sreg_64_xexec = S_MOV_B32 -1
- ; CHECK-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, %2.sub0, implicit-def dead $scc
- ; CHECK-NEXT: %2.sub1:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+ ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64_xexec = S_MOV_B32 -1
+ ; CHECK-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, [[S_MOV_B32_]].sub0, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: S_NOP 0, implicit %2
+ ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]]
bb.0:
liveins: $sgpr0_sgpr1
%0:sgpr_64 = COPY $sgpr0_sgpr1
@@ -149,13 +149,13 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (s128), align 8, addrspace 1)
- ; CHECK-NEXT: undef %2.sub0:sreg_64_xexec = S_MOV_B32 -1
- ; CHECK-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, %2.sub0, implicit-def dead $scc
- ; CHECK-NEXT: %2.sub1:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+ ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64_xexec = S_MOV_B32 -1
+ ; CHECK-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, [[S_MOV_B32_]].sub0, implicit-def dead $scc
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: S_NOP 0, implicit %2.sub0
+ ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]].sub0
bb.0:
liveins: $sgpr0_sgpr1
%0:sgpr_64 = COPY $sgpr0_sgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/opt-exec-masking-pre-ra-update-liveness.mir b/llvm/test/CodeGen/AMDGPU/opt-exec-masking-pre-ra-update-liveness.mir
index 897370b3d9b5..f224c91bad8c 100644
--- a/llvm/test/CodeGen/AMDGPU/opt-exec-masking-pre-ra-update-liveness.mir
+++ b/llvm/test/CodeGen/AMDGPU/opt-exec-masking-pre-ra-update-liveness.mir
@@ -272,7 +272,7 @@ body: |
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.3(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: dead %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[DEF]], implicit $exec
+ ; CHECK-NEXT: dead [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[DEF]], implicit $exec
; CHECK-NEXT: $vcc = S_ANDN2_B64 $exec, [[DEF]], implicit-def $scc
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.4, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.3
@@ -280,7 +280,7 @@ body: |
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.4(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: S_NOP 0, implicit undef %1
+ ; CHECK-NEXT: S_NOP 0, implicit undef [[V_CNDMASK_B32_e64_]]
; CHECK-NEXT: S_BRANCH %bb.4
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4:
@@ -314,7 +314,7 @@ body: |
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: dead %0:sreg_64_xexec = S_MOV_B64 0
+ ; CHECK-NEXT: dead [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit undef $vcc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
@@ -326,7 +326,7 @@ body: |
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.3(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $vcc = S_ANDN2_B64 $exec, undef %0, implicit-def $scc
+ ; CHECK-NEXT: $vcc = S_ANDN2_B64 $exec, undef [[S_MOV_B64_]], implicit-def $scc
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.4, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.3
; CHECK-NEXT: {{ $}}
@@ -374,7 +374,7 @@ body: |
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_]], -1, implicit-def $scc
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = S_ADD_I32 [[DEF]], -1, implicit-def $scc
; CHECK-NEXT: S_CBRANCH_SCC0 %bb.4, implicit $scc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
@@ -440,7 +440,7 @@ body: |
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.5(0x40000000), %bb.4(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_]], -1, implicit-def $scc
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = S_ADD_I32 [[DEF]], -1, implicit-def $scc
; CHECK-NEXT: S_CBRANCH_SCC0 %bb.5, implicit $scc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4:
@@ -645,9 +645,9 @@ body: |
; CHECK-NEXT: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr14, $sgpr15, $sgpr16
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
- ; CHECK-NEXT: undef %1.sub1:vreg_64 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[DEF]], implicit $exec
- ; CHECK-NEXT: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 0, %1.sub1, implicit $exec
- ; CHECK-NEXT: %1.sub0:vreg_64 = V_MOV_B32_e32 123, implicit $exec
+ ; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub1:vreg_64 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[DEF]], implicit $exec
+ ; CHECK-NEXT: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 0, [[V_CNDMASK_B32_e64_]].sub1, implicit $exec
+ ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 123, implicit $exec
; CHECK-NEXT: $exec = S_MOV_B64_term [[V_CMP_GT_I32_e64_]]
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; CHECK-NEXT: {{ $}}
@@ -660,7 +660,7 @@ body: |
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: S_NOP 0, implicit %1.sub0
+ ; CHECK-NEXT: S_NOP 0, implicit [[V_CNDMASK_B32_e64_]].sub0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-exec-mask-pre-ra-loop-phi.mir b/llvm/test/CodeGen/AMDGPU/optimize-exec-mask-pre-ra-loop-phi.mir
index 84f9871527da..c4d0583a3317 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-exec-mask-pre-ra-loop-phi.mir
+++ b/llvm/test/CodeGen/AMDGPU/optimize-exec-mask-pre-ra-loop-phi.mir
@@ -12,7 +12,7 @@ body: |
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
- ; CHECK-NEXT: undef %1.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
@@ -23,10 +23,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_MOV_B64_]], implicit $exec
; CHECK-NEXT: V_CMP_NE_U32_e32 1, [[V_CNDMASK_B32_e64_]], implicit-def $vcc, implicit $exec
- ; CHECK-NEXT: %1.sub1:vreg_64 = COPY %1.sub0
- ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %3:vgpr_32, %1, 0, 0, implicit $exec :: (store (s64), addrspace 3)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_]].sub0
+ ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %3:vgpr_32, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3)
; CHECK-NEXT: ATOMIC_FENCE 4, 2
- ; CHECK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
+ ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
; CHECK-NEXT: $vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.2
@@ -65,7 +65,7 @@ body: |
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
- ; CHECK-NEXT: undef %1.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
@@ -74,11 +74,11 @@ body: |
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: %1.sub1:vreg_64 = COPY %1.sub0
- ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %3:vgpr_32, %1, 0, 0, implicit $exec :: (store (s64), addrspace 3)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_]].sub0
+ ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %3:vgpr_32, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3)
; CHECK-NEXT: ATOMIC_FENCE 4, 2
; CHECK-NEXT: $vcc = S_ANDN2_B64 $exec, [[S_MOV_B64_]], implicit-def dead $scc
- ; CHECK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
+ ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.2
bb.0:
@@ -115,7 +115,7 @@ body: |
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $sgpr4_sgpr5 = S_MOV_B64 -1
- ; CHECK-NEXT: undef %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
@@ -127,8 +127,8 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr4_sgpr5, implicit $exec
; CHECK-NEXT: V_CMP_NE_U32_e32 1, [[V_CNDMASK_B32_e64_]], implicit-def $vcc, implicit $exec
- ; CHECK-NEXT: %0.sub1:vreg_64 = COPY %0.sub0
- ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %2:vgpr_32, %0, 0, 0, implicit $exec :: (store (s64), addrspace 3)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_]].sub0
+ ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %2:vgpr_32, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3)
; CHECK-NEXT: ATOMIC_FENCE 4, 2
; CHECK-NEXT: $sgpr4_sgpr5 = S_MOV_B64 0
; CHECK-NEXT: $vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc
@@ -169,7 +169,7 @@ body: |
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $sgpr4_sgpr5 = S_MOV_B64 -1
- ; CHECK-NEXT: undef %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
@@ -179,8 +179,8 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: liveins: $sgpr4_sgpr5
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: %0.sub1:vreg_64 = COPY %0.sub0
- ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %2:vgpr_32, %0, 0, 0, implicit $exec :: (store (s64), addrspace 3)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_]].sub0
+ ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %2:vgpr_32, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3)
; CHECK-NEXT: ATOMIC_FENCE 4, 2
; CHECK-NEXT: $vcc = S_ANDN2_B64 $exec, $sgpr4_sgpr5, implicit-def dead $scc
; CHECK-NEXT: $sgpr4_sgpr5 = S_MOV_B64 0
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir b/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir
index 6170fe63f765..6be3e592eee4 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir
+++ b/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir
@@ -139,8 +139,8 @@ body: |
; GCN: bb.0:
; GCN-NEXT: successors: %bb.1(0x80000000)
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: dead undef %0.sub0:sgpr_256 = COPY $exec
- ; GCN-NEXT: dead %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, undef %2:sreg_64_xexec, implicit $exec
+ ; GCN-NEXT: dead undef [[COPY:%[0-9]+]].sub0:sgpr_256 = COPY $exec
+ ; GCN-NEXT: dead [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, undef %2:sreg_64_xexec, implicit $exec
; GCN-NEXT: S_BRANCH %bb.1
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/partial-forwarding-hazards.mir b/llvm/test/CodeGen/AMDGPU/partial-forwarding-hazards.mir
index d9333edb03a9..5659e70cb7db 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-forwarding-hazards.mir
+++ b/llvm/test/CodeGen/AMDGPU/partial-forwarding-hazards.mir
@@ -13,6 +13,7 @@ body: |
; GFX11-NEXT: S_WAITCNT_DEPCTR 4095
; GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: partial_forwarding_1_hazard
; GFX12: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX12-NEXT: $exec = S_MOV_B64 -1
@@ -48,6 +49,7 @@ body: |
; GFX11-NEXT: S_WAITCNT_DEPCTR 4095
; GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: partial_forwarding_2_hazard
; GFX12: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX12-NEXT: $sgpr0 = S_MOV_B32 0
@@ -100,6 +102,7 @@ body: |
; GFX11-NEXT: S_WAITCNT_DEPCTR 4095
; GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: partial_forwarding_3_hazard
; GFX12: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX12-NEXT: $vgpr10 = V_MOV_B32_e32 0, implicit $exec
@@ -235,6 +238,7 @@ body: |
; GFX11-NEXT: S_WAITCNT_DEPCTR 4095
; GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: partial_forwarding_4_hazard
; GFX12: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX12-NEXT: $exec = S_MOV_B64 -1
@@ -308,6 +312,7 @@ body: |
; GFX11-NEXT: S_WAITCNT_DEPCTR 4095
; GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: partial_forwarding_5_hazard
; GFX12: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX12-NEXT: $vgpr10 = V_MOV_B32_e32 0, implicit $exec
@@ -394,6 +399,7 @@ body: |
; GFX11-NEXT: S_WAITCNT_DEPCTR 4095
; GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: partial_forwarding_branching_1a
; GFX12: bb.0:
; GFX12-NEXT: successors: %bb.2(0x80000000)
@@ -471,6 +477,7 @@ body: |
; GFX11-NEXT: S_WAITCNT_DEPCTR 4095
; GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: partial_forwarding_branching_1b
; GFX12: bb.0:
; GFX12-NEXT: successors: %bb.2(0x80000000)
diff --git a/llvm/test/CodeGen/AMDGPU/pei-build-av-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-build-av-spill.mir
index a9580da5af1d..9ce2464e0968 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-build-av-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-build-av-spill.mir
@@ -24,6 +24,7 @@ body: |
; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_v1
; MUBUF-V2A: liveins: $agpr0
; MUBUF-V2A-NEXT: {{ $}}
@@ -31,11 +32,13 @@ body: |
; MUBUF-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; MUBUF-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_v1
; FLATSCR: $vgpr0 = IMPLICIT_DEF
; FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_v1
; FLATSCR-V2A: liveins: $agpr0
; FLATSCR-V2A-NEXT: {{ $}}
@@ -43,11 +46,13 @@ body: |
; FLATSCR-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_v1
; MUBUF-GFX90A: $vgpr0 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
; MUBUF-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_v1
; MUBUF-GFX90A-V2A: liveins: $agpr0
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -55,11 +60,13 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_v1
; FLATSCR-GFX90A: $vgpr0 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; FLATSCR-GFX90A-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_v1
; FLATSCR-GFX90A-V2A: liveins: $agpr0
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -92,6 +99,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
; MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0 + 4, addrspace 5)
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_v2
; MUBUF-V2A: liveins: $agpr0, $agpr1
; MUBUF-V2A-NEXT: {{ $}}
@@ -101,11 +109,13 @@ body: |
; MUBUF-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1
; MUBUF-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_v2
; FLATSCR: $vgpr0_vgpr1 = IMPLICIT_DEF
; FLATSCR-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %stack.0, align 4, addrspace 5)
; FLATSCR-NEXT: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from %stack.0, align 4, addrspace 5)
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_v2
; FLATSCR-V2A: liveins: $agpr0, $agpr1
; FLATSCR-V2A-NEXT: {{ $}}
@@ -115,6 +125,7 @@ body: |
; FLATSCR-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1
; FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $vgpr0_vgpr1
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_v2
; MUBUF-GFX90A: $vgpr0_vgpr1 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
@@ -122,6 +133,7 @@ body: |
; MUBUF-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
; MUBUF-GFX90A-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0 + 4, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_v2
; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -131,11 +143,13 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1
; MUBUF-GFX90A-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_v2
; FLATSCR-GFX90A: $vgpr0_vgpr1 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_v2
; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -172,6 +186,7 @@ body: |
; MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 4, addrspace 5)
; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load (s32) from %stack.0 + 8, addrspace 5)
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_v3
; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2
; MUBUF-V2A-NEXT: {{ $}}
@@ -183,11 +198,13 @@ body: |
; MUBUF-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_v3
; FLATSCR: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF
; FLATSCR-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr0_vgpr1_vgpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into %stack.0, align 4, addrspace 5)
; FLATSCR-NEXT: $vgpr0_vgpr1_vgpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s96) from %stack.0, align 4, addrspace 5)
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_v3
; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2
; FLATSCR-V2A-NEXT: {{ $}}
@@ -199,6 +216,7 @@ body: |
; FLATSCR-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2
; FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_v3
; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr0_vgpr1_vgpr2 :: (store (s32) into %stack.0, addrspace 5)
@@ -208,6 +226,7 @@ body: |
; MUBUF-GFX90A-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 4, addrspace 5)
; MUBUF-GFX90A-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load (s32) from %stack.0 + 8, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_v3
; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -219,11 +238,13 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_v3
; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr0_vgpr1_vgpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1_vgpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s96) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_v3
; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -264,6 +285,7 @@ body: |
; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 8, addrspace 5)
; MUBUF-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s32) from %stack.0 + 12, addrspace 5)
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_v4
; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3
; MUBUF-V2A-NEXT: {{ $}}
@@ -277,11 +299,13 @@ body: |
; MUBUF-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_v4
; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into %stack.0, align 4, addrspace 5)
; FLATSCR-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_v4
; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3
; FLATSCR-V2A-NEXT: {{ $}}
@@ -295,6 +319,7 @@ body: |
; FLATSCR-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_v4
; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5)
@@ -306,6 +331,7 @@ body: |
; MUBUF-GFX90A-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 8, addrspace 5)
; MUBUF-GFX90A-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s32) from %stack.0 + 12, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_v4
; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -319,11 +345,13 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_v4
; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_v4
; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -368,6 +396,7 @@ body: |
; MUBUF-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 12, addrspace 5)
; MUBUF-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s32) from %stack.0 + 16, addrspace 5)
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_v5
; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4
; MUBUF-V2A-NEXT: {{ $}}
@@ -383,6 +412,7 @@ body: |
; MUBUF-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_v5
; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF
; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -390,6 +420,7 @@ body: |
; FLATSCR-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s32) from %stack.0 + 16, addrspace 5)
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_v5
; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4
; FLATSCR-V2A-NEXT: {{ $}}
@@ -405,6 +436,7 @@ body: |
; FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
; FLATSCR-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_v5
; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store (s32) into %stack.0, addrspace 5)
@@ -418,6 +450,7 @@ body: |
; MUBUF-GFX90A-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 12, addrspace 5)
; MUBUF-GFX90A-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s32) from %stack.0 + 16, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_v5
; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -433,6 +466,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_v5
; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -440,6 +474,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s32) from %stack.0 + 16, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_v5
; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -488,6 +523,7 @@ body: |
; MUBUF-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 16, addrspace 5)
; MUBUF-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s32) from %stack.0 + 20, addrspace 5)
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_v6
; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5
; MUBUF-V2A-NEXT: {{ $}}
@@ -505,6 +541,7 @@ body: |
; MUBUF-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_v6
; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF
; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -512,6 +549,7 @@ body: |
; FLATSCR-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-NEXT: $vgpr4_vgpr5 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s64) from %stack.0 + 16, align 4, addrspace 5)
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_v6
; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5
; FLATSCR-V2A-NEXT: {{ $}}
@@ -529,6 +567,7 @@ body: |
; FLATSCR-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
; FLATSCR-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_v6
; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store (s32) into %stack.0, addrspace 5)
@@ -544,6 +583,7 @@ body: |
; MUBUF-GFX90A-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 16, addrspace 5)
; MUBUF-GFX90A-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s32) from %stack.0 + 20, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_v6
; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -561,6 +601,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_v6
; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -568,6 +609,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $vgpr4_vgpr5 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s64) from %stack.0 + 16, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_v6
; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -620,6 +662,7 @@ body: |
; MUBUF-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 20, addrspace 5)
; MUBUF-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 :: (load (s32) from %stack.0 + 24, addrspace 5)
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_v7
; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6
; MUBUF-V2A-NEXT: {{ $}}
@@ -639,6 +682,7 @@ body: |
; MUBUF-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-V2A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_v7
; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 = IMPLICIT_DEF
; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -646,6 +690,7 @@ body: |
; FLATSCR-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-NEXT: $vgpr4_vgpr5_vgpr6 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 :: (load (s96) from %stack.0 + 16, align 4, addrspace 5)
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_v7
; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6
; FLATSCR-V2A-NEXT: {{ $}}
@@ -665,6 +710,7 @@ body: |
; FLATSCR-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6
; FLATSCR-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_v7
; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 :: (store (s32) into %stack.0, addrspace 5)
@@ -682,6 +728,7 @@ body: |
; MUBUF-GFX90A-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 20, addrspace 5)
; MUBUF-GFX90A-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 :: (load (s32) from %stack.0 + 24, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_v7
; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -701,6 +748,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_v7
; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -708,6 +756,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $vgpr4_vgpr5_vgpr6 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 :: (load (s96) from %stack.0 + 16, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_v7
; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -764,6 +813,7 @@ body: |
; MUBUF-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 24, addrspace 5)
; MUBUF-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s32) from %stack.0 + 28, addrspace 5)
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_v8
; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7
; MUBUF-V2A-NEXT: {{ $}}
@@ -785,6 +835,7 @@ body: |
; MUBUF-V2A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-V2A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_v8
; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -792,6 +843,7 @@ body: |
; FLATSCR-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-NEXT: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s128) from %stack.0 + 16, align 4, addrspace 5)
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_v8
; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7
; FLATSCR-V2A-NEXT: {{ $}}
@@ -813,6 +865,7 @@ body: |
; FLATSCR-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; FLATSCR-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_v8
; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store (s32) into %stack.0, addrspace 5)
@@ -832,6 +885,7 @@ body: |
; MUBUF-GFX90A-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 24, addrspace 5)
; MUBUF-GFX90A-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s32) from %stack.0 + 28, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_v8
; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -853,6 +907,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_v8
; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -860,6 +915,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s128) from %stack.0 + 16, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_v8
; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -934,6 +990,7 @@ body: |
; MUBUF-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 56, addrspace 5)
; MUBUF-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load (s32) from %stack.0 + 60, addrspace 5)
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_v16
; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15
; MUBUF-V2A-NEXT: {{ $}}
@@ -971,6 +1028,7 @@ body: |
; MUBUF-V2A-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-V2A-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_v16
; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -982,6 +1040,7 @@ body: |
; FLATSCR-NEXT: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 32, align 4, addrspace 5)
; FLATSCR-NEXT: $vgpr12_vgpr13_vgpr14_vgpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load (s128) from %stack.0 + 48, align 4, addrspace 5)
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_v16
; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15
; FLATSCR-V2A-NEXT: {{ $}}
@@ -1019,6 +1078,7 @@ body: |
; FLATSCR-V2A-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; FLATSCR-V2A-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_v16
; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store (s32) into %stack.0, addrspace 5)
@@ -1054,6 +1114,7 @@ body: |
; MUBUF-GFX90A-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 56, addrspace 5)
; MUBUF-GFX90A-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load (s32) from %stack.0 + 60, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_v16
; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -1091,6 +1152,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_v16
; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -1102,6 +1164,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 32, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $vgpr12_vgpr13_vgpr14_vgpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load (s128) from %stack.0 + 48, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_v16
; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -1224,6 +1287,7 @@ body: |
; MUBUF-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 120, addrspace 5)
; MUBUF-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load (s32) from %stack.0 + 124, addrspace 5)
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_v32
; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31
; MUBUF-V2A-NEXT: {{ $}}
@@ -1293,6 +1357,7 @@ body: |
; MUBUF-V2A-NEXT: $vgpr30 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-V2A-NEXT: $vgpr31 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_v32
; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -1312,6 +1377,7 @@ body: |
; FLATSCR-NEXT: $vgpr24_vgpr25_vgpr26_vgpr27 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 96, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 96, align 4, addrspace 5)
; FLATSCR-NEXT: $vgpr28_vgpr29_vgpr30_vgpr31 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 112, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load (s128) from %stack.0 + 112, align 4, addrspace 5)
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_v32
; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31
; FLATSCR-V2A-NEXT: {{ $}}
@@ -1381,6 +1447,7 @@ body: |
; FLATSCR-V2A-NEXT: $vgpr29 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; FLATSCR-V2A-NEXT: $vgpr28 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_v32
; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store (s32) into %stack.0, addrspace 5)
@@ -1448,6 +1515,7 @@ body: |
; MUBUF-GFX90A-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 120, addrspace 5)
; MUBUF-GFX90A-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load (s32) from %stack.0 + 124, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_v32
; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -1517,6 +1585,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $vgpr30 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $vgpr31 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_v32
; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -1536,6 +1605,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $vgpr24_vgpr25_vgpr26_vgpr27 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 96, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 96, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $vgpr28_vgpr29_vgpr30_vgpr31 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 112, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load (s128) from %stack.0 + 112, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_v32
; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -1630,6 +1700,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
; MUBUF-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_a1
; MUBUF-V2A: liveins: $vgpr0
; MUBUF-V2A-NEXT: {{ $}}
@@ -1637,6 +1708,7 @@ body: |
; MUBUF-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
; MUBUF-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_a1
; FLATSCR: $agpr0 = IMPLICIT_DEF
; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
@@ -1644,6 +1716,7 @@ body: |
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
; FLATSCR-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_a1
; FLATSCR-V2A: liveins: $vgpr0
; FLATSCR-V2A-NEXT: {{ $}}
@@ -1651,11 +1724,13 @@ body: |
; FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
; FLATSCR-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_a1
; MUBUF-GFX90A: $agpr0 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
; MUBUF-GFX90A-NEXT: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a1
; MUBUF-GFX90A-V2A: liveins: $vgpr0
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -1663,11 +1738,13 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_a1
; FLATSCR-GFX90A: $agpr0 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; FLATSCR-GFX90A-NEXT: $agpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a1
; FLATSCR-GFX90A-V2A: liveins: $vgpr0
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -1704,6 +1781,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 4, addrspace 5)
; MUBUF-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_a2
; MUBUF-V2A: liveins: $vgpr0, $vgpr1
; MUBUF-V2A-NEXT: {{ $}}
@@ -1713,6 +1791,7 @@ body: |
; MUBUF-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1
; MUBUF-V2A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_a2
; FLATSCR: $agpr0_agpr1 = IMPLICIT_DEF
; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1
@@ -1724,6 +1803,7 @@ body: |
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 4, addrspace 5)
; FLATSCR-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_a2
; FLATSCR-V2A: liveins: $vgpr0, $vgpr1
; FLATSCR-V2A-NEXT: {{ $}}
@@ -1733,6 +1813,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1
; FLATSCR-V2A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_a2
; MUBUF-GFX90A: $agpr0_agpr1 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1, implicit $agpr0_agpr1 :: (store (s32) into %stack.0, addrspace 5)
@@ -1740,6 +1821,7 @@ body: |
; MUBUF-GFX90A-NEXT: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1 :: (load (s32) from %stack.0, addrspace 5)
; MUBUF-GFX90A-NEXT: $agpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec, implicit-def $agpr0_agpr1 :: (load (s32) from %stack.0 + 4, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a2
; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -1749,11 +1831,13 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1
; MUBUF-GFX90A-V2A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_a2
; FLATSCR-GFX90A: $agpr0_agpr1 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $agpr0_agpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $agpr0_agpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a2
; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -1796,6 +1880,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 8, addrspace 5)
; MUBUF-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_a3
; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2
; MUBUF-V2A-NEXT: {{ $}}
@@ -1807,6 +1892,7 @@ body: |
; MUBUF-V2A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_a3
; FLATSCR: $agpr0_agpr1_agpr2 = IMPLICIT_DEF
; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
@@ -1822,6 +1908,7 @@ body: |
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 8, addrspace 5)
; FLATSCR-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_a3
; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2
; FLATSCR-V2A-NEXT: {{ $}}
@@ -1833,6 +1920,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; FLATSCR-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_a3
; MUBUF-GFX90A: $agpr0_agpr1_agpr2 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr0_agpr1_agpr2 :: (store (s32) into %stack.0, addrspace 5)
@@ -1842,6 +1930,7 @@ body: |
; MUBUF-GFX90A-NEXT: $agpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 4, addrspace 5)
; MUBUF-GFX90A-NEXT: $agpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 :: (load (s32) from %stack.0 + 8, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a3
; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -1853,11 +1942,13 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_a3
; FLATSCR-GFX90A: $agpr0_agpr1_agpr2 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $agpr0_agpr1_agpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $agpr0_agpr1_agpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s96) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a3
; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -1906,6 +1997,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 12, addrspace 5)
; MUBUF-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_a4
; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; MUBUF-V2A-NEXT: {{ $}}
@@ -1919,6 +2011,7 @@ body: |
; MUBUF-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-V2A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_a4
; FLATSCR: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF
; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
@@ -1938,6 +2031,7 @@ body: |
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 12, addrspace 5)
; FLATSCR-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_a4
; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; FLATSCR-V2A-NEXT: {{ $}}
@@ -1951,6 +2045,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; FLATSCR-V2A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_a4
; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3 :: (store (s32) into %stack.0, addrspace 5)
@@ -1962,6 +2057,7 @@ body: |
; MUBUF-GFX90A-NEXT: $agpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 8, addrspace 5)
; MUBUF-GFX90A-NEXT: $agpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 :: (load (s32) from %stack.0 + 12, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a4
; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -1975,11 +2071,13 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_a4
; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a4
; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -2034,6 +2132,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 16, addrspace 5)
; MUBUF-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_a5
; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
; MUBUF-V2A-NEXT: {{ $}}
@@ -2049,6 +2148,7 @@ body: |
; MUBUF-V2A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_a5
; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF
; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
@@ -2072,6 +2172,7 @@ body: |
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 16, addrspace 5)
; FLATSCR-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_a5
; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
; FLATSCR-V2A-NEXT: {{ $}}
@@ -2087,6 +2188,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; FLATSCR-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_a5
; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store (s32) into %stack.0, addrspace 5)
@@ -2100,6 +2202,7 @@ body: |
; MUBUF-GFX90A-NEXT: $agpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 12, addrspace 5)
; MUBUF-GFX90A-NEXT: $agpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load (s32) from %stack.0 + 16, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a5
; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -2115,6 +2218,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_a5
; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -2122,6 +2226,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $agpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load (s32) from %stack.0 + 16, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a5
; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -2182,6 +2287,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 20, addrspace 5)
; MUBUF-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_a6
; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; MUBUF-V2A-NEXT: {{ $}}
@@ -2199,6 +2305,7 @@ body: |
; MUBUF-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-V2A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_a6
; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF
; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
@@ -2226,6 +2333,7 @@ body: |
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 20, addrspace 5)
; FLATSCR-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_a6
; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; FLATSCR-V2A-NEXT: {{ $}}
@@ -2243,6 +2351,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; FLATSCR-V2A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_a6
; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store (s32) into %stack.0, addrspace 5)
@@ -2258,6 +2367,7 @@ body: |
; MUBUF-GFX90A-NEXT: $agpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 16, addrspace 5)
; MUBUF-GFX90A-NEXT: $agpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load (s32) from %stack.0 + 20, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a6
; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -2275,6 +2385,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_a6
; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -2282,6 +2393,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $agpr4_agpr5 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load (s64) from %stack.0 + 16, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a6
; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -2348,6 +2460,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 24, addrspace 5)
; MUBUF-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_a7
; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
; MUBUF-V2A-NEXT: {{ $}}
@@ -2367,6 +2480,7 @@ body: |
; MUBUF-V2A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-V2A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_a7
; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6 = IMPLICIT_DEF
; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6
@@ -2398,6 +2512,7 @@ body: |
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 24, addrspace 5)
; FLATSCR-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_a7
; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
; FLATSCR-V2A-NEXT: {{ $}}
@@ -2417,6 +2532,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; FLATSCR-V2A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_a7
; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6 :: (store (s32) into %stack.0, addrspace 5)
@@ -2434,6 +2550,7 @@ body: |
; MUBUF-GFX90A-NEXT: $agpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 20, addrspace 5)
; MUBUF-GFX90A-NEXT: $agpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6 :: (load (s32) from %stack.0 + 24, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a7
; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -2453,6 +2570,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_a7
; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -2460,6 +2578,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6 :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $agpr4_agpr5_agpr6 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6 :: (load (s96) from %stack.0 + 16, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a7
; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -2532,6 +2651,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 28, addrspace 5)
; MUBUF-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_a8
; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
; MUBUF-V2A-NEXT: {{ $}}
@@ -2553,6 +2673,7 @@ body: |
; MUBUF-V2A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-V2A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_a8
; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF
; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
@@ -2588,6 +2709,7 @@ body: |
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 28, addrspace 5)
; FLATSCR-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_a8
; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
; FLATSCR-V2A-NEXT: {{ $}}
@@ -2609,6 +2731,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; FLATSCR-V2A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_a8
; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store (s32) into %stack.0, addrspace 5)
@@ -2628,6 +2751,7 @@ body: |
; MUBUF-GFX90A-NEXT: $agpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 24, addrspace 5)
; MUBUF-GFX90A-NEXT: $agpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load (s32) from %stack.0 + 28, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a8
; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -2649,6 +2773,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_a8
; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -2656,6 +2781,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $agpr4_agpr5_agpr6_agpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load (s128) from %stack.0 + 16, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a8
; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -2734,6 +2860,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 32, addrspace 5)
; MUBUF-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_a9
; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; MUBUF-V2A-NEXT: {{ $}}
@@ -2757,6 +2884,7 @@ body: |
; MUBUF-V2A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-V2A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_a9
; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 = IMPLICIT_DEF
; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8
@@ -2796,6 +2924,7 @@ body: |
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 32, addrspace 5)
; FLATSCR-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_a9
; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; FLATSCR-V2A-NEXT: {{ $}}
@@ -2819,6 +2948,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; FLATSCR-V2A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_a9
; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 :: (store (s32) into %stack.0, addrspace 5)
@@ -2840,6 +2970,7 @@ body: |
; MUBUF-GFX90A-NEXT: $agpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 28, addrspace 5)
; MUBUF-GFX90A-NEXT: $agpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 :: (load (s32) from %stack.0 + 32, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a9
; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -2863,6 +2994,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_a9
; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -2872,6 +3004,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $agpr4_agpr5_agpr6_agpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 16, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $agpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 :: (load (s32) from %stack.0 + 32, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a9
; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -2956,6 +3089,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 36, addrspace 5)
; MUBUF-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_a10
; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
; MUBUF-V2A-NEXT: {{ $}}
@@ -2981,6 +3115,7 @@ body: |
; MUBUF-V2A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-V2A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_a10
; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = IMPLICIT_DEF
; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9
@@ -3024,6 +3159,7 @@ body: |
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 36, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 36, addrspace 5)
; FLATSCR-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_a10
; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
; FLATSCR-V2A-NEXT: {{ $}}
@@ -3049,6 +3185,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; FLATSCR-V2A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_a10
; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 :: (store (s32) into %stack.0, addrspace 5)
@@ -3072,6 +3209,7 @@ body: |
; MUBUF-GFX90A-NEXT: $agpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 32, addrspace 5)
; MUBUF-GFX90A-NEXT: $agpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 :: (load (s32) from %stack.0 + 36, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a10
; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -3097,6 +3235,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_a10
; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -3106,6 +3245,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $agpr4_agpr5_agpr6_agpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 16, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $agpr8_agpr9 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 :: (load (s64) from %stack.0 + 32, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a10
; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -3196,6 +3336,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 40, addrspace 5)
; MUBUF-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_a11
; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
; MUBUF-V2A-NEXT: {{ $}}
@@ -3223,6 +3364,7 @@ body: |
; MUBUF-V2A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-V2A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_a11
; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 = IMPLICIT_DEF
; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10
@@ -3270,6 +3412,7 @@ body: |
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 40, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 40, addrspace 5)
; FLATSCR-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_a11
; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
; FLATSCR-V2A-NEXT: {{ $}}
@@ -3297,6 +3440,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; FLATSCR-V2A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_a11
; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 :: (store (s32) into %stack.0, addrspace 5)
@@ -3322,6 +3466,7 @@ body: |
; MUBUF-GFX90A-NEXT: $agpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 36, addrspace 5)
; MUBUF-GFX90A-NEXT: $agpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 :: (load (s32) from %stack.0 + 40, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a11
; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -3349,6 +3494,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_a11
; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -3358,6 +3504,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $agpr4_agpr5_agpr6_agpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 16, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $agpr8_agpr9_agpr10 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 :: (load (s96) from %stack.0 + 32, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a11
; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -3454,6 +3601,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 44, addrspace 5)
; MUBUF-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_a12
; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
; MUBUF-V2A-NEXT: {{ $}}
@@ -3483,6 +3631,7 @@ body: |
; MUBUF-V2A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-V2A-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_a12
; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 = IMPLICIT_DEF
; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11
@@ -3534,6 +3683,7 @@ body: |
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 44, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 44, addrspace 5)
; FLATSCR-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_a12
; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
; FLATSCR-V2A-NEXT: {{ $}}
@@ -3563,6 +3713,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; FLATSCR-V2A-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_a12
; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 :: (store (s32) into %stack.0, addrspace 5)
@@ -3590,6 +3741,7 @@ body: |
; MUBUF-GFX90A-NEXT: $agpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 40, addrspace 5)
; MUBUF-GFX90A-NEXT: $agpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 :: (load (s32) from %stack.0 + 44, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a12
; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -3619,6 +3771,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_a12
; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -3628,6 +3781,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $agpr4_agpr5_agpr6_agpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 16, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $agpr8_agpr9_agpr10_agpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 :: (load (s128) from %stack.0 + 32, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a12
; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -3742,6 +3896,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 60, addrspace 5)
; MUBUF-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_a16
; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
; MUBUF-V2A-NEXT: {{ $}}
@@ -3779,6 +3934,7 @@ body: |
; MUBUF-V2A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-V2A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_a16
; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF
; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
@@ -3846,6 +4002,7 @@ body: |
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 60, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 60, addrspace 5)
; FLATSCR-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_a16
; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
; FLATSCR-V2A-NEXT: {{ $}}
@@ -3883,6 +4040,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; FLATSCR-V2A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_a16
; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store (s32) into %stack.0, addrspace 5)
@@ -3918,6 +4076,7 @@ body: |
; MUBUF-GFX90A-NEXT: $agpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 56, addrspace 5)
; MUBUF-GFX90A-NEXT: $agpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load (s32) from %stack.0 + 60, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a16
; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -3955,6 +4114,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_a16
; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -3966,6 +4126,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $agpr8_agpr9_agpr10_agpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 32, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $agpr12_agpr13_agpr14_agpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load (s128) from %stack.0 + 48, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a16
; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -4152,6 +4313,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 124, addrspace 5)
; MUBUF-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_av_a32
; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31
; MUBUF-V2A-NEXT: {{ $}}
@@ -4221,6 +4383,7 @@ body: |
; MUBUF-V2A-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-V2A-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_av_a32
; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF
; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
@@ -4352,6 +4515,7 @@ body: |
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 124, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 124, addrspace 5)
; FLATSCR-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_av_a32
; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31
; FLATSCR-V2A-NEXT: {{ $}}
@@ -4421,6 +4585,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; FLATSCR-V2A-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_av_a32
; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store (s32) into %stack.0, addrspace 5)
@@ -4488,6 +4653,7 @@ body: |
; MUBUF-GFX90A-NEXT: $agpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 120, addrspace 5)
; MUBUF-GFX90A-NEXT: $agpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load (s32) from %stack.0 + 124, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a32
; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -4557,6 +4723,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_av_a32
; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -4576,6 +4743,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $agpr24_agpr25_agpr26_agpr27 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 96, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 96, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $agpr28_agpr29_agpr30_agpr31 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 112, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load (s128) from %stack.0 + 112, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a32
; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir b/llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir
index a3d78f342c25..8eddc9a5afd5 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir
@@ -65,6 +65,7 @@ body: |
; MUBUF-V2A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
; MUBUF-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_v2_partial_agpr
; FLATSCR-V2A: liveins: $agpr0
; FLATSCR-V2A-NEXT: {{ $}}
@@ -103,6 +104,7 @@ body: |
; MUBUF-V2A-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 4, addrspace 5)
; MUBUF-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_v3_partial_agpr
; FLATSCR-V2A: liveins: $agpr0
; FLATSCR-V2A-NEXT: {{ $}}
@@ -143,6 +145,7 @@ body: |
; MUBUF-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_v4_partial_agpr
; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2
; FLATSCR-V2A-NEXT: {{ $}}
@@ -189,6 +192,7 @@ body: |
; MUBUF-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_v5_partial_agpr
; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2
; FLATSCR-V2A-NEXT: {{ $}}
@@ -237,6 +241,7 @@ body: |
; MUBUF-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_v6_partial_agpr
; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4
; FLATSCR-V2A-NEXT: {{ $}}
@@ -293,6 +298,7 @@ body: |
; MUBUF-V2A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-V2A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_v8_partial_agpr
; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3
; FLATSCR-V2A-NEXT: {{ $}}
@@ -363,6 +369,7 @@ body: |
; MUBUF-V2A-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-V2A-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_v16_partial_agpr
; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4
; FLATSCR-V2A-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/pei-build-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-build-spill.mir
index 9024a395b6cf..d74b6f239b35 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-build-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-build-spill.mir
@@ -24,6 +24,7 @@ body: |
; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_v1
; MUBUF-V2A: liveins: $agpr0
; MUBUF-V2A-NEXT: {{ $}}
@@ -31,11 +32,13 @@ body: |
; MUBUF-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; MUBUF-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_v1
; FLATSCR: $vgpr0 = IMPLICIT_DEF
; FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_v1
; FLATSCR-V2A: liveins: $agpr0
; FLATSCR-V2A-NEXT: {{ $}}
@@ -43,11 +46,13 @@ body: |
; FLATSCR-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_v1
; MUBUF-GFX90A: $vgpr0 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
; MUBUF-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v1
; MUBUF-GFX90A-V2A: liveins: $agpr0
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -55,11 +60,13 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_v1
; FLATSCR-GFX90A: $vgpr0 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; FLATSCR-GFX90A-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v1
; FLATSCR-GFX90A-V2A: liveins: $agpr0
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -92,6 +99,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
; MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0 + 4, addrspace 5)
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_v2
; MUBUF-V2A: liveins: $agpr0, $agpr1
; MUBUF-V2A-NEXT: {{ $}}
@@ -101,11 +109,13 @@ body: |
; MUBUF-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1
; MUBUF-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_v2
; FLATSCR: $vgpr0_vgpr1 = IMPLICIT_DEF
; FLATSCR-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %stack.0, align 4, addrspace 5)
; FLATSCR-NEXT: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from %stack.0, align 4, addrspace 5)
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_v2
; FLATSCR-V2A: liveins: $agpr0, $agpr1
; FLATSCR-V2A-NEXT: {{ $}}
@@ -115,6 +125,7 @@ body: |
; FLATSCR-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1
; FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $vgpr0_vgpr1
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_v2
; MUBUF-GFX90A: $vgpr0_vgpr1 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
@@ -122,6 +133,7 @@ body: |
; MUBUF-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
; MUBUF-GFX90A-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0 + 4, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v2
; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -131,11 +143,13 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1
; MUBUF-GFX90A-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_v2
; FLATSCR-GFX90A: $vgpr0_vgpr1 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v2
; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -172,6 +186,7 @@ body: |
; MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 4, addrspace 5)
; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load (s32) from %stack.0 + 8, addrspace 5)
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_v3
; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2
; MUBUF-V2A-NEXT: {{ $}}
@@ -183,11 +198,13 @@ body: |
; MUBUF-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_v3
; FLATSCR: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF
; FLATSCR-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr0_vgpr1_vgpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into %stack.0, align 4, addrspace 5)
; FLATSCR-NEXT: $vgpr0_vgpr1_vgpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s96) from %stack.0, align 4, addrspace 5)
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_v3
; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2
; FLATSCR-V2A-NEXT: {{ $}}
@@ -199,6 +216,7 @@ body: |
; FLATSCR-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2
; FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_v3
; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr0_vgpr1_vgpr2 :: (store (s32) into %stack.0, addrspace 5)
@@ -208,6 +226,7 @@ body: |
; MUBUF-GFX90A-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 4, addrspace 5)
; MUBUF-GFX90A-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load (s32) from %stack.0 + 8, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v3
; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -219,11 +238,13 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_v3
; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr0_vgpr1_vgpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1_vgpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s96) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v3
; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -264,6 +285,7 @@ body: |
; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 8, addrspace 5)
; MUBUF-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s32) from %stack.0 + 12, addrspace 5)
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_v4
; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3
; MUBUF-V2A-NEXT: {{ $}}
@@ -277,11 +299,13 @@ body: |
; MUBUF-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_v4
; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into %stack.0, align 4, addrspace 5)
; FLATSCR-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_v4
; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3
; FLATSCR-V2A-NEXT: {{ $}}
@@ -295,6 +319,7 @@ body: |
; FLATSCR-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_v4
; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5)
@@ -306,6 +331,7 @@ body: |
; MUBUF-GFX90A-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 8, addrspace 5)
; MUBUF-GFX90A-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s32) from %stack.0 + 12, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v4
; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -319,11 +345,13 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_v4
; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v4
; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -368,6 +396,7 @@ body: |
; MUBUF-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 12, addrspace 5)
; MUBUF-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s32) from %stack.0 + 16, addrspace 5)
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_v5
; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4
; MUBUF-V2A-NEXT: {{ $}}
@@ -383,6 +412,7 @@ body: |
; MUBUF-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_v5
; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF
; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -390,6 +420,7 @@ body: |
; FLATSCR-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s32) from %stack.0 + 16, addrspace 5)
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_v5
; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4
; FLATSCR-V2A-NEXT: {{ $}}
@@ -405,6 +436,7 @@ body: |
; FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
; FLATSCR-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_v5
; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store (s32) into %stack.0, addrspace 5)
@@ -418,6 +450,7 @@ body: |
; MUBUF-GFX90A-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 12, addrspace 5)
; MUBUF-GFX90A-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s32) from %stack.0 + 16, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v5
; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -433,6 +466,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_v5
; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -440,6 +474,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s32) from %stack.0 + 16, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v5
; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -488,6 +523,7 @@ body: |
; MUBUF-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 16, addrspace 5)
; MUBUF-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s32) from %stack.0 + 20, addrspace 5)
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_v6
; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5
; MUBUF-V2A-NEXT: {{ $}}
@@ -505,6 +541,7 @@ body: |
; MUBUF-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_v6
; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF
; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -512,6 +549,7 @@ body: |
; FLATSCR-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-NEXT: $vgpr4_vgpr5 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s64) from %stack.0 + 16, align 4, addrspace 5)
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_v6
; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5
; FLATSCR-V2A-NEXT: {{ $}}
@@ -529,6 +567,7 @@ body: |
; FLATSCR-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
; FLATSCR-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_v6
; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store (s32) into %stack.0, addrspace 5)
@@ -544,6 +583,7 @@ body: |
; MUBUF-GFX90A-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 16, addrspace 5)
; MUBUF-GFX90A-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s32) from %stack.0 + 20, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v6
; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -561,6 +601,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_v6
; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -568,6 +609,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $vgpr4_vgpr5 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s64) from %stack.0 + 16, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v6
; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -622,6 +664,7 @@ body: |
; MUBUF-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 24, addrspace 5)
; MUBUF-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s32) from %stack.0 + 28, addrspace 5)
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_v8
; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7
; MUBUF-V2A-NEXT: {{ $}}
@@ -643,6 +686,7 @@ body: |
; MUBUF-V2A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-V2A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_v8
; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -650,6 +694,7 @@ body: |
; FLATSCR-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-NEXT: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s128) from %stack.0 + 16, align 4, addrspace 5)
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_v8
; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7
; FLATSCR-V2A-NEXT: {{ $}}
@@ -671,6 +716,7 @@ body: |
; FLATSCR-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; FLATSCR-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_v8
; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store (s32) into %stack.0, addrspace 5)
@@ -690,6 +736,7 @@ body: |
; MUBUF-GFX90A-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 24, addrspace 5)
; MUBUF-GFX90A-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s32) from %stack.0 + 28, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v8
; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -711,6 +758,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_v8
; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -718,6 +766,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s128) from %stack.0 + 16, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v8
; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -792,6 +841,7 @@ body: |
; MUBUF-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 56, addrspace 5)
; MUBUF-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load (s32) from %stack.0 + 60, addrspace 5)
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_v16
; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15
; MUBUF-V2A-NEXT: {{ $}}
@@ -829,6 +879,7 @@ body: |
; MUBUF-V2A-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-V2A-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_v16
; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -840,6 +891,7 @@ body: |
; FLATSCR-NEXT: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 32, align 4, addrspace 5)
; FLATSCR-NEXT: $vgpr12_vgpr13_vgpr14_vgpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load (s128) from %stack.0 + 48, align 4, addrspace 5)
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_v16
; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15
; FLATSCR-V2A-NEXT: {{ $}}
@@ -877,6 +929,7 @@ body: |
; FLATSCR-V2A-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; FLATSCR-V2A-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_v16
; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store (s32) into %stack.0, addrspace 5)
@@ -912,6 +965,7 @@ body: |
; MUBUF-GFX90A-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 56, addrspace 5)
; MUBUF-GFX90A-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load (s32) from %stack.0 + 60, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v16
; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -949,6 +1003,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_v16
; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -960,6 +1015,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 32, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $vgpr12_vgpr13_vgpr14_vgpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load (s128) from %stack.0 + 48, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v16
; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -1082,6 +1138,7 @@ body: |
; MUBUF-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 120, addrspace 5)
; MUBUF-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load (s32) from %stack.0 + 124, addrspace 5)
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_v32
; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31
; MUBUF-V2A-NEXT: {{ $}}
@@ -1151,6 +1208,7 @@ body: |
; MUBUF-V2A-NEXT: $vgpr30 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-V2A-NEXT: $vgpr31 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_v32
; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -1170,6 +1228,7 @@ body: |
; FLATSCR-NEXT: $vgpr24_vgpr25_vgpr26_vgpr27 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 96, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 96, align 4, addrspace 5)
; FLATSCR-NEXT: $vgpr28_vgpr29_vgpr30_vgpr31 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 112, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load (s128) from %stack.0 + 112, align 4, addrspace 5)
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_v32
; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31
; FLATSCR-V2A-NEXT: {{ $}}
@@ -1239,6 +1298,7 @@ body: |
; FLATSCR-V2A-NEXT: $vgpr29 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; FLATSCR-V2A-NEXT: $vgpr28 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_v32
; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store (s32) into %stack.0, addrspace 5)
@@ -1306,6 +1366,7 @@ body: |
; MUBUF-GFX90A-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 120, addrspace 5)
; MUBUF-GFX90A-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load (s32) from %stack.0 + 124, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v32
; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -1375,6 +1436,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $vgpr30 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $vgpr31 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_v32
; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -1394,6 +1456,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $vgpr24_vgpr25_vgpr26_vgpr27 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 96, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 96, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $vgpr28_vgpr29_vgpr30_vgpr31 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 112, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load (s128) from %stack.0 + 112, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v32
; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -1488,6 +1551,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
; MUBUF-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_a1
; MUBUF-V2A: liveins: $vgpr0
; MUBUF-V2A-NEXT: {{ $}}
@@ -1495,6 +1559,7 @@ body: |
; MUBUF-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
; MUBUF-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_a1
; FLATSCR: $agpr0 = IMPLICIT_DEF
; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
@@ -1502,6 +1567,7 @@ body: |
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
; FLATSCR-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_a1
; FLATSCR-V2A: liveins: $vgpr0
; FLATSCR-V2A-NEXT: {{ $}}
@@ -1509,11 +1575,13 @@ body: |
; FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
; FLATSCR-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_a1
; MUBUF-GFX90A: $agpr0 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
; MUBUF-GFX90A-NEXT: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a1
; MUBUF-GFX90A-V2A: liveins: $vgpr0
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -1521,11 +1589,13 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_a1
; FLATSCR-GFX90A: $agpr0 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; FLATSCR-GFX90A-NEXT: $agpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a1
; FLATSCR-GFX90A-V2A: liveins: $vgpr0
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -1562,6 +1632,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 4, addrspace 5)
; MUBUF-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_a2
; MUBUF-V2A: liveins: $vgpr0, $vgpr1
; MUBUF-V2A-NEXT: {{ $}}
@@ -1571,6 +1642,7 @@ body: |
; MUBUF-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1
; MUBUF-V2A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_a2
; FLATSCR: $agpr0_agpr1 = IMPLICIT_DEF
; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1
@@ -1582,6 +1654,7 @@ body: |
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 4, addrspace 5)
; FLATSCR-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_a2
; FLATSCR-V2A: liveins: $vgpr0, $vgpr1
; FLATSCR-V2A-NEXT: {{ $}}
@@ -1591,6 +1664,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1
; FLATSCR-V2A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_a2
; MUBUF-GFX90A: $agpr0_agpr1 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1, implicit $agpr0_agpr1 :: (store (s32) into %stack.0, addrspace 5)
@@ -1598,6 +1672,7 @@ body: |
; MUBUF-GFX90A-NEXT: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1 :: (load (s32) from %stack.0, addrspace 5)
; MUBUF-GFX90A-NEXT: $agpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec, implicit-def $agpr0_agpr1 :: (load (s32) from %stack.0 + 4, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a2
; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -1607,11 +1682,13 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1
; MUBUF-GFX90A-V2A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_a2
; FLATSCR-GFX90A: $agpr0_agpr1 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $agpr0_agpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $agpr0_agpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a2
; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -1654,6 +1731,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 8, addrspace 5)
; MUBUF-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_a3
; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2
; MUBUF-V2A-NEXT: {{ $}}
@@ -1665,6 +1743,7 @@ body: |
; MUBUF-V2A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_a3
; FLATSCR: $agpr0_agpr1_agpr2 = IMPLICIT_DEF
; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
@@ -1680,6 +1759,7 @@ body: |
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 8, addrspace 5)
; FLATSCR-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_a3
; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2
; FLATSCR-V2A-NEXT: {{ $}}
@@ -1691,6 +1771,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; FLATSCR-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_a3
; MUBUF-GFX90A: $agpr0_agpr1_agpr2 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr0_agpr1_agpr2 :: (store (s32) into %stack.0, addrspace 5)
@@ -1700,6 +1781,7 @@ body: |
; MUBUF-GFX90A-NEXT: $agpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 4, addrspace 5)
; MUBUF-GFX90A-NEXT: $agpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 :: (load (s32) from %stack.0 + 8, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a3
; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -1711,11 +1793,13 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_a3
; FLATSCR-GFX90A: $agpr0_agpr1_agpr2 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $agpr0_agpr1_agpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $agpr0_agpr1_agpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s96) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a3
; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -1764,6 +1848,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 12, addrspace 5)
; MUBUF-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_a4
; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; MUBUF-V2A-NEXT: {{ $}}
@@ -1777,6 +1862,7 @@ body: |
; MUBUF-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-V2A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_a4
; FLATSCR: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF
; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
@@ -1796,6 +1882,7 @@ body: |
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 12, addrspace 5)
; FLATSCR-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_a4
; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; FLATSCR-V2A-NEXT: {{ $}}
@@ -1809,6 +1896,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; FLATSCR-V2A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_a4
; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3 :: (store (s32) into %stack.0, addrspace 5)
@@ -1820,6 +1908,7 @@ body: |
; MUBUF-GFX90A-NEXT: $agpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 8, addrspace 5)
; MUBUF-GFX90A-NEXT: $agpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 :: (load (s32) from %stack.0 + 12, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a4
; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -1833,11 +1922,13 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_a4
; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a4
; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -1892,6 +1983,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 16, addrspace 5)
; MUBUF-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_a5
; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
; MUBUF-V2A-NEXT: {{ $}}
@@ -1907,6 +1999,7 @@ body: |
; MUBUF-V2A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_a5
; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF
; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
@@ -1930,6 +2023,7 @@ body: |
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 16, addrspace 5)
; FLATSCR-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_a5
; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
; FLATSCR-V2A-NEXT: {{ $}}
@@ -1945,6 +2039,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; FLATSCR-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_a5
; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store (s32) into %stack.0, addrspace 5)
@@ -1958,6 +2053,7 @@ body: |
; MUBUF-GFX90A-NEXT: $agpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 12, addrspace 5)
; MUBUF-GFX90A-NEXT: $agpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load (s32) from %stack.0 + 16, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a5
; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -1973,6 +2069,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_a5
; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -1980,6 +2077,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $agpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load (s32) from %stack.0 + 16, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a5
; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -2040,6 +2138,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 20, addrspace 5)
; MUBUF-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_a6
; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; MUBUF-V2A-NEXT: {{ $}}
@@ -2057,6 +2156,7 @@ body: |
; MUBUF-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-V2A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_a6
; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF
; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
@@ -2084,6 +2184,7 @@ body: |
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 20, addrspace 5)
; FLATSCR-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_a6
; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; FLATSCR-V2A-NEXT: {{ $}}
@@ -2101,6 +2202,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; FLATSCR-V2A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_a6
; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store (s32) into %stack.0, addrspace 5)
@@ -2116,6 +2218,7 @@ body: |
; MUBUF-GFX90A-NEXT: $agpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 16, addrspace 5)
; MUBUF-GFX90A-NEXT: $agpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load (s32) from %stack.0 + 20, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a6
; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -2133,6 +2236,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_a6
; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -2140,6 +2244,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $agpr4_agpr5 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load (s64) from %stack.0 + 16, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a6
; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -2210,6 +2315,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 28, addrspace 5)
; MUBUF-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_a8
; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
; MUBUF-V2A-NEXT: {{ $}}
@@ -2231,6 +2337,7 @@ body: |
; MUBUF-V2A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-V2A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_a8
; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF
; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
@@ -2266,6 +2373,7 @@ body: |
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 28, addrspace 5)
; FLATSCR-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_a8
; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
; FLATSCR-V2A-NEXT: {{ $}}
@@ -2287,6 +2395,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; FLATSCR-V2A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_a8
; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store (s32) into %stack.0, addrspace 5)
@@ -2306,6 +2415,7 @@ body: |
; MUBUF-GFX90A-NEXT: $agpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 24, addrspace 5)
; MUBUF-GFX90A-NEXT: $agpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load (s32) from %stack.0 + 28, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a8
; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -2327,6 +2437,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_a8
; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -2334,6 +2445,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load (s128) from %stack.0, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $agpr4_agpr5_agpr6_agpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load (s128) from %stack.0 + 16, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a8
; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -2440,6 +2552,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 60, addrspace 5)
; MUBUF-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_a16
; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
; MUBUF-V2A-NEXT: {{ $}}
@@ -2477,6 +2590,7 @@ body: |
; MUBUF-V2A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-V2A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_a16
; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF
; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
@@ -2544,6 +2658,7 @@ body: |
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 60, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 60, addrspace 5)
; FLATSCR-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_a16
; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
; FLATSCR-V2A-NEXT: {{ $}}
@@ -2581,6 +2696,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; FLATSCR-V2A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_a16
; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store (s32) into %stack.0, addrspace 5)
@@ -2616,6 +2732,7 @@ body: |
; MUBUF-GFX90A-NEXT: $agpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 56, addrspace 5)
; MUBUF-GFX90A-NEXT: $agpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load (s32) from %stack.0 + 60, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a16
; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -2653,6 +2770,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_a16
; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -2664,6 +2782,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $agpr8_agpr9_agpr10_agpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 32, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $agpr12_agpr13_agpr14_agpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load (s128) from %stack.0 + 48, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a16
; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
@@ -2850,6 +2969,7 @@ body: |
; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 124, addrspace 5)
; MUBUF-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; MUBUF-V2A-LABEL: name: test_spill_a32
; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31
; MUBUF-V2A-NEXT: {{ $}}
@@ -2919,6 +3039,7 @@ body: |
; MUBUF-V2A-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-V2A-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
; MUBUF-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-LABEL: name: test_spill_a32
; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF
; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
@@ -3050,6 +3171,7 @@ body: |
; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 124, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 124, addrspace 5)
; FLATSCR-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
; FLATSCR-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-V2A-LABEL: name: test_spill_a32
; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31
; FLATSCR-V2A-NEXT: {{ $}}
@@ -3119,6 +3241,7 @@ body: |
; FLATSCR-V2A-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; FLATSCR-V2A-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
; FLATSCR-V2A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-LABEL: name: test_spill_a32
; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF
; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store (s32) into %stack.0, addrspace 5)
@@ -3186,6 +3309,7 @@ body: |
; MUBUF-GFX90A-NEXT: $agpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 120, addrspace 5)
; MUBUF-GFX90A-NEXT: $agpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load (s32) from %stack.0 + 124, addrspace 5)
; MUBUF-GFX90A-NEXT: S_ENDPGM 0
+ ;
; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a32
; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31
; MUBUF-GFX90A-V2A-NEXT: {{ $}}
@@ -3255,6 +3379,7 @@ body: |
; MUBUF-GFX90A-V2A-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; MUBUF-GFX90A-V2A-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-LABEL: name: test_spill_a32
; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF
; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store (s128) into %stack.0, align 4, addrspace 5)
@@ -3274,6 +3399,7 @@ body: |
; FLATSCR-GFX90A-NEXT: $agpr24_agpr25_agpr26_agpr27 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 96, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 96, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: $agpr28_agpr29_agpr30_agpr31 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 112, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load (s128) from %stack.0 + 112, align 4, addrspace 5)
; FLATSCR-GFX90A-NEXT: S_ENDPGM 0
+ ;
; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a32
; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31
; FLATSCR-GFX90A-V2A-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir
index e72ed2ba99e1..2ccc24152a9f 100644
--- a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir
+++ b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir
@@ -96,7 +96,7 @@ body: |
; CHECK-NEXT: renamable $sgpr93 = COPY renamable $sgpr60
; CHECK-NEXT: renamable $sgpr94 = COPY renamable $sgpr60
; CHECK-NEXT: renamable $sgpr95 = COPY renamable $sgpr60
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_1024_align2 = COPY killed renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit $exec
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY killed renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit $exec
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.11, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.5
; CHECK-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-fast-dont-drop-subreg-index-issue61134.mir b/llvm/test/CodeGen/AMDGPU/regalloc-fast-dont-drop-subreg-index-issue61134.mir
index 86f6bfbe16c6..fe2b4824f35a 100644
--- a/llvm/test/CodeGen/AMDGPU/regalloc-fast-dont-drop-subreg-index-issue61134.mir
+++ b/llvm/test/CodeGen/AMDGPU/regalloc-fast-dont-drop-subreg-index-issue61134.mir
@@ -19,14 +19,14 @@ body: |
; CHECK-LABEL: name: func
; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub0:vreg_64 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
- ; CHECK-NEXT: undef %2.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: undef %0.sub0:vreg_64, renamable $sgpr0_sgpr1 = V_ADD_CO_U32_e64 1, %0.sub0, 0, implicit $exec
- ; CHECK-NEXT: undef %2.sub1:vreg_64, dead renamable $sgpr0_sgpr1 = V_ADDC_U32_e64 0, %2.sub1, killed $sgpr0_sgpr1, 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
+ ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: undef [[COPY]].sub0:vreg_64, renamable $sgpr0_sgpr1 = V_ADD_CO_U32_e64 1, [[COPY]].sub0, 0, implicit $exec
+ ; CHECK-NEXT: undef [[V_MOV_B32_e32_]].sub1:vreg_64, dead renamable $sgpr0_sgpr1 = V_ADDC_U32_e64 0, [[V_MOV_B32_e32_]].sub1, killed $sgpr0_sgpr1, 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY1]], 0, 0, implicit $exec
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $exec
- ; CHECK-NEXT: S_ENDPGM 0, implicit %0, implicit %2, implicit $vgpr0
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY]], implicit [[V_MOV_B32_e32_]], implicit $vgpr0
undef %0.sub0:vreg_64 = COPY $vgpr0
%2:vreg_64 = COPY $vgpr1_vgpr2
undef %1.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/regcoalesce-cannot-join-failures.mir b/llvm/test/CodeGen/AMDGPU/regcoalesce-cannot-join-failures.mir
index 9fec776df13f..6c556433088c 100644
--- a/llvm/test/CodeGen/AMDGPU/regcoalesce-cannot-join-failures.mir
+++ b/llvm/test/CodeGen/AMDGPU/regcoalesce-cannot-join-failures.mir
@@ -9,16 +9,16 @@ body: |
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub0:sreg_64_xexec = IMPLICIT_DEF
+ ; CHECK-NEXT: undef [[DEF:%[0-9]+]].sub0:sreg_64_xexec = IMPLICIT_DEF
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: %0.sub1:sreg_64_xexec = COPY %0.sub0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]].sub1:sreg_64_xexec = COPY [[DEF]].sub0
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: S_ENDPGM 0, implicit %0
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[DEF]]
bb.0:
successors: %bb.1
@@ -42,9 +42,9 @@ tracksRegLiveness: true
body: |
bb.0:
; CHECK-LABEL: name: couldnt_join_subrange_no_implicit_def_inst
- ; CHECK: undef %0.sub0:sreg_64 = S_MOV_B32 0
- ; CHECK-NEXT: %0.sub1:sreg_64 = COPY %0.sub0
- ; CHECK-NEXT: S_ENDPGM 0, implicit %0.sub1
+ ; CHECK: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_MOV_B32 0
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = COPY [[S_MOV_B32_]].sub0
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_MOV_B32_]].sub1
undef %0.sub0:sreg_64 = S_MOV_B32 0
%1:sreg_64 = COPY %0:sreg_64
%0.sub1:sreg_64 = COPY %0.sub0:sreg_64
@@ -59,12 +59,12 @@ body: |
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub1:sreg_64 = S_MOV_B32 -1
+ ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 -1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: %0.sub0:sreg_64 = S_MOV_B32 0
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY %0
- ; CHECK-NEXT: dead %0.sub1:sreg_64 = COPY %0.sub0
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_MOV_B32 0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: dead [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = COPY [[S_MOV_B32_]].sub0
; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY]].sub1
bb.0:
successors: %bb.1
@@ -84,10 +84,10 @@ body: |
bb.0:
; CHECK-LABEL: name: lanes_not_tracked_subreg_join_couldnt_join_subrange
- ; CHECK: undef %0.sub0:sreg_64_xexec = S_MOV_B32 0
- ; CHECK-NEXT: %0.sub1:sreg_64_xexec = S_MOV_B32 0
- ; CHECK-NEXT: S_NOP 0, implicit %0.sub1
- ; CHECK-NEXT: S_NOP 0, implicit %0
+ ; CHECK: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64_xexec = S_MOV_B32 0
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64_xexec = S_MOV_B32 0
+ ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]].sub1
+ ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]]
; CHECK-NEXT: S_ENDPGM 0
undef %0.sub0:sreg_64_xexec = S_MOV_B32 0
%1:sreg_64 = COPY %0
@@ -105,12 +105,12 @@ body: |
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub0:sreg_64_xexec = S_MOV_B32 0
- ; CHECK-NEXT: %0.sub1:sreg_64_xexec = COPY %0.sub0
+ ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64_xexec = S_MOV_B32 0
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64_xexec = COPY [[S_MOV_B32_]].sub0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: S_NOP 0, implicit %0.sub1
- ; CHECK-NEXT: S_ENDPGM 0, implicit %0
+ ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]].sub1
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_MOV_B32_]]
bb.0:
successors: %bb.1
diff --git a/llvm/test/CodeGen/AMDGPU/regcoalesce-keep-valid-lanes-implicit-def-bug39602.mir b/llvm/test/CodeGen/AMDGPU/regcoalesce-keep-valid-lanes-implicit-def-bug39602.mir
index 1c4900ae85f5..18eb5586fdec 100644
--- a/llvm/test/CodeGen/AMDGPU/regcoalesce-keep-valid-lanes-implicit-def-bug39602.mir
+++ b/llvm/test/CodeGen/AMDGPU/regcoalesce-keep-valid-lanes-implicit-def-bug39602.mir
@@ -12,12 +12,12 @@ body: |
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub1:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: undef [[DEF:%[0-9]+]].sub1:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: %0.sub0:sreg_64 = S_MOV_B32 0
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY %0
- ; CHECK-NEXT: dead %0.sub1:sreg_64 = COPY %0.sub0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]].sub0:sreg_64 = S_MOV_B32 0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY [[DEF]]
+ ; CHECK-NEXT: dead [[DEF:%[0-9]+]].sub1:sreg_64 = COPY [[DEF]].sub0
; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY]].sub1
bb.0:
successors: %bb.1
@@ -41,12 +41,12 @@ body: |
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub1:sreg_64 = S_MOV_B32 -1
+ ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 -1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: %0.sub0:sreg_64 = S_MOV_B32 0
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY %0
- ; CHECK-NEXT: dead %0.sub1:sreg_64 = COPY %0.sub0
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_MOV_B32 0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: dead [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = COPY [[S_MOV_B32_]].sub0
; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY]].sub1
bb.0:
successors: %bb.1
diff --git a/llvm/test/CodeGen/AMDGPU/regcoalescer-resolve-lane-conflict-by-subranges.mir b/llvm/test/CodeGen/AMDGPU/regcoalescer-resolve-lane-conflict-by-subranges.mir
index 82983aa11751..d0245ff1a73a 100644
--- a/llvm/test/CodeGen/AMDGPU/regcoalescer-resolve-lane-conflict-by-subranges.mir
+++ b/llvm/test/CodeGen/AMDGPU/regcoalescer-resolve-lane-conflict-by-subranges.mir
@@ -21,11 +21,11 @@ body: |
; GCN-NEXT: bb.1:
; GCN-NEXT: successors: %bb.2(0x80000000)
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_]].sub0:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub0, [[GLOBAL_LOAD_DWORDX4_]].sub1, implicit $exec
+ ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub0, [[GLOBAL_LOAD_DWORDX4_]].sub1, implicit $exec
; GCN-NEXT: S_BRANCH %bb.2
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.2:
- ; GCN-NEXT: dead %3:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORDX4_]].sub2, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec
+ ; GCN-NEXT: dead [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORDX4_]].sub2, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
bb.0:
successors: %bb.1, %bb.2
@@ -71,7 +71,7 @@ body: |
; GCN-NEXT: S_BRANCH %bb.2
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.2:
- ; GCN-NEXT: dead %3:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORDX4_]].sub2, [[COPY1]], implicit $exec
+ ; GCN-NEXT: dead [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORDX4_]].sub2, [[COPY1]], implicit $exec
; GCN-NEXT: S_ENDPGM 0
bb.0:
successors: %bb.1, %bb.2
@@ -112,12 +112,12 @@ body: |
; GCN-NEXT: bb.1:
; GCN-NEXT: successors: %bb.2(0x80000000)
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_]].sub0:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub0, [[GLOBAL_LOAD_DWORDX4_]].sub1, implicit $exec
- ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_]].sub1:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub0, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec
+ ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub0, [[GLOBAL_LOAD_DWORDX4_]].sub1, implicit $exec
+ ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]].sub1:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub0, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec
; GCN-NEXT: S_BRANCH %bb.2
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.2:
- ; GCN-NEXT: dead %3:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORDX4_]].sub1, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec
+ ; GCN-NEXT: dead [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORDX4_]].sub1, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec
; GCN-NEXT: S_ENDPGM 0
bb.0:
successors: %bb.1, %bb.2
@@ -133,7 +133,6 @@ body: |
successors: %bb.2
%2:vgpr_32 = V_AND_B32_e64 %1.sub0, %1.sub1, implicit $exec
- ; %1.sub1 was re-defined
%1.sub1:vreg_128 = V_AND_B32_e64 %2, %2, implicit $exec
S_BRANCH %bb.2
@@ -161,13 +160,13 @@ body: |
; GCN-NEXT: bb.1:
; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_]].sub0:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub1, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec
- ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_]].sub2:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub0, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec
+ ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub1, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec
+ ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]].sub2:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub0, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec
; GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
; GCN-NEXT: S_BRANCH %bb.2
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.2:
- ; GCN-NEXT: dead %3:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORDX4_]].sub1, [[GLOBAL_LOAD_DWORDX4_]].sub2, implicit $exec
+ ; GCN-NEXT: dead [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORDX4_]].sub1, [[GLOBAL_LOAD_DWORDX4_]].sub2, implicit $exec
; GCN-NEXT: S_ENDPGM 0
bb.0:
successors: %bb.1, %bb.2
diff --git a/llvm/test/CodeGen/AMDGPU/remat-dead-subreg.mir b/llvm/test/CodeGen/AMDGPU/remat-dead-subreg.mir
index 84649f956369..c7fa879187cc 100644
--- a/llvm/test/CodeGen/AMDGPU/remat-dead-subreg.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat-dead-subreg.mir
@@ -16,9 +16,9 @@ body: |
; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: renamable $sgpr2 = S_MOV_B32 2, implicit $m0
; GCN-NEXT: renamable $sgpr1 = S_MOV_B32 3, implicit $m0
- ; GCN-NEXT: dead %4:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr2, implicit killed $sgpr1
+ ; GCN-NEXT: dead [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr2, implicit killed $sgpr1
; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5)
- ; GCN-NEXT: dead %5:vgpr_32 = V_MOV_B32_e32 killed $sgpr1, implicit $exec
+ ; GCN-NEXT: dead [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 killed $sgpr1, implicit $exec
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0
$m0 = IMPLICIT_DEF
%0:sreg_64_xexec = S_MOV_B64 1, implicit $m0
@@ -41,9 +41,9 @@ body: |
; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr2, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: renamable $sgpr2 = S_MOV_B32 3, implicit $m0
; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5)
- ; GCN-NEXT: dead %4:vgpr_32 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit killed $sgpr0, implicit killed $sgpr2
+ ; GCN-NEXT: dead [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit killed $sgpr0, implicit killed $sgpr2
; GCN-NEXT: renamable $sgpr0 = S_MUL_I32 renamable $sgpr5, 3
- ; GCN-NEXT: dead %5:vgpr_32 = V_MOV_B32_e32 killed $sgpr0, implicit $exec
+ ; GCN-NEXT: dead [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 killed $sgpr0, implicit $exec
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr5
$m0 = IMPLICIT_DEF
%0:sreg_64_xexec = S_MOV_B64 1, implicit $m0
@@ -66,9 +66,9 @@ body: |
; GCN-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sp_reg :: (store (s64) into %stack.0, align 4, addrspace 5)
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_MOV_B64 3, implicit $m0
; GCN-NEXT: renamable $sgpr0_sgpr1 = S_MOV_B64 2, implicit $m0
- ; GCN-NEXT: dead %5:vgpr_32 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit killed $sgpr0_sgpr1, implicit killed $sgpr4_sgpr5
+ ; GCN-NEXT: dead [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit killed $sgpr0_sgpr1, implicit killed $sgpr4_sgpr5
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.0, align 4, addrspace 5)
- ; GCN-NEXT: dead %6:vreg_64 = V_MOV_B64_PSEUDO killed $sgpr4_sgpr5, implicit $exec
+ ; GCN-NEXT: dead [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO killed $sgpr4_sgpr5, implicit $exec
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr8, implicit renamable $sgpr11
%0:sreg_64 = IMPLICIT_DEF
%1:sgpr_128 = S_LOAD_DWORDX4_IMM %0, 1, 0
@@ -91,9 +91,9 @@ body: |
; GCN-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr2_sgpr3, %stack.0, implicit $exec, implicit $sp_reg :: (store (s64) into %stack.0, align 4, addrspace 5)
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_MOV_B64 2, implicit $m0
; GCN-NEXT: renamable $sgpr2_sgpr3 = S_MOV_B64 3, implicit $m0
- ; GCN-NEXT: dead %4:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr4_sgpr5, implicit killed $sgpr2_sgpr3
+ ; GCN-NEXT: dead [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr4_sgpr5, implicit killed $sgpr2_sgpr3
; GCN-NEXT: renamable $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.0, align 4, addrspace 5)
- ; GCN-NEXT: dead %5:vreg_64 = V_MOV_B64_PSEUDO killed $sgpr2_sgpr3, implicit $exec
+ ; GCN-NEXT: dead [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO killed $sgpr2_sgpr3, implicit $exec
; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0
$m0 = IMPLICIT_DEF
%0:sreg_64_xexec = S_MOV_B64 1, implicit $m0
diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir
index f087ae458454..3f88f98e3437 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir
@@ -21,28 +21,28 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $sgpr6_sgpr7
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub3:vreg_512 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 0, [[V_MOV_B32_e32_]], implicit $exec
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_512 = COPY %0
+ ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub3:vreg_512 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 0, [[V_MOV_B32_e32_1]], implicit $exec
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_512 = COPY [[V_MOV_B32_e32_]]
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32), align 8, addrspace 5)
+ ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]].sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32), align 8, addrspace 5)
; CHECK-NEXT: dead [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %7:vgpr_32, 0, 0, implicit $exec
- ; CHECK-NEXT: dead [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[V_MOV_B32_e32_]], 0, 0, implicit $exec
+ ; CHECK-NEXT: dead [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec
; CHECK-NEXT: dead [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[V_ADD_U32_e32_]], 0, 0, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
- ; CHECK-NEXT: undef %11.sub1:vreg_512 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_512 = COPY [[COPY]].sub1
; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def dead [[COPY1]], 851978 /* regdef:VGPR_16 */, def dead [[COPY]].sub1, 2147483657 /* reguse tiedto:$0 */, [[COPY1]], 2147549193 /* reguse tiedto:$1 */, [[COPY]].sub1
- ; CHECK-NEXT: %11.sub0:vreg_512 = COPY [[COPY]].sub0
- ; CHECK-NEXT: %11.sub3:vreg_512 = COPY [[COPY]].sub3
- ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: dead [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 4, [[V_MOV_B32_e32_1]], implicit-def dead $vcc, implicit $exec
- ; CHECK-NEXT: %11.sub2:vreg_512 = COPY undef [[V_MOV_B32_e32_]]
- ; CHECK-NEXT: %11.sub5:vreg_512 = COPY undef [[V_MOV_B32_e32_]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_512 = COPY %11
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub0:vreg_512 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub3:vreg_512 = COPY [[COPY]].sub3
+ ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: dead [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 4, [[V_MOV_B32_e32_2]], implicit-def dead $vcc, implicit $exec
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:vreg_512 = COPY undef [[V_MOV_B32_e32_1]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub5:vreg_512 = COPY undef [[V_MOV_B32_e32_1]]
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_512 = COPY [[COPY2]]
; CHECK-NEXT: S_BRANCH %bb.1
bb.0:
liveins: $sgpr6_sgpr7
diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir
index e4f56cc328e4..add7825a224e 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir
@@ -27,33 +27,33 @@ body: |
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_]]
- ; CHECK-NEXT: undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $mode, implicit $exec
- ; CHECK-NEXT: dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: undef [[V_ADD_F32_e32_:%[0-9]+]].sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead undef [[V_ADD_F32_e32_:%[0-9]+]].sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $mode, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, implicit $exec
- ; CHECK-NEXT: undef %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec
+ ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
- ; CHECK-NEXT: %4.sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec
- ; CHECK-NEXT: undef %19.sub1:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec
- ; CHECK-NEXT: %19.sub0:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD1]], [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $mode, implicit $exec
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 %19, %4, 32, 0, implicit $exec
- ; CHECK-NEXT: undef %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF1]], 0, 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec
+ ; CHECK-NEXT: undef [[V_ADD_F32_e32_1:%[0-9]+]].sub1:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]].sub0:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD1]], [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[V_ADD_F32_e32_1]], [[V_MOV_B32_e32_]], 32, 0, implicit $exec
+ ; CHECK-NEXT: undef [[GLOBAL_LOAD_DWORD2:%[0-9]+]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF1]], 0, 0, implicit $exec
; CHECK-NEXT: [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF2]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, implicit $exec
- ; CHECK-NEXT: %11.sub1:vreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD2:%[0-9]+]].sub1:vreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
- ; CHECK-NEXT: dead %20:vgpr_32 = GLOBAL_LOAD_DWORD %11, 0, 0, implicit $exec
- ; CHECK-NEXT: dead %21:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF4]], 0, 0, implicit $exec
- ; CHECK-NEXT: dead %22:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF5]], 0, 0, implicit $exec
+ ; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORD3:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[GLOBAL_LOAD_DWORD2]], 0, 0, implicit $exec
+ ; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORD4:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF4]], 0, 0, implicit $exec
+ ; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORD5:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF5]], 0, 0, implicit $exec
; CHECK-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64_e64 2, [[DEF2]], implicit $exec
; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: S_NOP 0, implicit [[DEF7]], implicit [[V_LSHLREV_B64_e64_]].sub0, implicit [[DEF6]], implicit [[V_MOV_B32_e32_]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORD [[DEF5]], [[V_MOV_B32_e32_1]], 0, 0, implicit $exec
+ ; CHECK-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64_e64 2, [[DEF2]], implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: S_NOP 0, implicit [[DEF7]], implicit [[V_LSHLREV_B64_e64_]].sub0, implicit [[DEF6]], implicit [[V_MOV_B32_e32_1]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD [[DEF5]], [[V_MOV_B32_e32_2]], 0, 0, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
diff --git a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
index 4c32af8b0100..6d79837feb12 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir
@@ -29,9 +29,9 @@ body: |
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %2:vgpr_32, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr101, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sgpr_64 = S_LOAD_DWORDX2_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 5329
- ; CHECK-NEXT: undef %5.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4, implicit $exec
+ ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4, implicit $exec
; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
@@ -41,42 +41,42 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def dead %11
; CHECK-NEXT: GLOBAL_STORE_DWORD undef %12:vreg_64, [[BUFFER_LOAD_DWORD_OFFEN]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
- ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: %5.sub1:vreg_64 = COPY [[V_MOV_B32_e32_]]
+ ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_1]]
; CHECK-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 3)
; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def %15, 851978 /* regdef:VGPR_16 */, def %16
- ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_]], 0, 0, implicit $exec
- ; CHECK-NEXT: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec
+ ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec
+ ; CHECK-NEXT: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_2]], 0, 0, implicit $exec
; CHECK-NEXT: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec
; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def %21, 851978 /* regdef:VGPR_16 */, def %22
- ; CHECK-NEXT: [[DS_READ_B32_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec
- ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def dead [[V_MOV_B32_e32_2]], 851978 /* regdef:VGPR_16 */, def dead [[V_MOV_B32_e32_3]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_2]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_3]](tied-def 5), 851977 /* reguse:VGPR_16 */, %15, 851977 /* reguse:VGPR_16 */, %16, 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_1]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_3]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_2]]
+ ; CHECK-NEXT: [[DS_READ_B32_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_2]], 0, 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def dead [[V_MOV_B32_e32_3]], 851978 /* regdef:VGPR_16 */, def dead [[V_MOV_B32_e32_4]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_3]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_4]](tied-def 5), 851977 /* reguse:VGPR_16 */, %15, 851977 /* reguse:VGPR_16 */, %16, 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_1]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_3]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_2]]
; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store (s32), addrspace 3)
; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store (s32), addrspace 3)
- ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %30:vgpr_32, %5, 0, 0, implicit $exec :: (store (s64), addrspace 3)
- ; CHECK-NEXT: undef %31.sub1:vreg_64 = FLAT_LOAD_DWORD undef %32:vreg_64, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+ ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %30:vgpr_32, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3)
+ ; CHECK-NEXT: undef [[FLAT_LOAD_DWORD:%[0-9]+]].sub1:vreg_64 = FLAT_LOAD_DWORD undef %32:vreg_64, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 1, [[DEF2]], implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
- ; CHECK-NEXT: [[DEF]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_]]
+ ; CHECK-NEXT: [[DEF:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_1]]
; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD_U32_e32_]], [[S_MOV_B32_]], implicit $exec
; CHECK-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_GT_U32_e64 64, [[V_ADD_U32_e32_]], implicit $exec
- ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_ADD_U32_e32_]], [[V_CMP_GT_U32_e64_]], implicit $exec
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_ADD_U32_e32_]], [[V_CMP_GT_U32_e64_]], implicit $exec
; CHECK-NEXT: [[V_SUB_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 [[V_MUL_LO_U32_e64_]], [[DEF1]], implicit $exec
- ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_CNDMASK_B32_e64_]], [[S_MOV_B32_]], implicit $exec
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[DEF2]], [[S_MOV_B32_]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_SUB_U32_e32_]], [[DEF]].sub0, implicit $exec
- ; CHECK-NEXT: [[V_SUB_U32_e32_1:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 [[V_MUL_LO_U32_e64_1]], [[V_MUL_LO_U32_e64_]], implicit $exec
- ; CHECK-NEXT: [[DEF]].sub0:vreg_64 = V_ADD_U32_e32 [[V_SUB_U32_e32_1]], [[V_ADD_U32_e32_1]], implicit $exec
- ; CHECK-NEXT: undef %38.sub0:vreg_64, %39:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[DEF]].sub0, 0, implicit $exec
- ; CHECK-NEXT: undef %40.sub1:vreg_64, dead %41:sreg_64_xexec = V_ADDC_U32_e64 [[COPY1]], [[DEF]].sub1, %39, 0, implicit $exec
- ; CHECK-NEXT: undef %42.sub0:sgpr_64 = V_READFIRSTLANE_B32 %38.sub0, implicit $exec
- ; CHECK-NEXT: %42.sub1:sgpr_64 = V_READFIRSTLANE_B32 %40.sub1, implicit $exec
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %42, 0, 0 :: (load (s32), addrspace 1)
+ ; CHECK-NEXT: [[V_SUB_U32_e32_1:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 [[DEF1]], [[V_MUL_LO_U32_e64_]], implicit $exec
+ ; CHECK-NEXT: [[DEF:%[0-9]+]].sub0:vreg_64 = V_ADD_U32_e32 [[V_SUB_U32_e32_1]], [[V_ADD_U32_e32_1]], implicit $exec
+ ; CHECK-NEXT: undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub0:vreg_64, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[DEF]].sub0, 0, implicit $exec
+ ; CHECK-NEXT: undef [[V_ADDC_U32_e64_:%[0-9]+]].sub1:vreg_64, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY1]], [[DEF]].sub1, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+ ; CHECK-NEXT: undef [[V_READFIRSTLANE_B32_:%[0-9]+]].sub0:sgpr_64 = V_READFIRSTLANE_B32 [[V_ADD_CO_U32_e64_]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub1:sgpr_64 = V_READFIRSTLANE_B32 [[V_ADDC_U32_e64_]].sub1, implicit $exec
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[V_READFIRSTLANE_B32_]], 0, 0 :: (load (s32), addrspace 1)
; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; CHECK-NEXT: [[DS_READ_B32_gfx9_4:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %45:vgpr_32, 0, 0, implicit $exec :: (load (s32), addrspace 3)
; CHECK-NEXT: GLOBAL_STORE_DWORD undef %46:vreg_64, [[DS_READ_B32_gfx9_4]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
- ; CHECK-NEXT: %31.sub0:vreg_64 = COPY [[S_LOAD_DWORD_IMM]], implicit $exec
- ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %47:vgpr_32, %31, 0, 0, implicit $exec :: (store (s64), addrspace 3)
+ ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]].sub0:vreg_64 = COPY [[S_LOAD_DWORD_IMM]], implicit $exec
+ ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %47:vgpr_32, [[FLAT_LOAD_DWORD]], 0, 0, implicit $exec :: (store (s64), addrspace 3)
; CHECK-NEXT: S_BRANCH %bb.1
bb.0:
liveins: $sgpr4_sgpr5
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir
index 937b43f45558..0b1fd441256d 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir
+++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir
@@ -63,7 +63,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32))
- ; CHECK-NEXT: dead %3:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, implicit $exec :: (load (s32))
+ ; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, implicit $exec :: (load (s32))
; CHECK-NEXT: S_DENORM_MODE 0, implicit-def $mode, implicit $mode
; CHECK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 0, [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORD]], [[V_ADD_F32_e32_]], implicit $exec
@@ -89,7 +89,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32))
- ; CHECK-NEXT: dead %3:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, implicit $exec :: (load (s32))
+ ; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, implicit $exec :: (load (s32))
; CHECK-NEXT: S_ROUND_MODE 0, implicit-def $mode, implicit $mode
; CHECK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 0, [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORD]], [[V_ADD_F32_e32_]], implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir
index 4eebd8f2e574..9429d1565962 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir
+++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir
@@ -13,31 +13,31 @@ body: |
; CHECK-LABEL: name: test
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub3:vreg_128 = COPY $vgpr9
- ; CHECK-NEXT: undef %1.sub2:vreg_128 = COPY $vgpr8
- ; CHECK-NEXT: undef %2.sub1:vreg_128 = COPY $vgpr7
- ; CHECK-NEXT: undef %3.sub0:vreg_128 = COPY $vgpr6
- ; CHECK-NEXT: undef %4.sub3:vreg_128 = COPY $vgpr5
- ; CHECK-NEXT: undef %5.sub2:vreg_128 = COPY $vgpr4
- ; CHECK-NEXT: undef %8.sub1:vreg_64 = COPY $vgpr1
- ; CHECK-NEXT: %8.sub0:vreg_64 = COPY $vgpr0
- ; CHECK-NEXT: undef %6.sub1:vreg_128 = COPY $vgpr3
- ; CHECK-NEXT: undef %7.sub0:vreg_128 = COPY $vgpr2
- ; CHECK-NEXT: undef %9.sub0:sgpr_128 = V_READFIRSTLANE_B32 %7.sub0, implicit $exec
- ; CHECK-NEXT: %9.sub1:sgpr_128 = V_READFIRSTLANE_B32 %6.sub1, implicit $exec
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub3:vreg_128 = COPY $vgpr9
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub2:vreg_128 = COPY $vgpr8
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_128 = COPY $vgpr7
+ ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_128 = COPY $vgpr6
+ ; CHECK-NEXT: undef [[COPY4:%[0-9]+]].sub3:vreg_128 = COPY $vgpr5
+ ; CHECK-NEXT: undef [[COPY5:%[0-9]+]].sub2:vreg_128 = COPY $vgpr4
+ ; CHECK-NEXT: undef [[COPY6:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY7:%[0-9]+]].sub1:vreg_128 = COPY $vgpr3
+ ; CHECK-NEXT: undef [[COPY8:%[0-9]+]].sub0:vreg_128 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[V_READFIRSTLANE_B32_:%[0-9]+]].sub0:sgpr_128 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub1:sgpr_128 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec
; CHECK-NEXT: S_BARRIER
- ; CHECK-NEXT: %9.sub2:sgpr_128 = V_READFIRSTLANE_B32 %5.sub2, implicit $exec
- ; CHECK-NEXT: %9.sub3:sgpr_128 = V_READFIRSTLANE_B32 %4.sub3, implicit $exec
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %9, 0, 0, 0, 0, implicit $exec
- ; CHECK-NEXT: undef %12.sub0:sgpr_128 = V_READFIRSTLANE_B32 %3.sub0, implicit $exec
- ; CHECK-NEXT: %12.sub1:sgpr_128 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec
- ; CHECK-NEXT: %12.sub2:sgpr_128 = V_READFIRSTLANE_B32 %1.sub2, implicit $exec
- ; CHECK-NEXT: %12.sub3:sgpr_128 = V_READFIRSTLANE_B32 %0.sub3, implicit $exec
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %12, 0, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub2:sgpr_128 = V_READFIRSTLANE_B32 [[COPY5]].sub2, implicit $exec
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub3:sgpr_128 = V_READFIRSTLANE_B32 [[COPY4]].sub3, implicit $exec
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: undef [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub0:sgpr_128 = V_READFIRSTLANE_B32 [[COPY3]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub1:sgpr_128 = V_READFIRSTLANE_B32 [[COPY2]].sub1, implicit $exec
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub2:sgpr_128 = V_READFIRSTLANE_B32 [[COPY1]].sub2, implicit $exec
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub3:sgpr_128 = V_READFIRSTLANE_B32 [[COPY]].sub3, implicit $exec
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[V_READFIRSTLANE_B32_1]], 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[BUFFER_LOAD_DWORD_OFFSET]], [[BUFFER_LOAD_DWORD_OFFSET]], implicit $exec
; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[BUFFER_LOAD_DWORD_OFFSET1]], [[BUFFER_LOAD_DWORD_OFFSET1]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MUL_LO_U32_e64_]], [[V_MUL_LO_U32_e64_1]], implicit $exec
- ; CHECK-NEXT: GLOBAL_STORE_DWORD %8, [[V_ADD_U32_e32_]], 0, 0, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD [[COPY6]], [[V_ADD_U32_e32_]], 0, 0, implicit $exec
; CHECK-NEXT: S_ENDPGM 0
undef %43.sub3:vreg_128 = COPY $vgpr9
undef %42.sub2:vreg_128 = COPY $vgpr8
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
index f93456ccacb8..4c61e6803feb 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
@@ -160,3 +160,60 @@ body: |
S_ENDPGM 0
...
+---
+name: add_f16_u32_preserve_different_bb
+tracksRegLiveness: true
+body: |
+ ; SDWA-LABEL: name: add_f16_u32_preserve_different_bb
+ ; SDWA: bb.0:
+ ; SDWA-NEXT: successors: %bb.1(0x80000000)
+ ; SDWA-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31
+ ; SDWA-NEXT: {{ $}}
+ ; SDWA-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
+ ; SDWA-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+ ; SDWA-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+ ; SDWA-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+ ; SDWA-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+ ; SDWA-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 65535, [[FLAT_LOAD_DWORD]], implicit $exec
+ ; SDWA-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[FLAT_LOAD_DWORD1]], implicit $exec
+ ; SDWA-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[FLAT_LOAD_DWORD]], 8, 8, implicit $exec
+ ; SDWA-NEXT: [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 24, [[FLAT_LOAD_DWORD1]], implicit $exec
+ ; SDWA-NEXT: {{ $}}
+ ; SDWA-NEXT: bb.1:
+ ; SDWA-NEXT: successors: %bb.2(0x80000000)
+ ; SDWA-NEXT: {{ $}}
+ ; SDWA-NEXT: [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 5, 0, 1, 3, implicit $mode, implicit $exec
+ ; SDWA-NEXT: {{ $}}
+ ; SDWA-NEXT: bb.2:
+ ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 2, 4, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F32_sdwa]](tied-def 0)
+ ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_ADD_F16_sdwa]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+ ; SDWA-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
+ ; SDWA-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31
+
+ %2:sreg_64 = COPY $sgpr30_sgpr31
+ %1:vreg_64 = COPY $vgpr2_vgpr3
+ %0:vreg_64 = COPY $vgpr0_vgpr1
+ %3:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+ %4:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+
+ %5:vgpr_32 = V_AND_B32_e32 65535, %3, implicit $exec
+ %6:vgpr_32 = V_LSHRREV_B32_e64 16, %4, implicit $exec
+ %7:vgpr_32 = V_BFE_U32_e64 %3, 8, 8, implicit $exec
+ %8:vgpr_32 = V_LSHRREV_B32_e32 24, %4, implicit $exec
+
+ %9:vgpr_32 = V_ADD_F16_e64 0, %5, 0, %6, 0, 0, implicit $mode, implicit $exec
+ %10:vgpr_32 = V_LSHLREV_B16_e64 8, %9, implicit $exec
+
+ bb.1:
+ %11:vgpr_32 = V_MUL_F32_e64 0, %7, 0, %8, 0, 0, implicit $mode, implicit $exec
+ %12:vgpr_32 = V_LSHLREV_B32_e64 16, %11, implicit $exec
+
+ bb.2:
+ %13:vgpr_32 = V_OR_B32_e64 %10, %12, implicit $exec
+
+ FLAT_STORE_DWORD %0, %13, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+ $sgpr30_sgpr31 = COPY %2
+ S_SETPC_B64_return $sgpr30_sgpr31
+...
diff --git a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir
index b5beabd287c2..796a70cfe8a3 100644
--- a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir
+++ b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir
@@ -8,9 +8,9 @@ body: |
; GCN-LABEL: name: simple
; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
$vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
@@ -26,10 +26,10 @@ body: |
; GCN-LABEL: name: salu_in_between
; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: $sgpr0 = S_MOV_B32 $sgpr2
- ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr2
+ ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
$vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
@@ -46,12 +46,12 @@ body: |
; GCN-LABEL: name: valu_write_in_between
; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
- ; GCN: $vgpr20 = V_MOV_B32_indirect_read 1, implicit $exec, implicit $m0
- ; GCN: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: $vgpr20 = V_MOV_B32_indirect_read 1, implicit $exec, implicit $m0
+ ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
+ ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
$vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
@@ -68,12 +68,12 @@ body: |
; GCN-LABEL: name: valu_read_in_between
; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
- ; GCN: V_NOP_e32 implicit $exec, implicit $vgpr0
- ; GCN: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: V_NOP_e32 implicit $exec, implicit $vgpr0
+ ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
+ ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
$vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
@@ -90,12 +90,12 @@ body: |
; GCN-LABEL: name: changed_index
; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
- ; GCN: $sgpr2 = S_MOV_B32 1
- ; GCN: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: $sgpr2 = S_MOV_B32 1
+ ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
+ ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
$vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
@@ -112,12 +112,12 @@ body: |
; GCN-LABEL: name: implicitly_changed_index
; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
- ; GCN: S_NOP 0, implicit-def $sgpr2
- ; GCN: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: S_NOP 0, implicit-def $sgpr2
+ ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
+ ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
$vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
@@ -134,12 +134,12 @@ body: |
; GCN-LABEL: name: changed_m0
; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
- ; GCN: $m0 = S_MOV_B32 1
- ; GCN: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: $m0 = S_MOV_B32 1
+ ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
+ ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
$vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
@@ -156,12 +156,12 @@ body: |
; GCN-LABEL: name: implicitly_changed_m0
; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
- ; GCN: S_NOP 0, implicit-def $m0
- ; GCN: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: S_NOP 0, implicit-def $m0
+ ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
+ ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
$vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
@@ -178,9 +178,9 @@ body: |
; GCN-LABEL: name: same_imm_index
; GCN: S_SET_GPR_IDX_ON 1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
S_SET_GPR_IDX_ON 1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
$vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
@@ -196,11 +196,11 @@ body: |
; GCN-LABEL: name: different_imm_index
; GCN: S_SET_GPR_IDX_ON 1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
- ; GCN: S_SET_GPR_IDX_ON 2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: S_SET_GPR_IDX_ON 2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
+ ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
S_SET_GPR_IDX_ON 1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
$vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
@@ -216,11 +216,11 @@ body: |
; GCN-LABEL: name: different_gpr_index
; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
- ; GCN: S_SET_GPR_IDX_ON killed $sgpr1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
+ ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
$vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
@@ -236,14 +236,14 @@ body: |
; GCN-LABEL: name: different_gpr_index_then_same_index
; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
- ; GCN: S_SET_GPR_IDX_ON $sgpr1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
- ; GCN: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: S_SET_GPR_IDX_ON $sgpr1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
+ ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
$vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
@@ -262,11 +262,11 @@ body: |
; GCN-LABEL: name: use_m0_with_idx_off
; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
+ ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
$vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
@@ -282,10 +282,10 @@ body: |
; GCN-LABEL: name: three_in_a_row
; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: $vgpr17 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: $vgpr18 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: $vgpr17 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: $vgpr18 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
$vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
@@ -304,12 +304,12 @@ body: |
; GCN-LABEL: name: different_gpr_index_then_two_same_indexes
; GCN: S_SET_GPR_IDX_ON $sgpr1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
- ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
S_SET_GPR_IDX_ON $sgpr1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
$vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
@@ -328,12 +328,12 @@ body: |
; GCN-LABEL: name: two_same_indexes_then_different
; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
- ; GCN: S_SET_GPR_IDX_ON killed $sgpr1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
+ ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
$vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
@@ -352,10 +352,10 @@ body: |
; GCN-LABEL: name: indirect_mov
; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: V_MOV_B32_indirect_write undef $vgpr0, undef $vgpr3, implicit $exec, implicit $m0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3(tied-def 4)
- ; GCN: V_MOV_B32_indirect_write undef $vgpr0, undef $vgpr3, implicit $exec, implicit $m0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3(tied-def 4)
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: V_MOV_B32_indirect_write undef $vgpr0, undef $vgpr3, implicit $exec, implicit $m0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3(tied-def 4)
+ ; GCN-NEXT: V_MOV_B32_indirect_write undef $vgpr0, undef $vgpr3, implicit $exec, implicit $m0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3(tied-def 4)
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
$vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
V_MOV_B32_indirect_write undef $vgpr0, undef $vgpr3, implicit $exec, implicit $m0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3(tied-def 4)
@@ -371,13 +371,13 @@ body: |
bb.0:
; GCN-LABEL: name: simple_bundle
; GCN: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 {
- ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: }
- ; GCN: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 {
- ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
- ; GCN: }
+ ; GCN-NEXT: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: }
+ ; GCN-NEXT: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 {
+ ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: }
BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 {
S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
$vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
@@ -396,14 +396,14 @@ body: |
bb.0:
; GCN-LABEL: name: salu_in_between_bundle
; GCN: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 {
- ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: }
- ; GCN: $sgpr0 = S_MOV_B32 $sgpr2
- ; GCN: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 {
- ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
- ; GCN: }
+ ; GCN-NEXT: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: }
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr2
+ ; GCN-NEXT: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 {
+ ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: }
BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 {
S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
$vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
@@ -423,16 +423,16 @@ body: |
bb.0:
; GCN-LABEL: name: valu_in_between_bundle
; GCN: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 {
- ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
- ; GCN: }
- ; GCN: $vgpr20 = V_MOV_B32_indirect_read 1, implicit $exec, implicit $m0
- ; GCN: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 {
- ; GCN: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
- ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
- ; GCN: }
+ ; GCN-NEXT: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: }
+ ; GCN-NEXT: $vgpr20 = V_MOV_B32_indirect_read 1, implicit $exec, implicit $m0
+ ; GCN-NEXT: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 {
+ ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
+ ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: }
BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 {
S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
$vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
@@ -452,16 +452,16 @@ body: |
bb.0:
; GCN-LABEL: name: changed_index_bundle
; GCN: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 {
- ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
- ; GCN: }
- ; GCN: $sgpr2 = S_MOV_B32 1
- ; GCN: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 {
- ; GCN: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
- ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
- ; GCN: }
+ ; GCN-NEXT: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: }
+ ; GCN-NEXT: $sgpr2 = S_MOV_B32 1
+ ; GCN-NEXT: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 {
+ ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
+ ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: }
BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 {
S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
$vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
@@ -480,13 +480,15 @@ name: simple_cbranch_vccz
body: |
; GCN-LABEL: name: simple_cbranch_vccz
; GCN: bb.0:
- ; GCN: successors: %bb.1(0x80000000)
- ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
- ; GCN: S_CBRANCH_VCCZ %bb.1, implicit $vcc
- ; GCN: bb.1:
+ ; GCN-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
bb.0:
S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
$vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
@@ -503,12 +505,14 @@ name: simple_cbranch_execz
body: |
; GCN-LABEL: name: simple_cbranch_execz
; GCN: bb.0:
- ; GCN: successors:
- ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
- ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
- ; GCN: bb.1:
+ ; GCN-NEXT: successors:
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
+ ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
bb.0:
S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0
$vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-instructions-flags.mir b/llvm/test/CodeGen/AMDGPU/shrink-instructions-flags.mir
index 07a6b4d3ea0f..37d7a758d6fd 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-instructions-flags.mir
+++ b/llvm/test/CodeGen/AMDGPU/shrink-instructions-flags.mir
@@ -15,7 +15,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: %2:vgpr_32 = nnan nofpexcept V_ADD_F32_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = nnan nofpexcept V_ADD_F32_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec
; CHECK-NEXT: S_NOP 0
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir
index f4c13b51157d..ed2148ab5a19 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir
+++ b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir
@@ -11,6 +11,7 @@ body: |
; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
; GFX10-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec
; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
; GFX11-LABEL: name: mad_cvv_f32
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -31,6 +32,7 @@ body: |
; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
; GFX10-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec
; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
; GFX11-LABEL: name: mad_vcv_f32
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -51,6 +53,7 @@ body: |
; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
; GFX10-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec
; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
; GFX11-LABEL: name: mad_vvc_f32
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -71,6 +74,7 @@ body: |
; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF
; GFX10-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec
; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
; GFX11-LABEL: name: mad_vsc_f32
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF
@@ -91,6 +95,7 @@ body: |
; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
; GFX10-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec
; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
; GFX11-LABEL: name: fma_cvv_f32
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -111,6 +116,7 @@ body: |
; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
; GFX10-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec
; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
; GFX11-LABEL: name: fma_vcv_f32
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -131,6 +137,7 @@ body: |
; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
; GFX10-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec
; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
; GFX11-LABEL: name: fma_vvc_f32
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -151,6 +158,7 @@ body: |
; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF
; GFX10-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec
; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
; GFX11-LABEL: name: fma_vsc_f32
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF
@@ -171,6 +179,7 @@ body: |
; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
; GFX10-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec
; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
; GFX11-LABEL: name: mad_cvv_f16
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -191,6 +200,7 @@ body: |
; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
; GFX10-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec
; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
; GFX11-LABEL: name: mad_vcv_f16
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -211,6 +221,7 @@ body: |
; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
; GFX10-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec
; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
; GFX11-LABEL: name: mad_vvc_f16
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -231,6 +242,7 @@ body: |
; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF
; GFX10-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec
; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
; GFX11-LABEL: name: mad_vsc_f16
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF
@@ -251,6 +263,7 @@ body: |
; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
; GFX10-NEXT: $vgpr2 = V_FMAMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec
; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
; GFX11-LABEL: name: fma_cvv_f16
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -271,6 +284,7 @@ body: |
; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
; GFX10-NEXT: $vgpr2 = V_FMAMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec
; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
; GFX11-LABEL: name: fma_vcv_f16
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -291,6 +305,7 @@ body: |
; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
; GFX10-NEXT: $vgpr2 = V_FMAAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec
; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
; GFX11-LABEL: name: fma_vvc_f16
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -311,6 +326,7 @@ body: |
; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF
; GFX10-NEXT: $vgpr2 = V_FMAAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec
; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
; GFX11-LABEL: name: fma_vsc_f16
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
index 90924b3345fa..a54c0accce78 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
@@ -16,6 +16,7 @@ body: |
; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
; GFX908-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]], 0, 0, implicit $exec
+ ;
; GFX90A-LABEL: name: aligned_vgpr_64
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
@@ -40,6 +41,7 @@ body: |
; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; GFX908-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]], 0, 0, implicit $exec
+ ;
; GFX90A-LABEL: name: unaligned_vgpr_64
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
@@ -65,6 +67,7 @@ body: |
; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_96_align2 = IMPLICIT_DEF
; GFX908-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]].sub0_sub1, 0, 0, implicit $exec
+ ;
; GFX90A-LABEL: name: aligned_vgpr_96_sub0_subg1
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
@@ -89,6 +92,7 @@ body: |
; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_96_align2 = IMPLICIT_DEF
; GFX908-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]].sub1_sub2, 0, 0, implicit $exec
+ ;
; GFX90A-LABEL: name: unaligned_vgpr_96_sub1_sub2
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
@@ -114,6 +118,7 @@ body: |
; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_96_align2 = IMPLICIT_DEF
; GFX908-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[DEF]], 0, 0, implicit $exec
+ ;
; GFX90A-LABEL: name: aligned_vgpr_96
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
@@ -138,6 +143,7 @@ body: |
; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_96 = IMPLICIT_DEF
; GFX908-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[DEF]], 0, 0, implicit $exec
+ ;
; GFX90A-LABEL: name: unaligned_vgpr_96
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
@@ -163,6 +169,7 @@ body: |
; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
; GFX908-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]].sub0_sub1, 0, 0, implicit $exec
+ ;
; GFX90A-LABEL: name: aligned_vgpr_128_sub0_sub1
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
@@ -187,6 +194,7 @@ body: |
; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
; GFX908-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]].sub2_sub3, 0, 0, implicit $exec
+ ;
; GFX90A-LABEL: name: aligned_vgpr_128_sub2_sub3
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
@@ -211,6 +219,7 @@ body: |
; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
; GFX908-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]].sub1_sub2, 0, 0, implicit $exec
+ ;
; GFX90A-LABEL: name: unaligned_vgpr_128_sub1_sub2
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
@@ -236,6 +245,7 @@ body: |
; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
; GFX908-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[DEF]].sub0_sub1_sub2, 0, 0, implicit $exec
+ ;
; GFX90A-LABEL: name: aligned_vgpr_128_sub0_sub1_sub2
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
@@ -260,6 +270,7 @@ body: |
; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
; GFX908-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[DEF]].sub1_sub2_sub3, 0, 0, implicit $exec
+ ;
; GFX90A-LABEL: name: unaligned_vgpr_128_sub1_sub2_sub3
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
@@ -285,6 +296,7 @@ body: |
; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
; GFX908-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[DEF]], 0, 0, implicit $exec
+ ;
; GFX90A-LABEL: name: aligned_vgpr_128
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
@@ -309,6 +321,7 @@ body: |
; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
; GFX908-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[DEF]], 0, 0, implicit $exec
+ ;
; GFX90A-LABEL: name: unaligned_vgpr_128
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-reg-sequence.mir b/llvm/test/CodeGen/AMDGPU/si-fold-reg-sequence.mir
new file mode 100644
index 000000000000..7852f5d0c96f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-reg-sequence.mir
@@ -0,0 +1,18 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=si-fold-operands -verify-machineinstrs -o - %s
+
+---
+name: fold_reg_sequence
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+
+ %0:sreg_32 = S_MOV_B32 0
+ %1:sreg_32 = S_MOV_B32 429
+ %2:sreg_64 = REG_SEQUENCE killed %1, %subreg.sub0, %0, %subreg.sub1
+ %3:vgpr_32 = V_MUL_HI_U32_e64 $vgpr2, %2.sub0, implicit $exec
+ %4:vgpr_32 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ %5:vgpr_32 = V_MUL_HI_U32_e64 %4, %2.sub0, implicit $exec
+ S_ENDPGM 0
+
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir
index e342c2b83524..eddad05d976b 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir
@@ -11,7 +11,7 @@ body: |
; GCN: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 16, 0
; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0 = S_AND_B32 [[S_LOAD_DWORD_IMM]], 255, implicit-def $scc
- ; GCN-NEXT: dead %3:sreg_32_xm0 = S_AND_B32 65535, [[S_AND_B32_]], implicit-def $scc
+ ; GCN-NEXT: dead [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0 = S_AND_B32 65535, [[S_AND_B32_]], implicit-def $scc
; GCN-NEXT: S_ENDPGM 0
%0:sgpr_64 = COPY $sgpr4_sgpr5
%1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 16, 0
@@ -30,7 +30,7 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc
- ; GCN-NEXT: dead %0:sreg_64 = S_XOR_B64 [[S_AND_B64_]], [[COPY]], implicit-def dead $scc
+ ; GCN-NEXT: dead [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_AND_B64_]], [[COPY]], implicit-def dead $scc
; GCN-NEXT: $exec = S_MOV_B64_term [[S_AND_B64_]]
; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GCN-NEXT: S_BRANCH %bb.1
@@ -195,7 +195,7 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.3:
; GCN-NEXT: successors: %bb.2(0x80000000)
- ; GCN-NEXT: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr8_sgpr9
+ ; GCN-NEXT: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_SLEEP 3
; GCN-NEXT: S_NOP 0
@@ -368,7 +368,7 @@ body: |
; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec
; GCN-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY2]], implicit $exec
- ; GCN-NEXT: dead %5:sreg_64_xexec = S_MOV_B64 0
+ ; GCN-NEXT: dead [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
; GCN-NEXT: successors: %bb.3(0x80000000)
@@ -383,7 +383,7 @@ body: |
; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc
; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY4]], implicit-def dead $scc
; GCN-NEXT: $exec = S_MOV_B64_term [[S_AND_B64_]]
- ; GCN-NEXT: dead %8:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec
+ ; GCN-NEXT: dead [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec
; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.2:
diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.mir b/llvm/test/CodeGen/AMDGPU/spill-agpr.mir
index 16f7e15f267e..981a853cb097 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.mir
@@ -30,6 +30,7 @@ body: |
; GFX908-SPILLED-NEXT: $agpr0 = SI_SPILL_A32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
; GFX908-SPILLED-NEXT: $agpr1 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0, implicit killed renamable $agpr1
+ ;
; GFX908-EXPANDED-LABEL: name: spill_restore_agpr32
; GFX908-EXPANDED: bb.0:
; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -53,6 +54,7 @@ body: |
; GFX908-EXPANDED-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
; GFX908-EXPANDED-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0, implicit killed renamable $agpr1
+ ;
; GFX90A-SPILLED-LABEL: name: spill_restore_agpr32
; GFX90A-SPILLED: bb.0:
; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000)
@@ -72,6 +74,7 @@ body: |
; GFX90A-SPILLED-NEXT: $agpr0 = SI_SPILL_A32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
; GFX90A-SPILLED-NEXT: $agpr1 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0, implicit killed renamable $agpr1
+ ;
; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr32
; GFX90A-EXPANDED: bb.0:
; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -130,6 +133,7 @@ body: |
; GFX908-SPILLED-NEXT: bb.2:
; GFX908-SPILLED-NEXT: $agpr0_agpr1 = SI_SPILL_A64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1
+ ;
; GFX908-EXPANDED-LABEL: name: spill_restore_agpr64
; GFX908-EXPANDED: bb.0:
; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -152,6 +156,7 @@ body: |
; GFX908-EXPANDED-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1
; GFX908-EXPANDED-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1
; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1
+ ;
; GFX90A-SPILLED-LABEL: name: spill_restore_agpr64
; GFX90A-SPILLED: bb.0:
; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000)
@@ -168,6 +173,7 @@ body: |
; GFX90A-SPILLED-NEXT: bb.2:
; GFX90A-SPILLED-NEXT: $agpr0_agpr1 = SI_SPILL_A64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1
+ ;
; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr64
; GFX90A-EXPANDED: bb.0:
; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -239,6 +245,7 @@ body: |
; GFX908-SPILLED-NEXT: S_NOP 0, implicit undef $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247
; GFX908-SPILLED-NEXT: S_NOP 0, implicit undef $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255
; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0
+ ;
; GFX908-EXPANDED-LABEL: name: spill_restore_agpr32_used_all_vgprs
; GFX908-EXPANDED: bb.0:
; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -272,6 +279,7 @@ body: |
; GFX908-EXPANDED-NEXT: S_NOP 0, implicit undef $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247
; GFX908-EXPANDED-NEXT: S_NOP 0, implicit undef $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255
; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0
+ ;
; GFX90A-SPILLED-LABEL: name: spill_restore_agpr32_used_all_vgprs
; GFX90A-SPILLED: bb.0:
; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000)
@@ -303,6 +311,7 @@ body: |
; GFX90A-SPILLED-NEXT: S_NOP 0, implicit undef $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247
; GFX90A-SPILLED-NEXT: S_NOP 0, implicit undef $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255
; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0
+ ;
; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr32_used_all_vgprs
; GFX90A-EXPANDED: bb.0:
; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -385,6 +394,7 @@ body: |
; GFX908-SPILLED-NEXT: bb.2:
; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2 = SI_SPILL_A96_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s96) from %stack.0, align 4, addrspace 5)
; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2
+ ;
; GFX908-EXPANDED-LABEL: name: spill_restore_agpr96
; GFX908-EXPANDED: bb.0:
; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -409,6 +419,7 @@ body: |
; GFX908-EXPANDED-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; GFX908-EXPANDED-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2
; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2
+ ;
; GFX90A-SPILLED-LABEL: name: spill_restore_agpr96
; GFX90A-SPILLED: bb.0:
; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000)
@@ -425,6 +436,7 @@ body: |
; GFX90A-SPILLED-NEXT: bb.2:
; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2 = SI_SPILL_A96_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s96) from %stack.0, align 4, addrspace 5)
; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2
+ ;
; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr96
; GFX90A-EXPANDED: bb.0:
; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -483,6 +495,7 @@ body: |
; GFX908-SPILLED-NEXT: bb.2:
; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3
+ ;
; GFX908-EXPANDED-LABEL: name: spill_restore_agpr128
; GFX908-EXPANDED: bb.0:
; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -509,6 +522,7 @@ body: |
; GFX908-EXPANDED-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; GFX908-EXPANDED-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3
; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3
+ ;
; GFX90A-SPILLED-LABEL: name: spill_restore_agpr128
; GFX90A-SPILLED: bb.0:
; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000)
@@ -525,6 +539,7 @@ body: |
; GFX90A-SPILLED-NEXT: bb.2:
; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3
+ ;
; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr128
; GFX90A-EXPANDED: bb.0:
; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -585,6 +600,7 @@ body: |
; GFX908-SPILLED-NEXT: bb.2:
; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4 = SI_SPILL_A160_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s160) from %stack.0, align 4, addrspace 5)
; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4
+ ;
; GFX908-EXPANDED-LABEL: name: spill_restore_agpr160
; GFX908-EXPANDED: bb.0:
; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -613,6 +629,7 @@ body: |
; GFX908-EXPANDED-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; GFX908-EXPANDED-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4
; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4
+ ;
; GFX90A-SPILLED-LABEL: name: spill_restore_agpr160
; GFX90A-SPILLED: bb.0:
; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000)
@@ -629,6 +646,7 @@ body: |
; GFX90A-SPILLED-NEXT: bb.2:
; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4 = SI_SPILL_A160_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s160) from %stack.0, align 4, addrspace 5)
; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4
+ ;
; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr160
; GFX90A-EXPANDED: bb.0:
; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -691,6 +709,7 @@ body: |
; GFX908-SPILLED-NEXT: bb.2:
; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = SI_SPILL_A192_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s192) from %stack.0, align 4, addrspace 5)
; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+ ;
; GFX908-EXPANDED-LABEL: name: spill_restore_agpr192
; GFX908-EXPANDED: bb.0:
; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -721,6 +740,7 @@ body: |
; GFX908-EXPANDED-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; GFX908-EXPANDED-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+ ;
; GFX90A-SPILLED-LABEL: name: spill_restore_agpr192
; GFX90A-SPILLED: bb.0:
; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000)
@@ -737,6 +757,7 @@ body: |
; GFX90A-SPILLED-NEXT: bb.2:
; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = SI_SPILL_A192_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s192) from %stack.0, align 4, addrspace 5)
; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
+ ;
; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr192
; GFX90A-EXPANDED: bb.0:
; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -801,6 +822,7 @@ body: |
; GFX908-SPILLED-NEXT: bb.2:
; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = SI_SPILL_A256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.0, align 4, addrspace 5)
; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+ ;
; GFX908-EXPANDED-LABEL: name: spill_restore_agpr256
; GFX908-EXPANDED: bb.0:
; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -835,6 +857,7 @@ body: |
; GFX908-EXPANDED-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; GFX908-EXPANDED-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+ ;
; GFX90A-SPILLED-LABEL: name: spill_restore_agpr256
; GFX90A-SPILLED: bb.0:
; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000)
@@ -851,6 +874,7 @@ body: |
; GFX90A-SPILLED-NEXT: bb.2:
; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = SI_SPILL_A256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.0, align 4, addrspace 5)
; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+ ;
; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr256
; GFX90A-EXPANDED: bb.0:
; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -919,6 +943,7 @@ body: |
; GFX908-SPILLED-NEXT: bb.2:
; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 = SI_SPILL_A288_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s288) from %stack.0, align 4, addrspace 5)
; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8
+ ;
; GFX908-EXPANDED-LABEL: name: spill_restore_agpr288
; GFX908-EXPANDED: bb.0:
; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -955,6 +980,7 @@ body: |
; GFX908-EXPANDED-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; GFX908-EXPANDED-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8
; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8
+ ;
; GFX90A-SPILLED-LABEL: name: spill_restore_agpr288
; GFX90A-SPILLED: bb.0:
; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000)
@@ -971,6 +997,7 @@ body: |
; GFX90A-SPILLED-NEXT: bb.2:
; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 = SI_SPILL_A288_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s288) from %stack.0, align 4, addrspace 5)
; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8
+ ;
; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr288
; GFX90A-EXPANDED: bb.0:
; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -1041,6 +1068,7 @@ body: |
; GFX908-SPILLED-NEXT: bb.2:
; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = SI_SPILL_A320_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s320) from %stack.0, align 4, addrspace 5)
; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9
+ ;
; GFX908-EXPANDED-LABEL: name: spill_restore_agpr320
; GFX908-EXPANDED: bb.0:
; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -1079,6 +1107,7 @@ body: |
; GFX908-EXPANDED-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; GFX908-EXPANDED-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9
; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9
+ ;
; GFX90A-SPILLED-LABEL: name: spill_restore_agpr320
; GFX90A-SPILLED: bb.0:
; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000)
@@ -1095,6 +1124,7 @@ body: |
; GFX90A-SPILLED-NEXT: bb.2:
; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = SI_SPILL_A320_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s320) from %stack.0, align 4, addrspace 5)
; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9
+ ;
; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr320
; GFX90A-EXPANDED: bb.0:
; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -1167,6 +1197,7 @@ body: |
; GFX908-SPILLED-NEXT: bb.2:
; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 = SI_SPILL_A352_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s352) from %stack.0, align 4, addrspace 5)
; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10
+ ;
; GFX908-EXPANDED-LABEL: name: spill_restore_agpr352
; GFX908-EXPANDED: bb.0:
; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -1207,6 +1238,7 @@ body: |
; GFX908-EXPANDED-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; GFX908-EXPANDED-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10
; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10
+ ;
; GFX90A-SPILLED-LABEL: name: spill_restore_agpr352
; GFX90A-SPILLED: bb.0:
; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000)
@@ -1223,6 +1255,7 @@ body: |
; GFX90A-SPILLED-NEXT: bb.2:
; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 = SI_SPILL_A352_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s352) from %stack.0, align 4, addrspace 5)
; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10
+ ;
; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr352
; GFX90A-EXPANDED: bb.0:
; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -1297,6 +1330,7 @@ body: |
; GFX908-SPILLED-NEXT: bb.2:
; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 = SI_SPILL_A384_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s384) from %stack.0, align 4, addrspace 5)
; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11
+ ;
; GFX908-EXPANDED-LABEL: name: spill_restore_agpr384
; GFX908-EXPANDED: bb.0:
; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -1339,6 +1373,7 @@ body: |
; GFX908-EXPANDED-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; GFX908-EXPANDED-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11
; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11
+ ;
; GFX90A-SPILLED-LABEL: name: spill_restore_agpr384
; GFX90A-SPILLED: bb.0:
; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000)
@@ -1355,6 +1390,7 @@ body: |
; GFX90A-SPILLED-NEXT: bb.2:
; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 = SI_SPILL_A384_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s384) from %stack.0, align 4, addrspace 5)
; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11
+ ;
; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr384
; GFX90A-EXPANDED: bb.0:
; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -1431,6 +1467,7 @@ body: |
; GFX908-SPILLED-NEXT: bb.2:
; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = SI_SPILL_A512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ;
; GFX908-EXPANDED-LABEL: name: spill_restore_agpr512
; GFX908-EXPANDED: bb.0:
; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -1481,6 +1518,7 @@ body: |
; GFX908-EXPANDED-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; GFX908-EXPANDED-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ;
; GFX90A-SPILLED-LABEL: name: spill_restore_agpr512
; GFX90A-SPILLED: bb.0:
; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000)
@@ -1497,6 +1535,7 @@ body: |
; GFX90A-SPILLED-NEXT: bb.2:
; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = SI_SPILL_A512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ;
; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr512
; GFX90A-EXPANDED: bb.0:
; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -1581,6 +1620,7 @@ body: |
; GFX908-SPILLED-NEXT: bb.2:
; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = SI_SPILL_A1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s1024) from %stack.0, align 4, addrspace 5)
; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+ ;
; GFX908-EXPANDED-LABEL: name: spill_restore_agpr1024
; GFX908-EXPANDED: bb.0:
; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000)
@@ -1663,6 +1703,7 @@ body: |
; GFX908-EXPANDED-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
; GFX908-EXPANDED-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+ ;
; GFX90A-SPILLED-LABEL: name: spill_restore_agpr1024
; GFX90A-SPILLED: bb.0:
; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000)
@@ -1679,6 +1720,7 @@ body: |
; GFX90A-SPILLED-NEXT: bb.2:
; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = SI_SPILL_A1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s1024) from %stack.0, align 4, addrspace 5)
; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+ ;
; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr1024
; GFX90A-EXPANDED: bb.0:
; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000)
diff --git a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir
index 9bac6bbd9759..e54e5898f8b5 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir
@@ -19,14 +19,14 @@ body: |
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %5.sub1:vreg_64 = V_MOV_B32_e32 1786773504, implicit $exec
- ; CHECK-NEXT: dead [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e32 0, %5.sub1, implicit $mode, implicit $exec
- ; CHECK-NEXT: undef %7.sub1:vreg_64 = V_MAC_F32_e32 0, undef %1:vgpr_32, undef %7.sub1, implicit $mode, implicit $exec
- ; CHECK-NEXT: SI_SPILL_V64_SAVE %7, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+ ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = V_MOV_B32_e32 1786773504, implicit $exec
+ ; CHECK-NEXT: dead [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e32 0, [[V_MOV_B32_e32_]].sub1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: undef [[V_MAC_F32_e32_:%[0-9]+]].sub1:vreg_64 = V_MAC_F32_e32 0, undef %1:vgpr_32, undef [[V_MAC_F32_e32_]].sub1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: SI_SPILL_V64_SAVE [[V_MAC_F32_e32_]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: undef %6.sub1:vreg_64 = V_MOV_B32_e32 1786773504, implicit $exec
- ; CHECK-NEXT: S_NOP 0, implicit %6.sub1
+ ; CHECK-NEXT: undef [[V_MOV_B32_e32_1:%[0-9]+]].sub1:vreg_64 = V_MOV_B32_e32 1786773504, implicit $exec
+ ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_1]].sub1
; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
; CHECK-NEXT: S_NOP 0, implicit [[SI_SPILL_V64_RESTORE]].sub1
; CHECK-NEXT: S_NOP 0, implicit undef %9.sub0:vreg_64
@@ -59,13 +59,13 @@ body: |
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %1.sub2:vreg_128 = V_MOV_B32_e32 1786773504, implicit $exec
+ ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub2:vreg_128 = V_MOV_B32_e32 1786773504, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: S_NOP 0, implicit %1.sub2
+ ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub2
; CHECK-NEXT: S_NOP 0, implicit undef %4.sub0:vreg_128
- ; CHECK-NEXT: undef %2.sub2:vreg_128 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: S_NOP 0, implicit %2.sub2
+ ; CHECK-NEXT: undef [[V_MOV_B32_e32_1:%[0-9]+]].sub2:vreg_128 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_1]].sub2
bb.0:
successors: %bb.1
diff --git a/llvm/test/CodeGen/AMDGPU/spill-regpressure-less.mir b/llvm/test/CodeGen/AMDGPU/spill-regpressure-less.mir
new file mode 100644
index 000000000000..f50688240fe8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/spill-regpressure-less.mir
@@ -0,0 +1,353 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s
+
+--- |
+ define amdgpu_kernel void @spill_regpressure_less() #0 {
+ ret void
+ }
+
+ attributes #0 = { "amdgpu-waves-per-eu"="8,8" }
+...
+
+---
+name: spill_regpressure_less
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 8
+body: |
+ bb.0:
+ ; GCN-LABEL: name: spill_regpressure_less
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF17:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF18:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF22:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF28:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF29:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF30:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF31:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF32:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF33:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF34:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF35:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF36:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF37:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF38:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF39:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF40:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF41:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF42:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF43:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF44:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF45:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF46:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF47:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF48:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF49:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF50:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF51:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF52:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF53:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF54:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF55:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF56:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF57:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF58:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF59:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF60:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF61:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF62:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF63:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF64:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF65:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF66:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]], implicit [[DEF8]], implicit [[DEF9]], implicit [[DEF10]], implicit [[DEF11]], implicit [[DEF12]], implicit [[DEF13]], implicit [[DEF14]], implicit [[DEF15]], implicit [[DEF16]], implicit [[DEF17]], implicit [[DEF18]], implicit [[DEF19]], implicit [[DEF20]], implicit [[DEF21]], implicit [[DEF22]], implicit [[DEF23]], implicit [[DEF24]], implicit [[DEF25]], implicit [[DEF26]], implicit [[DEF27]], implicit [[DEF28]], implicit [[DEF29]], implicit [[DEF30]], implicit [[DEF31]], implicit [[DEF32]], implicit [[DEF33]], implicit [[DEF34]], implicit [[DEF35]], implicit [[DEF36]], implicit [[DEF37]], implicit [[DEF38]], implicit [[DEF39]], implicit [[DEF40]], implicit [[DEF41]], implicit [[DEF42]], implicit [[DEF43]], implicit [[DEF44]], implicit [[DEF45]], implicit [[DEF46]], implicit [[DEF47]], implicit [[DEF48]], implicit [[DEF49]], implicit [[DEF50]], implicit [[DEF51]], implicit [[DEF52]], implicit [[DEF53]], implicit [[DEF54]], implicit [[DEF55]], implicit [[DEF56]], implicit [[DEF57]], implicit [[DEF58]], implicit [[DEF59]], implicit [[DEF60]], implicit [[DEF61]], implicit [[DEF62]], implicit [[DEF63]], implicit [[DEF64]], implicit [[DEF65]], implicit [[DEF66]]
+ ; GCN-NEXT: KILL [[DEF]]
+ ; GCN-NEXT: KILL [[DEF1]]
+ ; GCN-NEXT: KILL [[DEF10]]
+ ; GCN-NEXT: KILL [[DEF12]]
+ ; GCN-NEXT: KILL [[DEF13]]
+ ; GCN-NEXT: KILL [[DEF14]]
+ ; GCN-NEXT: KILL [[DEF15]]
+ ; GCN-NEXT: KILL [[DEF16]]
+ ; GCN-NEXT: [[DEF67:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: KILL [[DEF17]]
+ ; GCN-NEXT: [[DEF68:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF69:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF69]], implicit [[DEF23]], implicit [[DEF24]], implicit [[DEF25]], implicit [[DEF26]], implicit [[DEF27]], implicit [[DEF28]]
+ ; GCN-NEXT: KILL [[DEF2]]
+ ; GCN-NEXT: KILL [[DEF3]]
+ ; GCN-NEXT: KILL [[DEF4]]
+ ; GCN-NEXT: KILL [[DEF5]]
+ ; GCN-NEXT: KILL [[DEF6]]
+ ; GCN-NEXT: KILL [[DEF7]]
+ ; GCN-NEXT: KILL [[DEF8]]
+ ; GCN-NEXT: KILL [[DEF9]]
+ ; GCN-NEXT: KILL [[DEF18]]
+ ; GCN-NEXT: KILL [[DEF19]]
+ ; GCN-NEXT: [[DEF70:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF70]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]], implicit [[DEF8]], implicit [[DEF9]]
+ ; GCN-NEXT: KILL [[DEF69]], implicit-def %70, implicit-def %71, implicit-def %72, implicit-def %73, implicit-def %74, implicit-def %75, implicit-def %76, implicit-def %77
+ ; GCN-NEXT: [[DEF71:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF72:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: KILL [[DEF20]]
+ ; GCN-NEXT: [[DEF73:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: KILL [[DEF11]]
+ ; GCN-NEXT: [[DEF74:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: KILL [[DEF21]]
+ ; GCN-NEXT: [[DEF75:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: KILL [[DEF22]]
+ ; GCN-NEXT: [[DEF76:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: KILL [[DEF23]]
+ ; GCN-NEXT: KILL [[DEF24]]
+ ; GCN-NEXT: KILL [[DEF25]]
+ ; GCN-NEXT: KILL [[DEF26]]
+ ; GCN-NEXT: KILL [[DEF27]]
+ ; GCN-NEXT: KILL [[DEF28]]
+ ; GCN-NEXT: KILL [[DEF29]]
+ ; GCN-NEXT: KILL [[DEF30]]
+ ; GCN-NEXT: KILL [[DEF31]]
+ ; GCN-NEXT: KILL [[DEF32]]
+ ; GCN-NEXT: KILL [[DEF33]]
+ ; GCN-NEXT: KILL [[DEF34]]
+ ; GCN-NEXT: KILL [[DEF35]]
+ ; GCN-NEXT: KILL [[DEF36]]
+ ; GCN-NEXT: KILL [[DEF37]]
+ ; GCN-NEXT: KILL [[DEF38]]
+ ; GCN-NEXT: KILL [[DEF39]]
+ ; GCN-NEXT: KILL [[DEF40]]
+ ; GCN-NEXT: KILL [[DEF41]]
+ ; GCN-NEXT: KILL [[DEF42]]
+ ; GCN-NEXT: KILL [[DEF43]]
+ ; GCN-NEXT: KILL [[DEF44]]
+ ; GCN-NEXT: KILL [[DEF45]]
+ ; GCN-NEXT: KILL [[DEF46]]
+ ; GCN-NEXT: KILL [[DEF47]]
+ ; GCN-NEXT: KILL [[DEF48]]
+ ; GCN-NEXT: KILL [[DEF49]]
+ ; GCN-NEXT: KILL [[DEF50]]
+ ; GCN-NEXT: KILL [[DEF51]]
+ ; GCN-NEXT: KILL [[DEF52]]
+ ; GCN-NEXT: KILL [[DEF53]]
+ ; GCN-NEXT: KILL [[DEF54]]
+ ; GCN-NEXT: KILL [[DEF55]]
+ ; GCN-NEXT: KILL [[DEF56]]
+ ; GCN-NEXT: KILL [[DEF57]]
+ ; GCN-NEXT: KILL [[DEF58]]
+ ; GCN-NEXT: KILL [[DEF59]]
+ ; GCN-NEXT: KILL [[DEF60]]
+ ; GCN-NEXT: KILL [[DEF61]]
+ ; GCN-NEXT: KILL [[DEF62]]
+ ; GCN-NEXT: KILL [[DEF63]]
+ ; GCN-NEXT: KILL [[DEF64]]
+ ; GCN-NEXT: KILL [[DEF65]]
+ ; GCN-NEXT: KILL [[DEF66]]
+ ; GCN-NEXT: KILL [[DEF67]]
+ ; GCN-NEXT: KILL [[DEF68]]
+ ; GCN-NEXT: KILL [[DEF71]]
+ ; GCN-NEXT: KILL [[DEF72]]
+ ; GCN-NEXT: KILL [[DEF73]]
+ ; GCN-NEXT: KILL [[DEF74]]
+ ; GCN-NEXT: KILL [[DEF75]]
+ ; GCN-NEXT: KILL [[DEF76]]
+ ; GCN-NEXT: KILL [[DEF70]]
+ ; GCN-NEXT: KILL %70
+ ; GCN-NEXT: KILL %71
+ ; GCN-NEXT: KILL %72
+ ; GCN-NEXT: KILL %73
+ ; GCN-NEXT: KILL %74
+ ; GCN-NEXT: KILL %75
+ ; GCN-NEXT: KILL %76
+ ; GCN-NEXT: KILL %77
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = IMPLICIT_DEF
+ %4:vgpr_32 = IMPLICIT_DEF
+ %5:vgpr_32 = IMPLICIT_DEF
+ %6:vgpr_32 = IMPLICIT_DEF
+ %7:vgpr_32 = IMPLICIT_DEF
+ %8:vgpr_32 = IMPLICIT_DEF
+ %9:vgpr_32 = IMPLICIT_DEF
+ %10:vgpr_32 = IMPLICIT_DEF
+ %11:vgpr_32 = IMPLICIT_DEF
+ %12:vgpr_32 = IMPLICIT_DEF
+ %13:vgpr_32 = IMPLICIT_DEF
+ %14:vgpr_32 = IMPLICIT_DEF
+ %15:vgpr_32 = IMPLICIT_DEF
+ %16:vgpr_32 = IMPLICIT_DEF
+ %17:vgpr_32 = IMPLICIT_DEF
+ %18:vgpr_32 = IMPLICIT_DEF
+ %19:vgpr_32 = IMPLICIT_DEF
+ %20:vgpr_32 = IMPLICIT_DEF
+ %21:vgpr_32 = IMPLICIT_DEF
+ %22:vgpr_32 = IMPLICIT_DEF
+ %23:vgpr_32 = IMPLICIT_DEF
+ %24:vgpr_32 = IMPLICIT_DEF
+ %25:vgpr_32 = IMPLICIT_DEF
+ %26:vgpr_32 = IMPLICIT_DEF
+ %27:vgpr_32 = IMPLICIT_DEF
+ %28:vgpr_32 = IMPLICIT_DEF
+ %29:vgpr_32 = IMPLICIT_DEF
+ %30:vgpr_32 = IMPLICIT_DEF
+ %31:vgpr_32 = IMPLICIT_DEF
+ %32:vgpr_32 = IMPLICIT_DEF
+ %33:vgpr_32 = IMPLICIT_DEF
+ %34:vgpr_32 = IMPLICIT_DEF
+ %35:vgpr_32 = IMPLICIT_DEF
+ %36:vgpr_32 = IMPLICIT_DEF
+ %37:vgpr_32 = IMPLICIT_DEF
+ %38:vgpr_32 = IMPLICIT_DEF
+ %39:vgpr_32 = IMPLICIT_DEF
+ %40:vgpr_32 = IMPLICIT_DEF
+ %41:vgpr_32 = IMPLICIT_DEF
+ %42:vgpr_32 = IMPLICIT_DEF
+ %43:vgpr_32 = IMPLICIT_DEF
+ %44:vgpr_32 = IMPLICIT_DEF
+ %45:vgpr_32 = IMPLICIT_DEF
+ %46:vgpr_32 = IMPLICIT_DEF
+ %47:vgpr_32 = IMPLICIT_DEF
+ %48:vgpr_32 = IMPLICIT_DEF
+ %49:vgpr_32 = IMPLICIT_DEF
+ %50:vgpr_32 = IMPLICIT_DEF
+ %51:vgpr_32 = IMPLICIT_DEF
+ %52:vgpr_32 = IMPLICIT_DEF
+ %53:vgpr_32 = IMPLICIT_DEF
+ %54:vgpr_32 = IMPLICIT_DEF
+ %55:vgpr_32 = IMPLICIT_DEF
+ %56:vgpr_32 = IMPLICIT_DEF
+ %57:vgpr_32 = IMPLICIT_DEF
+ %58:vgpr_32 = IMPLICIT_DEF
+ %59:vgpr_32 = IMPLICIT_DEF
+ %60:vgpr_32 = IMPLICIT_DEF
+ %61:vgpr_32 = IMPLICIT_DEF
+ %62:vgpr_32 = IMPLICIT_DEF
+ %63:vgpr_32 = IMPLICIT_DEF
+ %64:vgpr_32 = IMPLICIT_DEF
+ %65:vgpr_32 = IMPLICIT_DEF
+ %66:vgpr_32 = IMPLICIT_DEF
+ %67:vgpr_32 = IMPLICIT_DEF
+ %68:vgpr_32 = IMPLICIT_DEF
+ INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29, implicit %30, implicit %31, implicit %32, implicit %33, implicit %34, implicit %35, implicit %36, implicit %37, implicit %38, implicit %39, implicit %40, implicit %41, implicit %42, implicit %43, implicit %44, implicit %45, implicit %46, implicit %47, implicit %48, implicit %49, implicit %50, implicit %51, implicit %52, implicit %53, implicit %54, implicit %55, implicit %56, implicit %57, implicit %58, implicit %59, implicit %60, implicit %61, implicit %62, implicit %63, implicit %64, implicit %65, implicit %66
+ %69:sgpr_128 = IMPLICIT_DEF
+ INLINEASM &"", 1, implicit %69, implicit %23, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28
+ KILL %0
+ KILL %1
+ KILL %2
+ KILL %3
+ KILL %4
+ KILL %5
+ KILL %6
+ KILL %7
+ KILL %8
+ KILL %9
+ KILL %10
+ KILL %12
+ KILL %13
+ KILL %14
+ KILL %15
+ KILL %16
+ KILL %17
+ KILL %18
+ KILL %19
+ KILL %69:sgpr_128, implicit-def %77:vgpr_32, implicit-def %78:vgpr_32, implicit-def %79:vgpr_32, implicit-def %80:vgpr_32, implicit-def %81:vgpr_32, implicit-def %82:vgpr_32, implicit-def %83:vgpr_32, implicit-def %84:vgpr_32
+ %70:vgpr_32 = IMPLICIT_DEF
+ %71:vgpr_32 = IMPLICIT_DEF
+ %72:vgpr_32 = IMPLICIT_DEF
+ %73:vgpr_32 = IMPLICIT_DEF
+ %74:vgpr_32 = IMPLICIT_DEF
+ %75:vgpr_32 = IMPLICIT_DEF
+ %76:sgpr_128 = IMPLICIT_DEF
+ INLINEASM &"", 1, implicit %76, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9
+ KILL %20
+ KILL %11
+ KILL %21
+ KILL %22
+ KILL %23
+ KILL %24
+ KILL %25
+ KILL %26
+ KILL %27
+ KILL %28
+ KILL %29
+ KILL %30
+ KILL %31
+ KILL %32
+ KILL %33
+ KILL %34
+ KILL %35
+ KILL %36
+ KILL %37
+ KILL %38
+ KILL %39
+ KILL %40
+ KILL %41
+ KILL %42
+ KILL %43
+ KILL %44
+ KILL %45
+ KILL %46
+ KILL %47
+ KILL %48
+ KILL %49
+ KILL %50
+ KILL %51
+ KILL %52
+ KILL %53
+ KILL %54
+ KILL %55
+ KILL %56
+ KILL %57
+ KILL %58
+ KILL %59
+ KILL %60
+ KILL %61
+ KILL %62
+ KILL %63
+ KILL %64
+ KILL %65
+ KILL %66
+ KILL %67
+ KILL %68
+ KILL %70
+ KILL %71
+ KILL %72
+ KILL %73
+ KILL %74
+ KILL %75
+ KILL %76
+ KILL %77
+ KILL %78
+ KILL %79
+ KILL %80
+ KILL %81
+ KILL %82
+ KILL %83
+ KILL %84
+...
+## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+# GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/split-liverange-overlapping-copies.mir b/llvm/test/CodeGen/AMDGPU/split-liverange-overlapping-copies.mir
index f4cf0f43e456..eca6efaafa9d 100644
--- a/llvm/test/CodeGen/AMDGPU/split-liverange-overlapping-copies.mir
+++ b/llvm/test/CodeGen/AMDGPU/split-liverange-overlapping-copies.mir
@@ -34,33 +34,33 @@ body: |
; CHECK-NEXT: dead [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK-NEXT: S_NOP 0, implicit [[DEF1]]
; CHECK-NEXT: S_NOP 0, implicit [[DEF1]]
- ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vreg_1024_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_1024_align2 = IMPLICIT_DEF
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %6.sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:av_1024_align2 = COPY [[COPY]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 {
- ; CHECK-NEXT: internal %6.sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16:av_1024_align2 = COPY [[COPY]].sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16
- ; CHECK-NEXT: internal %6.sub29_sub30_sub31:av_1024_align2 = COPY [[COPY]].sub29_sub30_sub31
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:av_1024_align2 = COPY [[COPY]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 {
+ ; CHECK-NEXT: internal [[COPY1]].sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16:av_1024_align2 = COPY [[COPY]].sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16
+ ; CHECK-NEXT: internal [[COPY1]].sub29_sub30_sub31:av_1024_align2 = COPY [[COPY]].sub29_sub30_sub31
; CHECK-NEXT: }
- ; CHECK-NEXT: %6.sub0:av_1024_align2 = IMPLICIT_DEF
- ; CHECK-NEXT: S_NOP 0, implicit %6.sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub0:av_1024_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]].sub0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.4(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: S_NOP 0, implicit %6
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4:
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[DEF3:%[0-9]+]]:av_1024_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_1024_align2 = IMPLICIT_DEF
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit undef $vcc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.5:
- ; CHECK-NEXT: undef %4.sub0:vreg_1024_align2 = COPY [[DEF]]
- ; CHECK-NEXT: S_NOP 0, implicit %4
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:vreg_1024_align2 = COPY [[DEF]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
bb.0:
%0:vgpr_32 = IMPLICIT_DEF
%1:vreg_1024_align2 = IMPLICIT_DEF
@@ -110,34 +110,34 @@ body: |
; CHECK-NEXT: dead [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK-NEXT: S_NOP 0, implicit [[DEF1]]
; CHECK-NEXT: S_NOP 0, implicit [[DEF1]]
- ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %6.sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:av_1024 = COPY [[COPY]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 {
- ; CHECK-NEXT: internal %6.sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16:av_1024 = COPY [[COPY]].sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16
- ; CHECK-NEXT: internal %6.sub29_sub30:av_1024 = COPY [[COPY]].sub29_sub30
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:av_1024 = COPY [[COPY]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 {
+ ; CHECK-NEXT: internal [[COPY1]].sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16:av_1024 = COPY [[COPY]].sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16
+ ; CHECK-NEXT: internal [[COPY1]].sub29_sub30:av_1024 = COPY [[COPY]].sub29_sub30
; CHECK-NEXT: }
- ; CHECK-NEXT: %6.sub0:av_1024 = IMPLICIT_DEF
- ; CHECK-NEXT: %6.sub31:av_1024 = IMPLICIT_DEF
- ; CHECK-NEXT: S_NOP 0, implicit %6.sub0, implicit %6.sub31
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub0:av_1024 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub31:av_1024 = IMPLICIT_DEF
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]].sub0, implicit [[COPY1]].sub31
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: successors: %bb.4(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: S_NOP 0, implicit %6
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4:
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[DEF3:%[0-9]+]]:av_1024 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_1024 = IMPLICIT_DEF
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit undef $vcc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.5:
- ; CHECK-NEXT: undef %4.sub0:vreg_1024 = COPY [[DEF]]
- ; CHECK-NEXT: S_NOP 0, implicit %4
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:vreg_1024 = COPY [[DEF]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
bb.0:
%0:vgpr_32 = IMPLICIT_DEF
%1:vreg_1024 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/split-mbb-lis-subrange.mir b/llvm/test/CodeGen/AMDGPU/split-mbb-lis-subrange.mir
index 2c4178d242ca..896986ff9b02 100644
--- a/llvm/test/CodeGen/AMDGPU/split-mbb-lis-subrange.mir
+++ b/llvm/test/CodeGen/AMDGPU/split-mbb-lis-subrange.mir
@@ -34,8 +34,8 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: %s0c:vgpr_32 = V_ADD_F32_e64 0, %s0a, 0, %const, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: %s0d:vgpr_32 = V_ADD_F32_e64 0, %s0b, 0, %const, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY %s0c
- ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY %s0d
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %s0c
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY %s0d
; GCN-NEXT: S_BRANCH %bb.3
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.2:
@@ -45,8 +45,8 @@ body: |
; GCN-NEXT: bb.3:
; GCN-NEXT: successors: %bb.4(0x80000000)
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: %phi1:vgpr_32 = COPY [[COPY3]]
- ; GCN-NEXT: %phi0:vgpr_32 = COPY [[COPY2]]
+ ; GCN-NEXT: %phi1:vgpr_32 = COPY [[COPY1]]
+ ; GCN-NEXT: %phi0:vgpr_32 = COPY [[COPY]]
; GCN-NEXT: S_BRANCH %bb.4
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.4:
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir
index b50d71cb79e9..8f53ec2f992d 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir
@@ -15,72 +15,72 @@ body: |
; RA-NEXT: {{ $}}
; RA-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; RA-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; RA-NEXT: undef %2.sub1:sgpr_1024 = S_MOV_B32 -1
- ; RA-NEXT: %2.sub0:sgpr_1024 = S_MOV_B32 -1
- ; RA-NEXT: undef %3.sub0:sgpr_1024 = S_MOV_B32 0
+ ; RA-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_1024 = S_MOV_B32 -1
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_1024 = S_MOV_B32 -1
+ ; RA-NEXT: undef [[S_MOV_B32_1:%[0-9]+]].sub0:sgpr_1024 = S_MOV_B32 0
; RA-NEXT: {{ $}}
; RA-NEXT: bb.1:
; RA-NEXT: successors: %bb.2(0x80000000)
; RA-NEXT: {{ $}}
- ; RA-NEXT: %2.sub2:sgpr_1024 = COPY %2.sub0
- ; RA-NEXT: %2.sub3:sgpr_1024 = COPY %2.sub1
- ; RA-NEXT: %2.sub4:sgpr_1024 = COPY %2.sub0
- ; RA-NEXT: %2.sub5:sgpr_1024 = COPY %2.sub1
- ; RA-NEXT: %2.sub6:sgpr_1024 = COPY %2.sub0
- ; RA-NEXT: %2.sub7:sgpr_1024 = COPY %2.sub1
- ; RA-NEXT: %2.sub8:sgpr_1024 = COPY %2.sub0
- ; RA-NEXT: %2.sub9:sgpr_1024 = COPY %2.sub1
- ; RA-NEXT: %2.sub10:sgpr_1024 = COPY %2.sub0
- ; RA-NEXT: %2.sub11:sgpr_1024 = COPY %2.sub1
- ; RA-NEXT: %2.sub12:sgpr_1024 = COPY %2.sub0
- ; RA-NEXT: %2.sub13:sgpr_1024 = COPY %2.sub1
- ; RA-NEXT: %2.sub14:sgpr_1024 = COPY %2.sub0
- ; RA-NEXT: %2.sub15:sgpr_1024 = COPY %2.sub1
- ; RA-NEXT: %2.sub16:sgpr_1024 = COPY %2.sub0
- ; RA-NEXT: %2.sub17:sgpr_1024 = COPY %2.sub1
- ; RA-NEXT: %2.sub18:sgpr_1024 = COPY %2.sub0
- ; RA-NEXT: %2.sub19:sgpr_1024 = COPY %2.sub1
- ; RA-NEXT: %2.sub20:sgpr_1024 = COPY %2.sub0
- ; RA-NEXT: %2.sub21:sgpr_1024 = COPY %2.sub1
- ; RA-NEXT: %2.sub22:sgpr_1024 = COPY %2.sub0
- ; RA-NEXT: %2.sub23:sgpr_1024 = COPY %2.sub1
- ; RA-NEXT: %2.sub24:sgpr_1024 = COPY %2.sub0
- ; RA-NEXT: %2.sub25:sgpr_1024 = COPY %2.sub1
- ; RA-NEXT: %2.sub26:sgpr_1024 = COPY %2.sub0
- ; RA-NEXT: %2.sub27:sgpr_1024 = COPY %2.sub1
- ; RA-NEXT: %2.sub28:sgpr_1024 = COPY %2.sub0
- ; RA-NEXT: %2.sub29:sgpr_1024 = COPY %2.sub1
- ; RA-NEXT: %3.sub1:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub2:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub3:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub4:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub5:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub6:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub7:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub8:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub9:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub10:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub11:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub12:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub13:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub14:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub15:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub16:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub17:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub18:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub19:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub20:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub21:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub22:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub23:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub24:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub25:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub26:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub27:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub28:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub29:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub30:sgpr_1024 = COPY %3.sub0
- ; RA-NEXT: %3.sub31:sgpr_1024 = COPY %3.sub0
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub4:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub5:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub6:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub7:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub8:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub9:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub10:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub11:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub12:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub13:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub14:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub15:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub16:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub17:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub18:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub19:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub20:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub21:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub22:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub23:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub24:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub25:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub26:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub27:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub28:sgpr_1024 = COPY [[S_MOV_B32_]].sub0
+ ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub29:sgpr_1024 = COPY [[S_MOV_B32_]].sub1
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub1:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub2:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub3:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub4:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub5:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub6:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub7:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub8:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub9:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub10:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub11:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub12:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub13:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub14:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub15:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub16:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub17:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub18:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub19:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub20:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub21:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub22:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub23:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub24:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub25:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub26:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub27:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub28:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub29:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub30:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
+ ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub31:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0
; RA-NEXT: {{ $}}
; RA-NEXT: bb.2:
; RA-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
@@ -257,53 +257,53 @@ body: |
; RA: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
; RA-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; RA-NEXT: [[DEF2:%[0-9]+]]:sgpr_512 = IMPLICIT_DEF
- ; RA-NEXT: [[DEF2]].sub4:sgpr_512 = S_MOV_B32 -1
- ; RA-NEXT: [[DEF2]].sub5:sgpr_512 = S_MOV_B32 -1
- ; RA-NEXT: [[DEF2]].sub10:sgpr_512 = S_MOV_B32 -1
- ; RA-NEXT: [[DEF2]].sub11:sgpr_512 = S_MOV_B32 -1
- ; RA-NEXT: [[DEF2]].sub7:sgpr_512 = S_MOV_B32 -1
- ; RA-NEXT: [[DEF2]].sub8:sgpr_512 = S_MOV_B32 -1
- ; RA-NEXT: [[DEF2]].sub13:sgpr_512 = S_MOV_B32 -1
- ; RA-NEXT: [[DEF2]].sub14:sgpr_512 = S_MOV_B32 -1
- ; RA-NEXT: undef %16.sub4_sub5:sgpr_512 = COPY [[DEF2]].sub4_sub5 {
- ; RA-NEXT: internal %16.sub10_sub11:sgpr_512 = COPY [[DEF2]].sub10_sub11
- ; RA-NEXT: internal %16.sub7:sgpr_512 = COPY [[DEF2]].sub7
- ; RA-NEXT: internal %16.sub8:sgpr_512 = COPY [[DEF2]].sub8
- ; RA-NEXT: internal %16.sub13:sgpr_512 = COPY [[DEF2]].sub13
- ; RA-NEXT: internal %16.sub14:sgpr_512 = COPY [[DEF2]].sub14
+ ; RA-NEXT: [[DEF2:%[0-9]+]].sub4:sgpr_512 = S_MOV_B32 -1
+ ; RA-NEXT: [[DEF2:%[0-9]+]].sub5:sgpr_512 = S_MOV_B32 -1
+ ; RA-NEXT: [[DEF2:%[0-9]+]].sub10:sgpr_512 = S_MOV_B32 -1
+ ; RA-NEXT: [[DEF2:%[0-9]+]].sub11:sgpr_512 = S_MOV_B32 -1
+ ; RA-NEXT: [[DEF2:%[0-9]+]].sub7:sgpr_512 = S_MOV_B32 -1
+ ; RA-NEXT: [[DEF2:%[0-9]+]].sub8:sgpr_512 = S_MOV_B32 -1
+ ; RA-NEXT: [[DEF2:%[0-9]+]].sub13:sgpr_512 = S_MOV_B32 -1
+ ; RA-NEXT: [[DEF2:%[0-9]+]].sub14:sgpr_512 = S_MOV_B32 -1
+ ; RA-NEXT: undef [[COPY:%[0-9]+]].sub4_sub5:sgpr_512 = COPY [[DEF2]].sub4_sub5 {
+ ; RA-NEXT: internal [[COPY]].sub10_sub11:sgpr_512 = COPY [[DEF2]].sub10_sub11
+ ; RA-NEXT: internal [[COPY]].sub7:sgpr_512 = COPY [[DEF2]].sub7
+ ; RA-NEXT: internal [[COPY]].sub8:sgpr_512 = COPY [[DEF2]].sub8
+ ; RA-NEXT: internal [[COPY]].sub13:sgpr_512 = COPY [[DEF2]].sub13
+ ; RA-NEXT: internal [[COPY]].sub14:sgpr_512 = COPY [[DEF2]].sub14
; RA-NEXT: }
- ; RA-NEXT: undef %18.sub4_sub5:sgpr_512 = COPY %16.sub4_sub5 {
- ; RA-NEXT: internal %18.sub10_sub11:sgpr_512 = COPY %16.sub10_sub11
- ; RA-NEXT: internal %18.sub7:sgpr_512 = COPY %16.sub7
- ; RA-NEXT: internal %18.sub8:sgpr_512 = COPY %16.sub8
- ; RA-NEXT: internal %18.sub13:sgpr_512 = COPY %16.sub13
- ; RA-NEXT: internal %18.sub14:sgpr_512 = COPY %16.sub14
+ ; RA-NEXT: undef [[COPY1:%[0-9]+]].sub4_sub5:sgpr_512 = COPY [[COPY]].sub4_sub5 {
+ ; RA-NEXT: internal [[COPY1]].sub10_sub11:sgpr_512 = COPY [[COPY]].sub10_sub11
+ ; RA-NEXT: internal [[COPY1]].sub7:sgpr_512 = COPY [[COPY]].sub7
+ ; RA-NEXT: internal [[COPY1]].sub8:sgpr_512 = COPY [[COPY]].sub8
+ ; RA-NEXT: internal [[COPY1]].sub13:sgpr_512 = COPY [[COPY]].sub13
+ ; RA-NEXT: internal [[COPY1]].sub14:sgpr_512 = COPY [[COPY]].sub14
; RA-NEXT: }
- ; RA-NEXT: SI_SPILL_S512_SAVE %18, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s512) into %stack.0, align 4, addrspace 5)
+ ; RA-NEXT: SI_SPILL_S512_SAVE [[COPY1]], %stack.0, implicit $exec, implicit $sgpr32 :: (store (s512) into %stack.0, align 4, addrspace 5)
; RA-NEXT: S_NOP 0, implicit-def $sgpr8, implicit-def $sgpr12, implicit-def $sgpr16, implicit-def $sgpr20, implicit-def $sgpr24, implicit-def $sgpr28, implicit-def $sgpr32, implicit-def $sgpr36, implicit-def $sgpr40, implicit-def $sgpr44, implicit-def $sgpr48, implicit-def $sgpr52, implicit-def $sgpr56, implicit-def $sgpr60, implicit-def $sgpr64, implicit-def $sgpr68, implicit-def $sgpr72, implicit-def $sgpr74, implicit-def $sgpr78, implicit-def $sgpr82, implicit-def $sgpr86, implicit-def $sgpr90, implicit-def $sgpr94, implicit-def $sgpr98
; RA-NEXT: [[SI_SPILL_S512_RESTORE:%[0-9]+]]:sgpr_512 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s512) from %stack.0, align 4, addrspace 5)
- ; RA-NEXT: undef %17.sub4_sub5:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub4_sub5 {
- ; RA-NEXT: internal %17.sub10_sub11:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub10_sub11
- ; RA-NEXT: internal %17.sub7:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub7
- ; RA-NEXT: internal %17.sub8:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub8
- ; RA-NEXT: internal %17.sub13:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub13
- ; RA-NEXT: internal %17.sub14:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub14
+ ; RA-NEXT: undef [[COPY2:%[0-9]+]].sub4_sub5:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub4_sub5 {
+ ; RA-NEXT: internal [[COPY2]].sub10_sub11:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub10_sub11
+ ; RA-NEXT: internal [[COPY2]].sub7:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub7
+ ; RA-NEXT: internal [[COPY2]].sub8:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub8
+ ; RA-NEXT: internal [[COPY2]].sub13:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub13
+ ; RA-NEXT: internal [[COPY2]].sub14:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub14
; RA-NEXT: }
- ; RA-NEXT: undef %14.sub4_sub5:sgpr_512 = COPY %17.sub4_sub5 {
- ; RA-NEXT: internal %14.sub10_sub11:sgpr_512 = COPY %17.sub10_sub11
- ; RA-NEXT: internal %14.sub7:sgpr_512 = COPY %17.sub7
- ; RA-NEXT: internal %14.sub8:sgpr_512 = COPY %17.sub8
- ; RA-NEXT: internal %14.sub13:sgpr_512 = COPY %17.sub13
- ; RA-NEXT: internal %14.sub14:sgpr_512 = COPY %17.sub14
+ ; RA-NEXT: undef [[COPY3:%[0-9]+]].sub4_sub5:sgpr_512 = COPY [[COPY2]].sub4_sub5 {
+ ; RA-NEXT: internal [[COPY3]].sub10_sub11:sgpr_512 = COPY [[COPY2]].sub10_sub11
+ ; RA-NEXT: internal [[COPY3]].sub7:sgpr_512 = COPY [[COPY2]].sub7
+ ; RA-NEXT: internal [[COPY3]].sub8:sgpr_512 = COPY [[COPY2]].sub8
+ ; RA-NEXT: internal [[COPY3]].sub13:sgpr_512 = COPY [[COPY2]].sub13
+ ; RA-NEXT: internal [[COPY3]].sub14:sgpr_512 = COPY [[COPY2]].sub14
; RA-NEXT: }
- ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], %14.sub4, 0 :: (dereferenceable invariant load (s32))
- ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], %14.sub5, 0 :: (dereferenceable invariant load (s32))
- ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], %14.sub10, 0 :: (dereferenceable invariant load (s32))
- ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], %14.sub11, 0 :: (dereferenceable invariant load (s32))
- ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], %14.sub7, 0 :: (dereferenceable invariant load (s32))
- ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], %14.sub8, 0 :: (dereferenceable invariant load (s32))
- ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], %14.sub13, 0 :: (dereferenceable invariant load (s32))
- ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], %14.sub14, 0 :: (dereferenceable invariant load (s32))
+ ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], [[COPY3]].sub4, 0 :: (dereferenceable invariant load (s32))
+ ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], [[COPY3]].sub5, 0 :: (dereferenceable invariant load (s32))
+ ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], [[COPY3]].sub10, 0 :: (dereferenceable invariant load (s32))
+ ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], [[COPY3]].sub11, 0 :: (dereferenceable invariant load (s32))
+ ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], [[COPY3]].sub7, 0 :: (dereferenceable invariant load (s32))
+ ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], [[COPY3]].sub8, 0 :: (dereferenceable invariant load (s32))
+ ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], [[COPY3]].sub13, 0 :: (dereferenceable invariant load (s32))
+ ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], [[COPY3]].sub14, 0 :: (dereferenceable invariant load (s32))
; RA-NEXT: S_NOP 0, implicit [[DEF]], implicit [[DEF1]], implicit [[S_BUFFER_LOAD_DWORD_SGPR]], implicit [[S_BUFFER_LOAD_DWORD_SGPR1]], implicit [[S_BUFFER_LOAD_DWORD_SGPR2]], implicit [[S_BUFFER_LOAD_DWORD_SGPR3]], implicit [[S_BUFFER_LOAD_DWORD_SGPR4]], implicit [[S_BUFFER_LOAD_DWORD_SGPR5]], implicit [[S_BUFFER_LOAD_DWORD_SGPR6]], implicit [[S_BUFFER_LOAD_DWORD_SGPR7]]
;
; VR-LABEL: name: splitkit_copy_unbundle_reorder
@@ -349,7 +349,6 @@ body: |
%2.sub13:sgpr_512 = S_MOV_B32 -1
%2.sub14:sgpr_512 = S_MOV_B32 -1
- ; Clobber registers
S_NOP 0, implicit-def $sgpr8, implicit-def $sgpr12, implicit-def $sgpr16, implicit-def $sgpr20, implicit-def $sgpr24, implicit-def $sgpr28, implicit-def $sgpr32, implicit-def $sgpr36, implicit-def $sgpr40, implicit-def $sgpr44, implicit-def $sgpr48, implicit-def $sgpr52, implicit-def $sgpr56, implicit-def $sgpr60, implicit-def $sgpr64, implicit-def $sgpr68, implicit-def $sgpr72, implicit-def $sgpr74, implicit-def $sgpr78, implicit-def $sgpr82, implicit-def $sgpr86, implicit-def $sgpr90, implicit-def $sgpr94, implicit-def $sgpr98
%5:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %0:sgpr_128, %2.sub4:sgpr_512, 0 :: (dereferenceable invariant load (s32))
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
index 70e5e8a00c3d..42db92b15acf 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
@@ -16,449 +16,449 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128), align 4, addrspace 4)
- ; CHECK-NEXT: undef %2.sub3:sgpr_128 = S_MOV_B32 61440
- ; CHECK-NEXT: %2.sub2:sgpr_128 = S_MOV_B32 -1
- ; CHECK-NEXT: %2.sub0:sgpr_128 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
- ; CHECK-NEXT: %2.sub1:sgpr_128 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
- ; CHECK-NEXT: undef %3.sub0:sgpr_128 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
- ; CHECK-NEXT: %3.sub1:sgpr_128 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
- ; CHECK-NEXT: %3.sub2:sgpr_128 = COPY %2.sub2
- ; CHECK-NEXT: %3.sub3:sgpr_128 = COPY %2.sub3
- ; CHECK-NEXT: early-clobber %4:vreg_128, early-clobber %5:vreg_128, early-clobber %6:vreg_128, early-clobber %7:vreg_128 = BUNDLE %3, implicit $exec {
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 0, 0, 0, implicit $exec :: (load (s128), align 128, addrspace 1)
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 16, 0, 0, implicit $exec :: (load (s128), addrspace 1)
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 32, 0, 0, implicit $exec :: (load (s128), align 32, addrspace 1)
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 48, 0, 0, implicit $exec :: (load (s128), addrspace 1)
- ; CHECK-NEXT: }
- ; CHECK-NEXT: undef %47.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1, implicit $exec
- ; CHECK-NEXT: undef %55.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0, implicit $exec
- ; CHECK-NEXT: undef %63.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3, implicit $exec
- ; CHECK-NEXT: undef %71.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec
- ; CHECK-NEXT: undef %79.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec
- ; CHECK-NEXT: undef %87.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec
- ; CHECK-NEXT: undef %95.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec
- ; CHECK-NEXT: undef %101.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec
- ; CHECK-NEXT: undef %107.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec
- ; CHECK-NEXT: undef %113.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec
- ; CHECK-NEXT: undef %154.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec
- ; CHECK-NEXT: undef %209.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec
- ; CHECK-NEXT: undef %188.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec
- ; CHECK-NEXT: undef %123.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec
- ; CHECK-NEXT: undef %129.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec
- ; CHECK-NEXT: undef %135.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 64, 0, 0, implicit $exec :: (load (s128), align 64, addrspace 1)
- ; CHECK-NEXT: undef %141.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec
- ; CHECK-NEXT: undef %147.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec
- ; CHECK-NEXT: undef %159.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec
- ; CHECK-NEXT: undef %165.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 80, 0, 0, implicit $exec :: (load (s128), addrspace 1)
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 96, 0, 0, implicit $exec :: (load (s128), align 32, addrspace 1)
- ; CHECK-NEXT: undef %36.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec
- ; CHECK-NEXT: undef %37.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec
- ; CHECK-NEXT: undef %38.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 112, 0, 0, implicit $exec :: (load (s128), addrspace 1)
- ; CHECK-NEXT: undef %40.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub1, implicit $exec
- ; CHECK-NEXT: undef %41.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub0, implicit $exec
- ; CHECK-NEXT: undef %42.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub3, implicit $exec
- ; CHECK-NEXT: undef %43.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub2, implicit $exec
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; CHECK-NEXT: undef %48.sub2:vreg_128 = COPY %47.sub2
- ; CHECK-NEXT: %48.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1, implicit $exec
- ; CHECK-NEXT: undef %51.sub0:vreg_128 = COPY %48.sub0 {
- ; CHECK-NEXT: internal %51.sub2:vreg_128 = COPY %48.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: SI_SPILL_V128_SAVE %51, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5)
- ; CHECK-NEXT: undef %56.sub2:vreg_128 = COPY %55.sub2
- ; CHECK-NEXT: %56.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0, implicit $exec
- ; CHECK-NEXT: undef %59.sub0:vreg_128 = COPY %56.sub0 {
- ; CHECK-NEXT: internal %59.sub2:vreg_128 = COPY %56.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: SI_SPILL_V128_SAVE %59, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5)
- ; CHECK-NEXT: undef %64.sub2:vreg_128 = COPY %63.sub2
- ; CHECK-NEXT: %64.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3, implicit $exec
- ; CHECK-NEXT: undef %67.sub0:vreg_128 = COPY %64.sub0 {
- ; CHECK-NEXT: internal %67.sub2:vreg_128 = COPY %64.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: SI_SPILL_V128_SAVE %67, %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5)
- ; CHECK-NEXT: undef %72.sub2:vreg_128 = COPY %71.sub2
- ; CHECK-NEXT: %72.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec
- ; CHECK-NEXT: undef %75.sub0:vreg_128 = COPY %72.sub0 {
- ; CHECK-NEXT: internal %75.sub2:vreg_128 = COPY %72.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: SI_SPILL_V128_SAVE %75, %stack.3, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.3, align 4, addrspace 5)
- ; CHECK-NEXT: undef %80.sub2:vreg_128 = COPY %79.sub2
- ; CHECK-NEXT: %80.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec
- ; CHECK-NEXT: undef %83.sub0:vreg_128 = COPY %80.sub0 {
- ; CHECK-NEXT: internal %83.sub2:vreg_128 = COPY %80.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: SI_SPILL_V128_SAVE %83, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5)
- ; CHECK-NEXT: undef %88.sub2:vreg_128 = COPY %87.sub2
- ; CHECK-NEXT: %88.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec
- ; CHECK-NEXT: undef %91.sub0:vreg_128 = COPY %88.sub0 {
- ; CHECK-NEXT: internal %91.sub2:vreg_128 = COPY %88.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: SI_SPILL_V128_SAVE %91, %stack.5, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.5, align 4, addrspace 5)
- ; CHECK-NEXT: undef %96.sub2:vreg_128 = COPY %95.sub2
- ; CHECK-NEXT: %96.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec
- ; CHECK-NEXT: undef %155.sub0:vreg_128 = COPY %96.sub0 {
- ; CHECK-NEXT: internal %155.sub2:vreg_128 = COPY %96.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: SI_SPILL_V128_SAVE %155, %stack.7, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.7, align 4, addrspace 5)
- ; CHECK-NEXT: undef %102.sub2:vreg_128 = COPY %101.sub2
- ; CHECK-NEXT: %102.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec
- ; CHECK-NEXT: undef %117.sub0:vreg_128 = COPY %102.sub0 {
- ; CHECK-NEXT: internal %117.sub2:vreg_128 = COPY %102.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: SI_SPILL_V128_SAVE %117, %stack.6, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.6, align 4, addrspace 5)
- ; CHECK-NEXT: undef %108.sub2:vreg_128 = COPY %107.sub2
- ; CHECK-NEXT: %108.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec
- ; CHECK-NEXT: undef %110.sub0:vreg_128 = COPY %108.sub0 {
- ; CHECK-NEXT: internal %110.sub2:vreg_128 = COPY %108.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: undef %114.sub2:vreg_128 = COPY %113.sub2
- ; CHECK-NEXT: %114.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec
- ; CHECK-NEXT: undef %116.sub0:vreg_128 = COPY %114.sub0 {
- ; CHECK-NEXT: internal %116.sub2:vreg_128 = COPY %114.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: %154.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec
- ; CHECK-NEXT: undef %177.sub0:vreg_128 = COPY %154.sub0 {
- ; CHECK-NEXT: internal %177.sub2:vreg_128 = COPY %154.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: undef %179.sub0:vreg_128 = COPY %177.sub0 {
- ; CHECK-NEXT: internal %179.sub2:vreg_128 = COPY %177.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: SI_SPILL_V128_SAVE %179, %stack.8, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.8, align 4, addrspace 5)
- ; CHECK-NEXT: undef %210.sub2:vreg_128 = COPY %209.sub2
- ; CHECK-NEXT: %210.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec
- ; CHECK-NEXT: undef %213.sub0:vreg_128 = COPY %210.sub0 {
- ; CHECK-NEXT: internal %213.sub2:vreg_128 = COPY %210.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: SI_SPILL_V128_SAVE %213, %stack.11, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.11, align 4, addrspace 5)
- ; CHECK-NEXT: undef %189.sub2:vreg_128 = COPY %188.sub2
- ; CHECK-NEXT: %189.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec
- ; CHECK-NEXT: undef %192.sub0:vreg_128 = COPY %189.sub0 {
- ; CHECK-NEXT: internal %192.sub2:vreg_128 = COPY %189.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: SI_SPILL_V128_SAVE %192, %stack.9, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.9, align 4, addrspace 5)
- ; CHECK-NEXT: undef %124.sub2:vreg_128 = COPY %123.sub2
- ; CHECK-NEXT: %124.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec
- ; CHECK-NEXT: undef %126.sub0:vreg_128 = COPY %124.sub0 {
- ; CHECK-NEXT: internal %126.sub2:vreg_128 = COPY %124.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: undef %130.sub2:vreg_128 = COPY %129.sub2
- ; CHECK-NEXT: %130.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec
- ; CHECK-NEXT: undef %205.sub0:vreg_128 = COPY %130.sub0 {
- ; CHECK-NEXT: internal %205.sub2:vreg_128 = COPY %130.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: SI_SPILL_V128_SAVE %205, %stack.10, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.10, align 4, addrspace 5)
- ; CHECK-NEXT: undef %136.sub2:vreg_128 = COPY %135.sub2
- ; CHECK-NEXT: %136.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec
- ; CHECK-NEXT: undef %138.sub0:vreg_128 = COPY %136.sub0 {
- ; CHECK-NEXT: internal %138.sub2:vreg_128 = COPY %136.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: undef %142.sub2:vreg_128 = COPY %141.sub2
- ; CHECK-NEXT: %142.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec
- ; CHECK-NEXT: undef %144.sub0:vreg_128 = COPY %142.sub0 {
- ; CHECK-NEXT: internal %144.sub2:vreg_128 = COPY %142.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: undef %148.sub2:vreg_128 = COPY %147.sub2
- ; CHECK-NEXT: %148.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec
- ; CHECK-NEXT: undef %150.sub0:vreg_128 = COPY %148.sub0 {
- ; CHECK-NEXT: internal %150.sub2:vreg_128 = COPY %148.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: undef %160.sub2:vreg_128 = COPY %159.sub2
- ; CHECK-NEXT: %160.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec
- ; CHECK-NEXT: undef %162.sub0:vreg_128 = COPY %160.sub0 {
- ; CHECK-NEXT: internal %162.sub2:vreg_128 = COPY %160.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: undef %166.sub2:vreg_128 = COPY %165.sub2
- ; CHECK-NEXT: %166.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec
- ; CHECK-NEXT: undef %168.sub0:vreg_128 = COPY %166.sub0 {
- ; CHECK-NEXT: internal %168.sub2:vreg_128 = COPY %166.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: undef %175.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
- ; CHECK-NEXT: undef %172.sub2:vreg_128 = COPY %175.sub2
- ; CHECK-NEXT: %172.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
- ; CHECK-NEXT: undef %174.sub0:vreg_128 = COPY %172.sub0 {
- ; CHECK-NEXT: internal %174.sub2:vreg_128 = COPY %172.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: undef %187.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
- ; CHECK-NEXT: undef %184.sub2:vreg_128 = COPY %187.sub2
- ; CHECK-NEXT: %184.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
- ; CHECK-NEXT: undef %186.sub0:vreg_128 = COPY %184.sub0 {
- ; CHECK-NEXT: internal %186.sub2:vreg_128 = COPY %184.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: undef %200.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
- ; CHECK-NEXT: undef %197.sub2:vreg_128 = COPY %200.sub2
- ; CHECK-NEXT: %197.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
- ; CHECK-NEXT: undef %199.sub0:vreg_128 = COPY %197.sub0 {
- ; CHECK-NEXT: internal %199.sub2:vreg_128 = COPY %197.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: undef %220.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec
- ; CHECK-NEXT: undef %204.sub2:vreg_128 = COPY %220.sub2
- ; CHECK-NEXT: %204.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec
- ; CHECK-NEXT: undef %219.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec
- ; CHECK-NEXT: undef %218.sub2:vreg_128 = COPY %219.sub2
- ; CHECK-NEXT: %218.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec
- ; CHECK-NEXT: %36.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec
- ; CHECK-NEXT: %37.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec
- ; CHECK-NEXT: %38.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec
- ; CHECK-NEXT: %40.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub1, implicit $exec
- ; CHECK-NEXT: %41.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub0, implicit $exec
- ; CHECK-NEXT: %42.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub3, implicit $exec
- ; CHECK-NEXT: %43.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub2, implicit $exec
- ; CHECK-NEXT: %43.sub1:vreg_128 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: %43.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %43, %2, 0, 480, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
- ; CHECK-NEXT: %42.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %42.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %42, %2, 0, 496, 0, 0, implicit $exec :: (store (s128), addrspace 1)
- ; CHECK-NEXT: %41.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %41.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %41, %2, 0, 448, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
- ; CHECK-NEXT: %40.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %40.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %40, %2, 0, 464, 0, 0, implicit $exec :: (store (s128), addrspace 1)
- ; CHECK-NEXT: %38.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %38.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %38, %2, 0, 416, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
- ; CHECK-NEXT: %37.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %37.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %37, %2, 0, 432, 0, 0, implicit $exec :: (store (s128), addrspace 1)
- ; CHECK-NEXT: %36.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %36.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %36, %2, 0, 384, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
- ; CHECK-NEXT: undef %216.sub0:vreg_128 = COPY %218.sub0 {
- ; CHECK-NEXT: internal %216.sub2:vreg_128 = COPY %218.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: %216.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %216.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %216, %2, 0, 400, 0, 0, implicit $exec :: (store (s128), addrspace 1)
- ; CHECK-NEXT: undef %202.sub0:vreg_128 = COPY %204.sub0 {
- ; CHECK-NEXT: internal %202.sub2:vreg_128 = COPY %204.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: %202.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %202.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %202, %2, 0, 352, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
- ; CHECK-NEXT: undef %198.sub0:vreg_128 = COPY %199.sub0 {
- ; CHECK-NEXT: internal %198.sub2:vreg_128 = COPY %199.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: undef %195.sub0:vreg_128 = COPY %198.sub0 {
- ; CHECK-NEXT: internal %195.sub2:vreg_128 = COPY %198.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: %195.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %195.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %195, %2, 0, 368, 0, 0, implicit $exec :: (store (s128), addrspace 1)
- ; CHECK-NEXT: undef %185.sub0:vreg_128 = COPY %186.sub0 {
- ; CHECK-NEXT: internal %185.sub2:vreg_128 = COPY %186.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: undef %182.sub0:vreg_128 = COPY %185.sub0 {
- ; CHECK-NEXT: internal %182.sub2:vreg_128 = COPY %185.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: %182.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %182.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %182, %2, 0, 320, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
- ; CHECK-NEXT: undef %173.sub0:vreg_128 = COPY %174.sub0 {
- ; CHECK-NEXT: internal %173.sub2:vreg_128 = COPY %174.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: undef %170.sub0:vreg_128 = COPY %173.sub0 {
- ; CHECK-NEXT: internal %170.sub2:vreg_128 = COPY %173.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: %170.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %170.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %170, %2, 0, 336, 0, 0, implicit $exec :: (store (s128), addrspace 1)
- ; CHECK-NEXT: undef %167.sub0:vreg_128 = COPY %168.sub0 {
- ; CHECK-NEXT: internal %167.sub2:vreg_128 = COPY %168.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: undef %164.sub0:vreg_128 = COPY %167.sub0 {
- ; CHECK-NEXT: internal %164.sub2:vreg_128 = COPY %167.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: %164.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %164.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %164, %2, 0, 288, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
- ; CHECK-NEXT: undef %161.sub0:vreg_128 = COPY %162.sub0 {
- ; CHECK-NEXT: internal %161.sub2:vreg_128 = COPY %162.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: undef %158.sub0:vreg_128 = COPY %161.sub0 {
- ; CHECK-NEXT: internal %158.sub2:vreg_128 = COPY %161.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: %158.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %158.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %158, %2, 0, 304, 0, 0, implicit $exec :: (store (s128), addrspace 1)
- ; CHECK-NEXT: undef %149.sub0:vreg_128 = COPY %150.sub0 {
- ; CHECK-NEXT: internal %149.sub2:vreg_128 = COPY %150.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: undef %146.sub0:vreg_128 = COPY %149.sub0 {
- ; CHECK-NEXT: internal %146.sub2:vreg_128 = COPY %149.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: %146.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %146.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %146, %2, 0, 256, 0, 0, implicit $exec :: (store (s128), align 256, addrspace 1)
- ; CHECK-NEXT: undef %143.sub0:vreg_128 = COPY %144.sub0 {
- ; CHECK-NEXT: internal %143.sub2:vreg_128 = COPY %144.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: undef %140.sub0:vreg_128 = COPY %143.sub0 {
- ; CHECK-NEXT: internal %140.sub2:vreg_128 = COPY %143.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: %140.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %140.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %140, %2, 0, 272, 0, 0, implicit $exec :: (store (s128), addrspace 1)
- ; CHECK-NEXT: undef %137.sub0:vreg_128 = COPY %138.sub0 {
- ; CHECK-NEXT: internal %137.sub2:vreg_128 = COPY %138.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: undef %134.sub0:vreg_128 = COPY %137.sub0 {
- ; CHECK-NEXT: internal %134.sub2:vreg_128 = COPY %137.sub2
- ; CHECK-NEXT: }
- ; CHECK-NEXT: %134.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %134.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %134, %2, 0, 224, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 61440
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:sgpr_128 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub3
+ ; CHECK-NEXT: early-clobber %4:vreg_128, early-clobber %5:vreg_128, early-clobber %6:vreg_128, early-clobber %7:vreg_128 = BUNDLE [[COPY1]], implicit $exec {
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[COPY1]], 0, 16, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[COPY1]], 0, 32, 0, 0, implicit $exec :: (load (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[COPY1]], 0, 48, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_1:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_2:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_3:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_4:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_5:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_6:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_7:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_8:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_9:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_10:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_11:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_12:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_13:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_14:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_15:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[COPY1]], 0, 64, 0, 0, implicit $exec :: (load (s128), align 64, addrspace 1)
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_16:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_17:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_18:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_19:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[COPY1]], 0, 80, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[COPY1]], 0, 96, 0, 0, implicit $exec :: (load (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_20:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_21:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_22:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[COPY1]], 0, 112, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_23:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub1, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_24:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub0, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_25:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub3, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_26:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub2, implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_]].sub2
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1, implicit $exec
+ ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_128 = COPY [[COPY2]].sub0 {
+ ; CHECK-NEXT: internal [[COPY3]].sub2:vreg_128 = COPY [[COPY2]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY3]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5)
+ ; CHECK-NEXT: undef [[COPY4:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_1]].sub2
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0, implicit $exec
+ ; CHECK-NEXT: undef [[COPY5:%[0-9]+]].sub0:vreg_128 = COPY [[COPY4]].sub0 {
+ ; CHECK-NEXT: internal [[COPY5]].sub2:vreg_128 = COPY [[COPY4]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY5]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5)
+ ; CHECK-NEXT: undef [[COPY6:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_2]].sub2
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3, implicit $exec
+ ; CHECK-NEXT: undef [[COPY7:%[0-9]+]].sub0:vreg_128 = COPY [[COPY6]].sub0 {
+ ; CHECK-NEXT: internal [[COPY7]].sub2:vreg_128 = COPY [[COPY6]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY7]], %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5)
+ ; CHECK-NEXT: undef [[COPY8:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_3]].sub2
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec
+ ; CHECK-NEXT: undef [[COPY9:%[0-9]+]].sub0:vreg_128 = COPY [[COPY8]].sub0 {
+ ; CHECK-NEXT: internal [[COPY9]].sub2:vreg_128 = COPY [[COPY8]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY9]], %stack.3, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.3, align 4, addrspace 5)
+ ; CHECK-NEXT: undef [[COPY10:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_4]].sub2
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec
+ ; CHECK-NEXT: undef [[COPY11:%[0-9]+]].sub0:vreg_128 = COPY [[COPY10]].sub0 {
+ ; CHECK-NEXT: internal [[COPY11]].sub2:vreg_128 = COPY [[COPY10]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY11]], %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5)
+ ; CHECK-NEXT: undef [[COPY12:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_5]].sub2
+ ; CHECK-NEXT: [[COPY12:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec
+ ; CHECK-NEXT: undef [[COPY13:%[0-9]+]].sub0:vreg_128 = COPY [[COPY12]].sub0 {
+ ; CHECK-NEXT: internal [[COPY13]].sub2:vreg_128 = COPY [[COPY12]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY13]], %stack.5, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.5, align 4, addrspace 5)
+ ; CHECK-NEXT: undef [[COPY14:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_6]].sub2
+ ; CHECK-NEXT: [[COPY14:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec
+ ; CHECK-NEXT: undef [[COPY15:%[0-9]+]].sub0:vreg_128 = COPY [[COPY14]].sub0 {
+ ; CHECK-NEXT: internal [[COPY15]].sub2:vreg_128 = COPY [[COPY14]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY15]], %stack.7, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.7, align 4, addrspace 5)
+ ; CHECK-NEXT: undef [[COPY16:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_7]].sub2
+ ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec
+ ; CHECK-NEXT: undef [[COPY17:%[0-9]+]].sub0:vreg_128 = COPY [[COPY16]].sub0 {
+ ; CHECK-NEXT: internal [[COPY17]].sub2:vreg_128 = COPY [[COPY16]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY17]], %stack.6, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.6, align 4, addrspace 5)
+ ; CHECK-NEXT: undef [[COPY18:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_8]].sub2
+ ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec
+ ; CHECK-NEXT: undef [[COPY19:%[0-9]+]].sub0:vreg_128 = COPY [[COPY18]].sub0 {
+ ; CHECK-NEXT: internal [[COPY19]].sub2:vreg_128 = COPY [[COPY18]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: undef [[COPY20:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_9]].sub2
+ ; CHECK-NEXT: [[COPY20:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec
+ ; CHECK-NEXT: undef [[COPY21:%[0-9]+]].sub0:vreg_128 = COPY [[COPY20]].sub0 {
+ ; CHECK-NEXT: internal [[COPY21]].sub2:vreg_128 = COPY [[COPY20]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e32_10:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec
+ ; CHECK-NEXT: undef [[COPY22:%[0-9]+]].sub0:vreg_128 = COPY [[V_LSHRREV_B32_e32_10]].sub0 {
+ ; CHECK-NEXT: internal [[COPY22]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_10]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: undef [[COPY23:%[0-9]+]].sub0:vreg_128 = COPY [[COPY22]].sub0 {
+ ; CHECK-NEXT: internal [[COPY23]].sub2:vreg_128 = COPY [[COPY22]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY23]], %stack.8, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.8, align 4, addrspace 5)
+ ; CHECK-NEXT: undef [[COPY24:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_11]].sub2
+ ; CHECK-NEXT: [[COPY24:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec
+ ; CHECK-NEXT: undef [[COPY25:%[0-9]+]].sub0:vreg_128 = COPY [[COPY24]].sub0 {
+ ; CHECK-NEXT: internal [[COPY25]].sub2:vreg_128 = COPY [[COPY24]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY25]], %stack.11, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.11, align 4, addrspace 5)
+ ; CHECK-NEXT: undef [[COPY26:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_12]].sub2
+ ; CHECK-NEXT: [[COPY26:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec
+ ; CHECK-NEXT: undef [[COPY27:%[0-9]+]].sub0:vreg_128 = COPY [[COPY26]].sub0 {
+ ; CHECK-NEXT: internal [[COPY27]].sub2:vreg_128 = COPY [[COPY26]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY27]], %stack.9, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.9, align 4, addrspace 5)
+ ; CHECK-NEXT: undef [[COPY28:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_13]].sub2
+ ; CHECK-NEXT: [[COPY28:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec
+ ; CHECK-NEXT: undef [[COPY29:%[0-9]+]].sub0:vreg_128 = COPY [[COPY28]].sub0 {
+ ; CHECK-NEXT: internal [[COPY29]].sub2:vreg_128 = COPY [[COPY28]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: undef [[COPY30:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_14]].sub2
+ ; CHECK-NEXT: [[COPY30:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec
+ ; CHECK-NEXT: undef [[COPY31:%[0-9]+]].sub0:vreg_128 = COPY [[COPY30]].sub0 {
+ ; CHECK-NEXT: internal [[COPY31]].sub2:vreg_128 = COPY [[COPY30]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY31]], %stack.10, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.10, align 4, addrspace 5)
+ ; CHECK-NEXT: undef [[COPY32:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_15]].sub2
+ ; CHECK-NEXT: [[COPY32:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec
+ ; CHECK-NEXT: undef [[COPY33:%[0-9]+]].sub0:vreg_128 = COPY [[COPY32]].sub0 {
+ ; CHECK-NEXT: internal [[COPY33]].sub2:vreg_128 = COPY [[COPY32]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: undef [[COPY34:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_16]].sub2
+ ; CHECK-NEXT: [[COPY34:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec
+ ; CHECK-NEXT: undef [[COPY35:%[0-9]+]].sub0:vreg_128 = COPY [[COPY34]].sub0 {
+ ; CHECK-NEXT: internal [[COPY35]].sub2:vreg_128 = COPY [[COPY34]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: undef [[COPY36:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_17]].sub2
+ ; CHECK-NEXT: [[COPY36:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec
+ ; CHECK-NEXT: undef [[COPY37:%[0-9]+]].sub0:vreg_128 = COPY [[COPY36]].sub0 {
+ ; CHECK-NEXT: internal [[COPY37]].sub2:vreg_128 = COPY [[COPY36]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: undef [[COPY38:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_18]].sub2
+ ; CHECK-NEXT: [[COPY38:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec
+ ; CHECK-NEXT: undef [[COPY39:%[0-9]+]].sub0:vreg_128 = COPY [[COPY38]].sub0 {
+ ; CHECK-NEXT: internal [[COPY39]].sub2:vreg_128 = COPY [[COPY38]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: undef [[COPY40:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_19]].sub2
+ ; CHECK-NEXT: [[COPY40:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec
+ ; CHECK-NEXT: undef [[COPY41:%[0-9]+]].sub0:vreg_128 = COPY [[COPY40]].sub0 {
+ ; CHECK-NEXT: internal [[COPY41]].sub2:vreg_128 = COPY [[COPY40]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_27:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
+ ; CHECK-NEXT: undef [[COPY42:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub2
+ ; CHECK-NEXT: [[COPY42:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
+ ; CHECK-NEXT: undef [[COPY43:%[0-9]+]].sub0:vreg_128 = COPY [[COPY42]].sub0 {
+ ; CHECK-NEXT: internal [[COPY43]].sub2:vreg_128 = COPY [[COPY42]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_28:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
+ ; CHECK-NEXT: undef [[COPY44:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_28]].sub2
+ ; CHECK-NEXT: [[COPY44:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
+ ; CHECK-NEXT: undef [[COPY45:%[0-9]+]].sub0:vreg_128 = COPY [[COPY44]].sub0 {
+ ; CHECK-NEXT: internal [[COPY45]].sub2:vreg_128 = COPY [[COPY44]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_29:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
+ ; CHECK-NEXT: undef [[COPY46:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_29]].sub2
+ ; CHECK-NEXT: [[COPY46:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
+ ; CHECK-NEXT: undef [[COPY47:%[0-9]+]].sub0:vreg_128 = COPY [[COPY46]].sub0 {
+ ; CHECK-NEXT: internal [[COPY47]].sub2:vreg_128 = COPY [[COPY46]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_30:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec
+ ; CHECK-NEXT: undef [[COPY48:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_30]].sub2
+ ; CHECK-NEXT: [[COPY48:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec
+ ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_31:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec
+ ; CHECK-NEXT: undef [[COPY49:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_31]].sub2
+ ; CHECK-NEXT: [[COPY49:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e32_20:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e32_21:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e32_22:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e32_23:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub1, implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e32_24:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e32_25:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub3, implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e32_26:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub2, implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e32_26:%[0-9]+]].sub1:vreg_128 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e32_26:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_26]], [[S_MOV_B32_]], 0, 480, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e32_25:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e32_25:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_25]], [[S_MOV_B32_]], 0, 496, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e32_24:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e32_24:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_24]], [[S_MOV_B32_]], 0, 448, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e32_23:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e32_23:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_23]], [[S_MOV_B32_]], 0, 464, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e32_22:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e32_22:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_22]], [[S_MOV_B32_]], 0, 416, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e32_21:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e32_21:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_21]], [[S_MOV_B32_]], 0, 432, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e32_20:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e32_20:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_20]], [[S_MOV_B32_]], 0, 384, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: undef [[COPY50:%[0-9]+]].sub0:vreg_128 = COPY [[COPY49]].sub0 {
+ ; CHECK-NEXT: internal [[COPY50]].sub2:vreg_128 = COPY [[COPY49]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: [[COPY50:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY50:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY50]], [[S_MOV_B32_]], 0, 400, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: undef [[COPY51:%[0-9]+]].sub0:vreg_128 = COPY [[COPY48]].sub0 {
+ ; CHECK-NEXT: internal [[COPY51]].sub2:vreg_128 = COPY [[COPY48]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: [[COPY51:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY51:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY51]], [[S_MOV_B32_]], 0, 352, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: undef [[COPY52:%[0-9]+]].sub0:vreg_128 = COPY [[COPY47]].sub0 {
+ ; CHECK-NEXT: internal [[COPY52]].sub2:vreg_128 = COPY [[COPY47]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: undef [[COPY53:%[0-9]+]].sub0:vreg_128 = COPY [[COPY52]].sub0 {
+ ; CHECK-NEXT: internal [[COPY53]].sub2:vreg_128 = COPY [[COPY52]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: [[COPY53:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY53:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY53]], [[S_MOV_B32_]], 0, 368, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: undef [[COPY54:%[0-9]+]].sub0:vreg_128 = COPY [[COPY45]].sub0 {
+ ; CHECK-NEXT: internal [[COPY54]].sub2:vreg_128 = COPY [[COPY45]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: undef [[COPY55:%[0-9]+]].sub0:vreg_128 = COPY [[COPY54]].sub0 {
+ ; CHECK-NEXT: internal [[COPY55]].sub2:vreg_128 = COPY [[COPY54]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: [[COPY55:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY55:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY55]], [[S_MOV_B32_]], 0, 320, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
+ ; CHECK-NEXT: undef [[COPY56:%[0-9]+]].sub0:vreg_128 = COPY [[COPY43]].sub0 {
+ ; CHECK-NEXT: internal [[COPY56]].sub2:vreg_128 = COPY [[COPY43]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: undef [[COPY57:%[0-9]+]].sub0:vreg_128 = COPY [[COPY56]].sub0 {
+ ; CHECK-NEXT: internal [[COPY57]].sub2:vreg_128 = COPY [[COPY56]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: [[COPY57:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY57:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY57]], [[S_MOV_B32_]], 0, 336, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: undef [[COPY58:%[0-9]+]].sub0:vreg_128 = COPY [[COPY41]].sub0 {
+ ; CHECK-NEXT: internal [[COPY58]].sub2:vreg_128 = COPY [[COPY41]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: undef [[COPY59:%[0-9]+]].sub0:vreg_128 = COPY [[COPY58]].sub0 {
+ ; CHECK-NEXT: internal [[COPY59]].sub2:vreg_128 = COPY [[COPY58]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: [[COPY59:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY59:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY59]], [[S_MOV_B32_]], 0, 288, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: undef [[COPY60:%[0-9]+]].sub0:vreg_128 = COPY [[COPY39]].sub0 {
+ ; CHECK-NEXT: internal [[COPY60]].sub2:vreg_128 = COPY [[COPY39]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: undef [[COPY61:%[0-9]+]].sub0:vreg_128 = COPY [[COPY60]].sub0 {
+ ; CHECK-NEXT: internal [[COPY61]].sub2:vreg_128 = COPY [[COPY60]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: [[COPY61:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY61:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY61]], [[S_MOV_B32_]], 0, 304, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: undef [[COPY62:%[0-9]+]].sub0:vreg_128 = COPY [[COPY37]].sub0 {
+ ; CHECK-NEXT: internal [[COPY62]].sub2:vreg_128 = COPY [[COPY37]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: undef [[COPY63:%[0-9]+]].sub0:vreg_128 = COPY [[COPY62]].sub0 {
+ ; CHECK-NEXT: internal [[COPY63]].sub2:vreg_128 = COPY [[COPY62]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: [[COPY63:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY63:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY63]], [[S_MOV_B32_]], 0, 256, 0, 0, implicit $exec :: (store (s128), align 256, addrspace 1)
+ ; CHECK-NEXT: undef [[COPY64:%[0-9]+]].sub0:vreg_128 = COPY [[COPY35]].sub0 {
+ ; CHECK-NEXT: internal [[COPY64]].sub2:vreg_128 = COPY [[COPY35]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: undef [[COPY65:%[0-9]+]].sub0:vreg_128 = COPY [[COPY64]].sub0 {
+ ; CHECK-NEXT: internal [[COPY65]].sub2:vreg_128 = COPY [[COPY64]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: [[COPY65:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY65:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY65]], [[S_MOV_B32_]], 0, 272, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: undef [[COPY66:%[0-9]+]].sub0:vreg_128 = COPY [[COPY33]].sub0 {
+ ; CHECK-NEXT: internal [[COPY66]].sub2:vreg_128 = COPY [[COPY33]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: undef [[COPY67:%[0-9]+]].sub0:vreg_128 = COPY [[COPY66]].sub0 {
+ ; CHECK-NEXT: internal [[COPY67]].sub2:vreg_128 = COPY [[COPY66]].sub2
+ ; CHECK-NEXT: }
+ ; CHECK-NEXT: [[COPY67:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY67:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY67]], [[S_MOV_B32_]], 0, 224, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
; CHECK-NEXT: [[SI_SPILL_V128_RESTORE:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.10, align 4, addrspace 5)
- ; CHECK-NEXT: undef %131.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE]].sub0 {
- ; CHECK-NEXT: internal %131.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE]].sub2
+ ; CHECK-NEXT: undef [[COPY68:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE]].sub0 {
+ ; CHECK-NEXT: internal [[COPY68]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: undef %128.sub0:vreg_128 = COPY %131.sub0 {
- ; CHECK-NEXT: internal %128.sub2:vreg_128 = COPY %131.sub2
+ ; CHECK-NEXT: undef [[COPY69:%[0-9]+]].sub0:vreg_128 = COPY [[COPY68]].sub0 {
+ ; CHECK-NEXT: internal [[COPY69]].sub2:vreg_128 = COPY [[COPY68]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: %128.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %128.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %128, %2, 0, 240, 0, 0, implicit $exec :: (store (s128), addrspace 1)
- ; CHECK-NEXT: undef %125.sub0:vreg_128 = COPY %126.sub0 {
- ; CHECK-NEXT: internal %125.sub2:vreg_128 = COPY %126.sub2
+ ; CHECK-NEXT: [[COPY69:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY69:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY69]], [[S_MOV_B32_]], 0, 240, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: undef [[COPY70:%[0-9]+]].sub0:vreg_128 = COPY [[COPY29]].sub0 {
+ ; CHECK-NEXT: internal [[COPY70]].sub2:vreg_128 = COPY [[COPY29]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: undef %122.sub0:vreg_128 = COPY %125.sub0 {
- ; CHECK-NEXT: internal %122.sub2:vreg_128 = COPY %125.sub2
+ ; CHECK-NEXT: undef [[COPY71:%[0-9]+]].sub0:vreg_128 = COPY [[COPY70]].sub0 {
+ ; CHECK-NEXT: internal [[COPY71]].sub2:vreg_128 = COPY [[COPY70]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: %122.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %122.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %122, %2, 0, 192, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
+ ; CHECK-NEXT: [[COPY71:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY71:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY71]], [[S_MOV_B32_]], 0, 192, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
; CHECK-NEXT: [[SI_SPILL_V128_RESTORE1:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.9, align 4, addrspace 5)
- ; CHECK-NEXT: undef %190.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE1]].sub0 {
- ; CHECK-NEXT: internal %190.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE1]].sub2
+ ; CHECK-NEXT: undef [[COPY72:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE1]].sub0 {
+ ; CHECK-NEXT: internal [[COPY72]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE1]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: undef %120.sub0:vreg_128 = COPY %190.sub0 {
- ; CHECK-NEXT: internal %120.sub2:vreg_128 = COPY %190.sub2
+ ; CHECK-NEXT: undef [[COPY73:%[0-9]+]].sub0:vreg_128 = COPY [[COPY72]].sub0 {
+ ; CHECK-NEXT: internal [[COPY73]].sub2:vreg_128 = COPY [[COPY72]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: %120.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %120.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %120, %2, 0, 208, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: [[COPY73:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY73:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY73]], [[S_MOV_B32_]], 0, 208, 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: [[SI_SPILL_V128_RESTORE2:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.11, align 4, addrspace 5)
- ; CHECK-NEXT: undef %211.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE2]].sub0 {
- ; CHECK-NEXT: internal %211.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE2]].sub2
+ ; CHECK-NEXT: undef [[COPY74:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE2]].sub0 {
+ ; CHECK-NEXT: internal [[COPY74]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE2]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: undef %208.sub0:vreg_128 = COPY %211.sub0 {
- ; CHECK-NEXT: internal %208.sub2:vreg_128 = COPY %211.sub2
+ ; CHECK-NEXT: undef [[COPY75:%[0-9]+]].sub0:vreg_128 = COPY [[COPY74]].sub0 {
+ ; CHECK-NEXT: internal [[COPY75]].sub2:vreg_128 = COPY [[COPY74]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: %208.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %208.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %208, %2, 0, 160, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: [[COPY75:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY75:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY75]], [[S_MOV_B32_]], 0, 160, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
; CHECK-NEXT: [[SI_SPILL_V128_RESTORE3:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.8, align 4, addrspace 5)
- ; CHECK-NEXT: undef %178.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE3]].sub0 {
- ; CHECK-NEXT: internal %178.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE3]].sub2
+ ; CHECK-NEXT: undef [[COPY76:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE3]].sub0 {
+ ; CHECK-NEXT: internal [[COPY76]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE3]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: undef %152.sub0:vreg_128 = COPY %178.sub0 {
- ; CHECK-NEXT: internal %152.sub2:vreg_128 = COPY %178.sub2
+ ; CHECK-NEXT: undef [[COPY77:%[0-9]+]].sub0:vreg_128 = COPY [[COPY76]].sub0 {
+ ; CHECK-NEXT: internal [[COPY77]].sub2:vreg_128 = COPY [[COPY76]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: %152.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %152.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %152, %2, 0, 176, 0, 0, implicit $exec :: (store (s128), addrspace 1)
- ; CHECK-NEXT: undef %115.sub0:vreg_128 = COPY %116.sub0 {
- ; CHECK-NEXT: internal %115.sub2:vreg_128 = COPY %116.sub2
+ ; CHECK-NEXT: [[COPY77:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY77:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY77]], [[S_MOV_B32_]], 0, 176, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: undef [[COPY78:%[0-9]+]].sub0:vreg_128 = COPY [[COPY21]].sub0 {
+ ; CHECK-NEXT: internal [[COPY78]].sub2:vreg_128 = COPY [[COPY21]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: undef %112.sub0:vreg_128 = COPY %115.sub0 {
- ; CHECK-NEXT: internal %112.sub2:vreg_128 = COPY %115.sub2
+ ; CHECK-NEXT: undef [[COPY79:%[0-9]+]].sub0:vreg_128 = COPY [[COPY78]].sub0 {
+ ; CHECK-NEXT: internal [[COPY79]].sub2:vreg_128 = COPY [[COPY78]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: %112.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %112.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %112, %2, 0, 128, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
- ; CHECK-NEXT: undef %109.sub0:vreg_128 = COPY %110.sub0 {
- ; CHECK-NEXT: internal %109.sub2:vreg_128 = COPY %110.sub2
+ ; CHECK-NEXT: [[COPY79:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY79:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY79]], [[S_MOV_B32_]], 0, 128, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: undef [[COPY80:%[0-9]+]].sub0:vreg_128 = COPY [[COPY19]].sub0 {
+ ; CHECK-NEXT: internal [[COPY80]].sub2:vreg_128 = COPY [[COPY19]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: undef %106.sub0:vreg_128 = COPY %109.sub0 {
- ; CHECK-NEXT: internal %106.sub2:vreg_128 = COPY %109.sub2
+ ; CHECK-NEXT: undef [[COPY81:%[0-9]+]].sub0:vreg_128 = COPY [[COPY80]].sub0 {
+ ; CHECK-NEXT: internal [[COPY81]].sub2:vreg_128 = COPY [[COPY80]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: %106.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %106.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %106, %2, 0, 144, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: [[COPY81:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY81:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY81]], [[S_MOV_B32_]], 0, 144, 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: [[SI_SPILL_V128_RESTORE4:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.6, align 4, addrspace 5)
- ; CHECK-NEXT: undef %103.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE4]].sub0 {
- ; CHECK-NEXT: internal %103.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE4]].sub2
+ ; CHECK-NEXT: undef [[COPY82:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE4]].sub0 {
+ ; CHECK-NEXT: internal [[COPY82]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE4]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: undef %100.sub0:vreg_128 = COPY %103.sub0 {
- ; CHECK-NEXT: internal %100.sub2:vreg_128 = COPY %103.sub2
+ ; CHECK-NEXT: undef [[COPY83:%[0-9]+]].sub0:vreg_128 = COPY [[COPY82]].sub0 {
+ ; CHECK-NEXT: internal [[COPY83]].sub2:vreg_128 = COPY [[COPY82]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: %100.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %100.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %100, %2, 0, 96, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: [[COPY83:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY83:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY83]], [[S_MOV_B32_]], 0, 96, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
; CHECK-NEXT: [[SI_SPILL_V128_RESTORE5:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.7, align 4, addrspace 5)
- ; CHECK-NEXT: undef %97.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE5]].sub0 {
- ; CHECK-NEXT: internal %97.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE5]].sub2
+ ; CHECK-NEXT: undef [[COPY84:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE5]].sub0 {
+ ; CHECK-NEXT: internal [[COPY84]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE5]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: undef %94.sub0:vreg_128 = COPY %97.sub0 {
- ; CHECK-NEXT: internal %94.sub2:vreg_128 = COPY %97.sub2
+ ; CHECK-NEXT: undef [[COPY85:%[0-9]+]].sub0:vreg_128 = COPY [[COPY84]].sub0 {
+ ; CHECK-NEXT: internal [[COPY85]].sub2:vreg_128 = COPY [[COPY84]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: %94.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %94.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %94, %2, 0, 112, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: [[COPY85:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY85:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY85]], [[S_MOV_B32_]], 0, 112, 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: [[SI_SPILL_V128_RESTORE6:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.5, align 4, addrspace 5)
- ; CHECK-NEXT: undef %89.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE6]].sub0 {
- ; CHECK-NEXT: internal %89.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE6]].sub2
+ ; CHECK-NEXT: undef [[COPY86:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE6]].sub0 {
+ ; CHECK-NEXT: internal [[COPY86]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE6]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: undef %86.sub0:vreg_128 = COPY %89.sub0 {
- ; CHECK-NEXT: internal %86.sub2:vreg_128 = COPY %89.sub2
+ ; CHECK-NEXT: undef [[COPY87:%[0-9]+]].sub0:vreg_128 = COPY [[COPY86]].sub0 {
+ ; CHECK-NEXT: internal [[COPY87]].sub2:vreg_128 = COPY [[COPY86]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: %86.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %86.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %86, %2, 0, 64, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
+ ; CHECK-NEXT: [[COPY87:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY87:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY87]], [[S_MOV_B32_]], 0, 64, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
; CHECK-NEXT: [[SI_SPILL_V128_RESTORE7:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
- ; CHECK-NEXT: undef %81.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE7]].sub0 {
- ; CHECK-NEXT: internal %81.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE7]].sub2
+ ; CHECK-NEXT: undef [[COPY88:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE7]].sub0 {
+ ; CHECK-NEXT: internal [[COPY88]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE7]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: undef %78.sub0:vreg_128 = COPY %81.sub0 {
- ; CHECK-NEXT: internal %78.sub2:vreg_128 = COPY %81.sub2
+ ; CHECK-NEXT: undef [[COPY89:%[0-9]+]].sub0:vreg_128 = COPY [[COPY88]].sub0 {
+ ; CHECK-NEXT: internal [[COPY89]].sub2:vreg_128 = COPY [[COPY88]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: %78.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %78.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %78, %2, 0, 80, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: [[COPY89:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY89:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY89]], [[S_MOV_B32_]], 0, 80, 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: [[SI_SPILL_V128_RESTORE8:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5)
- ; CHECK-NEXT: undef %73.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE8]].sub0 {
- ; CHECK-NEXT: internal %73.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE8]].sub2
+ ; CHECK-NEXT: undef [[COPY90:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE8]].sub0 {
+ ; CHECK-NEXT: internal [[COPY90]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE8]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: undef %70.sub0:vreg_128 = COPY %73.sub0 {
- ; CHECK-NEXT: internal %70.sub2:vreg_128 = COPY %73.sub2
+ ; CHECK-NEXT: undef [[COPY91:%[0-9]+]].sub0:vreg_128 = COPY [[COPY90]].sub0 {
+ ; CHECK-NEXT: internal [[COPY91]].sub2:vreg_128 = COPY [[COPY90]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: %70.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %70.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %70, %2, 0, 32, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: [[COPY91:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY91:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY91]], [[S_MOV_B32_]], 0, 32, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
; CHECK-NEXT: [[SI_SPILL_V128_RESTORE9:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
- ; CHECK-NEXT: undef %65.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE9]].sub0 {
- ; CHECK-NEXT: internal %65.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE9]].sub2
+ ; CHECK-NEXT: undef [[COPY92:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE9]].sub0 {
+ ; CHECK-NEXT: internal [[COPY92]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE9]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: undef %62.sub0:vreg_128 = COPY %65.sub0 {
- ; CHECK-NEXT: internal %62.sub2:vreg_128 = COPY %65.sub2
+ ; CHECK-NEXT: undef [[COPY93:%[0-9]+]].sub0:vreg_128 = COPY [[COPY92]].sub0 {
+ ; CHECK-NEXT: internal [[COPY93]].sub2:vreg_128 = COPY [[COPY92]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: %62.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %62.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %62, %2, 0, 48, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: [[COPY93:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY93:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY93]], [[S_MOV_B32_]], 0, 48, 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: [[SI_SPILL_V128_RESTORE10:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5)
- ; CHECK-NEXT: undef %57.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE10]].sub0 {
- ; CHECK-NEXT: internal %57.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE10]].sub2
+ ; CHECK-NEXT: undef [[COPY94:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE10]].sub0 {
+ ; CHECK-NEXT: internal [[COPY94]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE10]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: undef %54.sub0:vreg_128 = COPY %57.sub0 {
- ; CHECK-NEXT: internal %54.sub2:vreg_128 = COPY %57.sub2
+ ; CHECK-NEXT: undef [[COPY95:%[0-9]+]].sub0:vreg_128 = COPY [[COPY94]].sub0 {
+ ; CHECK-NEXT: internal [[COPY95]].sub2:vreg_128 = COPY [[COPY94]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: %54.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %54.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %54, %2, 0, 0, 0, 0, implicit $exec :: (store (s128), align 512, addrspace 1)
+ ; CHECK-NEXT: [[COPY95:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY95:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY95]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (store (s128), align 512, addrspace 1)
; CHECK-NEXT: [[SI_SPILL_V128_RESTORE11:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
- ; CHECK-NEXT: undef %49.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE11]].sub0 {
- ; CHECK-NEXT: internal %49.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE11]].sub2
+ ; CHECK-NEXT: undef [[COPY96:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE11]].sub0 {
+ ; CHECK-NEXT: internal [[COPY96]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE11]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: undef %46.sub0:vreg_128 = COPY %49.sub0 {
- ; CHECK-NEXT: internal %46.sub2:vreg_128 = COPY %49.sub2
+ ; CHECK-NEXT: undef [[COPY97:%[0-9]+]].sub0:vreg_128 = COPY [[COPY96]].sub0 {
+ ; CHECK-NEXT: internal [[COPY97]].sub2:vreg_128 = COPY [[COPY96]].sub2
; CHECK-NEXT: }
- ; CHECK-NEXT: %46.sub1:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: %46.sub3:vreg_128 = COPY %43.sub1
- ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %46, %2, 0, 16, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: [[COPY97:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: [[COPY97:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
+ ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY97]], [[S_MOV_B32_]], 0, 16, 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%0:sgpr_64(p4) = COPY $sgpr0_sgpr1
%1:sgpr_128 = S_LOAD_DWORDX4_IMM %0(p4), 9, 0 :: (dereferenceable invariant load (s128), align 4, addrspace 4)
diff --git a/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir b/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir
index e10d882aeb29..b428e859a6d3 100644
--- a/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir
+++ b/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir
@@ -20,18 +20,18 @@ body: |
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: %0.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (load (s32), addrspace 3)
- ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def %0, 2147549193 /* reguse tiedto:$1 */, %0(tied-def 3)
+ ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (load (s32), addrspace 3)
+ ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def [[V_MOV_B32_e32_]], 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_]](tied-def 3)
; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_]]
- ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def undef %0.sub0, 851978 /* regdef:VGPR_16 */, def undef %0.sub1
- ; CHECK-NEXT: S_NOP 0, implicit %0.sub1
+ ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def undef [[V_MOV_B32_e32_]].sub0, 851978 /* regdef:VGPR_16 */, def undef [[V_MOV_B32_e32_]].sub1
+ ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1
; CHECK-NEXT: $sgpr10 = S_MOV_B32 -1
; CHECK-NEXT: S_BRANCH %bb.1
bb.0:
@@ -61,18 +61,18 @@ body: |
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: %0.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (load (s32), addrspace 3)
- ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def %0, 2147549193 /* reguse tiedto:$1 */, %0(tied-def 3)
+ ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (load (s32), addrspace 3)
+ ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def [[V_MOV_B32_e32_]], 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_]](tied-def 3)
; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_]]
- ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def undef %0.sub1, 851978 /* regdef:VGPR_16 */, def undef %0.sub0
- ; CHECK-NEXT: S_NOP 0, implicit %0.sub1
+ ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def undef [[V_MOV_B32_e32_]].sub1, 851978 /* regdef:VGPR_16 */, def undef [[V_MOV_B32_e32_]].sub0
+ ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1
; CHECK-NEXT: $sgpr10 = S_MOV_B32 -1
; CHECK-NEXT: S_BRANCH %bb.1
bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/trans-forwarding-hazards.mir b/llvm/test/CodeGen/AMDGPU/trans-forwarding-hazards.mir
index 24e0b11786f5..2843d83d3a2f 100644
--- a/llvm/test/CodeGen/AMDGPU/trans-forwarding-hazards.mir
+++ b/llvm/test/CodeGen/AMDGPU/trans-forwarding-hazards.mir
@@ -12,6 +12,7 @@ body: |
; GFX11-NEXT: S_WAITCNT_DEPCTR 4095
; GFX11-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX1150-LABEL: name: trans_use_1_hazard
; GFX1150: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
; GFX1150-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
@@ -61,6 +62,7 @@ body: |
; GFX11-NEXT: S_WAITCNT_DEPCTR 4095
; GFX11-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX1150-LABEL: name: trans_use_2_hazard
; GFX1150: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
; GFX1150-NEXT: $sgpr0 = S_MOV_B32 0
@@ -108,6 +110,7 @@ body: |
; GFX11-NEXT: S_WAITCNT_DEPCTR 4095
; GFX11-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX1150-LABEL: name: trans_use_3_hazard
; GFX1150: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
; GFX1150-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
@@ -207,6 +210,7 @@ body: |
; GFX11-NEXT: $vgpr5 = V_ADD_F32_e32 $vgpr1, $vgpr4, implicit $mode, implicit $exec
; GFX11-NEXT: $vgpr7 = V_ADD_F32_e32 $vgpr3, $vgpr6, implicit $mode, implicit $exec
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX1150-LABEL: name: trans_use_4_one_depctr_1
; GFX1150: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
; GFX1150-NEXT: $vgpr3 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec
@@ -231,6 +235,7 @@ body: |
; GFX11-NEXT: $vgpr5 = V_ADD_F32_e32 $vgpr3, $vgpr4, implicit $mode, implicit $exec
; GFX11-NEXT: $vgpr7 = V_ADD_F32_e32 $vgpr1, $vgpr6, implicit $mode, implicit $exec
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX1150-LABEL: name: trans_use_4_one_depctr_2
; GFX1150: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
; GFX1150-NEXT: $vgpr3 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec
@@ -255,6 +260,7 @@ body: |
; GFX11-NEXT: S_WAITCNT_DEPCTR 4095
; GFX11-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX1150-LABEL: name: trans_use_4
; GFX1150: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec
; GFX1150-NEXT: $vgpr10 = V_SQRT_F32_e32 $vgpr11, implicit $mode, implicit $exec
@@ -293,6 +299,7 @@ body: |
; GFX11-NEXT: S_WAITCNT_DEPCTR 4095
; GFX11-NEXT: $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX1150-LABEL: name: trans_use_branching_1a
; GFX1150: bb.0:
; GFX1150-NEXT: successors: %bb.2(0x80000000)
@@ -353,6 +360,7 @@ body: |
; GFX11-NEXT: S_WAITCNT_DEPCTR 4095
; GFX11-NEXT: $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX1150-LABEL: name: trans_use_branching_1b
; GFX1150: bb.0:
; GFX1150-NEXT: successors: %bb.2(0x80000000)
diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
index 5b7702485205..eef5f57beb07 100644
--- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
+++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
@@ -5,8 +5,9 @@
define amdgpu_ps float @simple_test_return_to_epilog(float %a) #0 {
; GCN-LABEL: name: simple_test_return_to_epilog
; GCN: bb.0.entry:
- ; GCN: liveins: $vgpr0
- ; GCN: SI_RETURN_TO_EPILOG killed $vgpr0
+ ; GCN-NEXT: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0
entry:
ret float %a
}
@@ -14,20 +15,26 @@ entry:
define amdgpu_ps float @test_return_to_epilog_into_end_block(i32 inreg %a, float %b) #0 {
; GCN-LABEL: name: test_return_to_epilog_into_end_block
; GCN: bb.0.entry:
- ; GCN: successors: %bb.1(0x80000000), %bb.2(0x00000000)
- ; GCN: liveins: $sgpr2, $vgpr0
- ; GCN: S_CMP_LT_I32 killed renamable $sgpr2, 1, implicit-def $scc
- ; GCN: S_CBRANCH_SCC1 %bb.2, implicit killed $scc
- ; GCN: bb.1.if:
- ; GCN: successors: %bb.3(0x80000000)
- ; GCN: liveins: $vgpr0
- ; GCN: S_BRANCH %bb.3
- ; GCN: bb.2.else:
- ; GCN: successors:
- ; GCN: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
- ; GCN: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GCN: S_WAITCNT_soft 3952
- ; GCN: bb.3:
+ ; GCN-NEXT: successors: %bb.1(0x80000000), %bb.2(0x00000000)
+ ; GCN-NEXT: liveins: $sgpr2, $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_CMP_LT_I32 killed renamable $sgpr2, 1, implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1.if:
+ ; GCN-NEXT: successors: %bb.3(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_BRANCH %bb.3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2.else:
+ ; GCN-NEXT: successors:
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+ ; GCN-NEXT: S_WAITCNT_soft 3952
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.3:
entry:
%cc = icmp sgt i32 %a, 0
br i1 %cc, label %if, label %else
@@ -41,30 +48,40 @@ else: ; preds = %entry
define amdgpu_ps float @test_unify_return_to_epilog_into_end_block(i32 inreg %a, i32 inreg %b, float %c, float %d) #0 {
; GCN-LABEL: name: test_unify_return_to_epilog_into_end_block
; GCN: bb.0.entry:
- ; GCN: successors: %bb.1(0x50000000), %bb.2(0x30000000)
- ; GCN: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1
- ; GCN: S_CMP_LT_I32 killed renamable $sgpr2, 1, implicit-def $scc
- ; GCN: S_CBRANCH_SCC1 %bb.2, implicit killed $scc
- ; GCN: bb.1.if:
- ; GCN: successors: %bb.5(0x80000000)
- ; GCN: liveins: $vgpr0
- ; GCN: S_BRANCH %bb.5
- ; GCN: bb.2.else.if.cond:
- ; GCN: successors: %bb.3(0x80000000), %bb.4(0x00000000)
- ; GCN: liveins: $sgpr3, $vgpr1
- ; GCN: S_CMP_LT_I32 killed renamable $sgpr3, 1, implicit-def $scc
- ; GCN: S_CBRANCH_SCC1 %bb.4, implicit killed $scc
- ; GCN: bb.3.else.if:
- ; GCN: successors: %bb.5(0x80000000)
- ; GCN: liveins: $vgpr1
- ; GCN: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec, implicit $exec
- ; GCN: S_BRANCH %bb.5
- ; GCN: bb.4.else:
- ; GCN: successors:
- ; GCN: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
- ; GCN: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GCN: S_WAITCNT_soft 3952
- ; GCN: bb.5:
+ ; GCN-NEXT: successors: %bb.1(0x50000000), %bb.2(0x30000000)
+ ; GCN-NEXT: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_CMP_LT_I32 killed renamable $sgpr2, 1, implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1.if:
+ ; GCN-NEXT: successors: %bb.5(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_BRANCH %bb.5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2.else.if.cond:
+ ; GCN-NEXT: successors: %bb.3(0x80000000), %bb.4(0x00000000)
+ ; GCN-NEXT: liveins: $sgpr3, $vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_CMP_LT_I32 killed renamable $sgpr3, 1, implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.3.else.if:
+ ; GCN-NEXT: successors: %bb.5(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec, implicit $exec
+ ; GCN-NEXT: S_BRANCH %bb.5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.4.else:
+ ; GCN-NEXT: successors:
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+ ; GCN-NEXT: S_WAITCNT_soft 3952
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.5:
entry:
%cc = icmp sgt i32 %a, 0
br i1 %cc, label %if, label %else.if.cond
@@ -81,60 +98,77 @@ else: ; preds = %else.if.cond
}
define amdgpu_ps { <4 x float> } @test_return_to_epilog_with_optimized_kill(float %val) #0 {
-; GCN-LABEL: name: test_return_to_epilog_with_optimized_kill
-; GCN: bb.0 (%ir-block.0):
-; GCN: successors: %bb.3(0x40000000), %bb.1(0x40000000)
-; GCN: liveins: $vgpr0
-; GCN: renamable $vgpr1 = nofpexcept V_RCP_F32_e32 $vgpr0, implicit $mode, implicit $exec
-; GCN: $sgpr0_sgpr1 = S_MOV_B64 $exec
-; GCN: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr1, implicit-def $vcc, implicit $mode, implicit $exec
-; GCN: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
-; GCN: renamable $sgpr2_sgpr3 = S_XOR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def dead $scc
-; GCN: S_CBRANCH_EXECNZ %bb.3, implicit $exec
-; GCN: bb.1.Flow1:
-; GCN: successors: %bb.6(0x40000000), %bb.2(0x40000000)
-; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
-; GCN: $sgpr2_sgpr3 = S_ANDN2_SAVEEXEC_B64 killed $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec
-; GCN: S_CBRANCH_EXECNZ %bb.6, implicit $exec
-; GCN: bb.2.end:
-; GCN: successors: %bb.9(0x80000000)
-; GCN: liveins: $sgpr2_sgpr3
-; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc
-; GCN: S_BRANCH %bb.9
-; GCN: bb.3.flow.preheader:
-; GCN: successors: %bb.4(0x80000000)
-; GCN: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3
-; GCN: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $mode, implicit $exec
-; GCN: renamable $sgpr4_sgpr5 = S_MOV_B64 0
-; GCN: bb.4.flow:
-; GCN: successors: %bb.5(0x04000000), %bb.4(0x7c000000)
-; GCN: liveins: $vcc, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5
-; GCN: renamable $sgpr6_sgpr7 = S_AND_B64 $exec, renamable $vcc, implicit-def $scc
-; GCN: renamable $sgpr4_sgpr5 = S_OR_B64 killed renamable $sgpr6_sgpr7, killed renamable $sgpr4_sgpr5, implicit-def $scc
-; GCN: $exec = S_ANDN2_B64 $exec, renamable $sgpr4_sgpr5, implicit-def $scc
-; GCN: S_CBRANCH_EXECNZ %bb.4, implicit $exec
-; GCN: bb.5.Flow:
-; GCN: successors: %bb.6(0x40000000), %bb.2(0x40000000)
-; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5
-; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
-; GCN: $sgpr2_sgpr3 = S_ANDN2_SAVEEXEC_B64 killed $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec
-; GCN: S_CBRANCH_EXECZ %bb.2, implicit $exec
-; GCN: bb.6.kill0:
-; GCN: successors: %bb.7(0x40000000), %bb.8(0x40000000)
-; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
-; GCN: dead renamable $sgpr0_sgpr1 = S_ANDN2_B64 killed renamable $sgpr0_sgpr1, $exec, implicit-def $scc
-; GCN: S_CBRANCH_SCC0 %bb.8, implicit $scc
-; GCN: bb.7.kill0:
-; GCN: successors: %bb.9(0x80000000)
-; GCN: liveins: $sgpr2_sgpr3, $scc
-; GCN: $exec = S_MOV_B64 0
-; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc
-; GCN: S_BRANCH %bb.9
-; GCN: bb.8:
-; GCN: $exec = S_MOV_B64 0
-; GCN: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec
-; GCN: S_ENDPGM 0
-; GCN: bb.9:
+ ; GCN-LABEL: name: test_return_to_epilog_with_optimized_kill
+ ; GCN: bb.0 (%ir-block.0):
+ ; GCN-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ ; GCN-NEXT: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_RCP_F32_e32 $vgpr0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
+ ; GCN-NEXT: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr1, implicit-def $vcc, implicit $mode, implicit $exec
+ ; GCN-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_XOR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def dead $scc
+ ; GCN-NEXT: S_CBRANCH_EXECNZ %bb.3, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1.Flow1:
+ ; GCN-NEXT: successors: %bb.6(0x40000000), %bb.2(0x40000000)
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr2_sgpr3 = S_ANDN2_SAVEEXEC_B64 killed $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GCN-NEXT: S_CBRANCH_EXECNZ %bb.6, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2.end:
+ ; GCN-NEXT: successors: %bb.9(0x80000000)
+ ; GCN-NEXT: liveins: $sgpr2_sgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc
+ ; GCN-NEXT: S_BRANCH %bb.9
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.3.flow.preheader:
+ ; GCN-NEXT: successors: %bb.4(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $mode, implicit $exec
+ ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_MOV_B64 0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.4.flow:
+ ; GCN-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000)
+ ; GCN-NEXT: liveins: $vcc, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: renamable $sgpr6_sgpr7 = S_AND_B64 $exec, renamable $vcc, implicit-def $scc
+ ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_OR_B64 killed renamable $sgpr6_sgpr7, killed renamable $sgpr4_sgpr5, implicit-def $scc
+ ; GCN-NEXT: $exec = S_ANDN2_B64 $exec, renamable $sgpr4_sgpr5, implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_EXECNZ %bb.4, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.5.Flow:
+ ; GCN-NEXT: successors: %bb.6(0x40000000), %bb.2(0x40000000)
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
+ ; GCN-NEXT: $sgpr2_sgpr3 = S_ANDN2_SAVEEXEC_B64 killed $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.6.kill0:
+ ; GCN-NEXT: successors: %bb.7(0x40000000), %bb.8(0x40000000)
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: dead renamable $sgpr0_sgpr1 = S_ANDN2_B64 killed renamable $sgpr0_sgpr1, $exec, implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC0 %bb.8, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.7.kill0:
+ ; GCN-NEXT: successors: %bb.9(0x80000000)
+ ; GCN-NEXT: liveins: $sgpr2_sgpr3, $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $exec = S_MOV_B64 0
+ ; GCN-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc
+ ; GCN-NEXT: S_BRANCH %bb.9
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.8:
+ ; GCN-NEXT: $exec = S_MOV_B64 0
+ ; GCN-NEXT: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.9:
%.i0 = fdiv reassoc nnan nsz arcp contract afn float 1.000000e+00, %val
%cmp0 = fcmp olt float %.i0, 0.000000e+00
br i1 %cmp0, label %kill0, label %flow
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
index d1cc927b5565..e668c1d2b7f3 100644
--- a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
@@ -9,16 +9,16 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY killed $sgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY killed $sgpr1
- ; CHECK-NEXT: undef %0.sub0:sreg_64 = COPY killed [[COPY]]
- ; CHECK-NEXT: %0.sub1:sreg_64 = COPY killed [[COPY1]]
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 0, 0 :: (invariant load (<2 x s32>) from %ir.ptr, align 4, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed %0, 8, 0 :: (invariant load (s32) from %ir.ptr + 8, addrspace 4)
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
- ; CHECK-NEXT: $sgpr0 = COPY killed [[COPY2]]
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
- ; CHECK-NEXT: $sgpr1 = COPY killed [[COPY3]]
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]]
- ; CHECK-NEXT: $sgpr2 = COPY killed [[COPY4]]
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:sreg_64 = COPY killed [[COPY]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:sreg_64 = COPY killed [[COPY1]]
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (invariant load (<2 x s32>) from %ir.ptr, align 4, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[COPY2]], 8, 0 :: (invariant load (s32) from %ir.ptr + 8, addrspace 4)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+ ; CHECK-NEXT: $sgpr0 = COPY killed [[COPY3]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
+ ; CHECK-NEXT: $sgpr1 = COPY killed [[COPY4]]
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]]
+ ; CHECK-NEXT: $sgpr2 = COPY killed [[COPY5]]
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit killed $sgpr0, implicit killed $sgpr1, implicit killed $sgpr2
%load = load <3 x i32>, ptr addrspace(4) %ptr, align 4
ret <3 x i32> %load
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-regsequence.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-regsequence.mir
index 3b641092fb99..f45a918d1d0f 100644
--- a/llvm/test/CodeGen/AMDGPU/twoaddr-regsequence.mir
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-regsequence.mir
@@ -12,9 +12,9 @@ body: |
; CHECK-LABEL: name: f
; CHECK: liveins: $vgpr0, $vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %2.sub0:vreg_64 = COPY $vgpr0
- ; CHECK-NEXT: %2.sub1:vreg_64 = COPY $vgpr1
- ; CHECK-NEXT: $vgpr2_vgpr3 = COPY %2
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1
+ ; CHECK-NEXT: $vgpr2_vgpr3 = COPY [[COPY]]
; CHECK-NEXT: S_NOP 0, implicit $vgpr2_vgpr3
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/undef-subreg-use-after-coalesce.mir b/llvm/test/CodeGen/AMDGPU/undef-subreg-use-after-coalesce.mir
index 99c955319c64..c71bc2b9c456 100644
--- a/llvm/test/CodeGen/AMDGPU/undef-subreg-use-after-coalesce.mir
+++ b/llvm/test/CodeGen/AMDGPU/undef-subreg-use-after-coalesce.mir
@@ -12,9 +12,9 @@ tracksRegLiveness: true
body: |
bb.0:
; CHECK-LABEL: name: undef_subreg_use_after_full_copy_coalesce_0
- ; CHECK: undef %0.sub0:vreg_96 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: dead %0.sub1:vreg_96 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: S_ENDPGM 0, implicit undef %0.sub2
+ ; CHECK: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_96 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: dead [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_96 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0, implicit undef [[V_MOV_B32_e32_]].sub2
undef %0.sub0:vreg_96 = V_MOV_B32_e32 0, implicit $exec
%0.sub1:vreg_96 = V_MOV_B32_e32 0, implicit $exec
%1:vreg_96 = COPY killed %0
@@ -30,8 +30,8 @@ tracksRegLiveness: true
body: |
bb.0:
; CHECK-LABEL: name: undef_subreg_use_after_full_copy_coalesce_composed
- ; CHECK: undef %0.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: dead %0.sub1:vreg_128 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: dead [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_128 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK-NEXT: S_ENDPGM 0, implicit [[DEF]].sub1
undef %0.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec
@@ -66,10 +66,10 @@ body: |
; CHECK-LABEL: name: undef_subreg_use_after_full_copy_coalesce_1
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %2.sub0:vreg_96 = COPY $vgpr0
- ; CHECK-NEXT: %2.sub1:vreg_96 = COPY $vgpr1
- ; CHECK-NEXT: S_NOP 0, implicit undef %2.sub2
- ; CHECK-NEXT: S_NOP 0, implicit %2.sub1
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_96 = COPY $vgpr1
+ ; CHECK-NEXT: S_NOP 0, implicit undef [[COPY]].sub2
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY]].sub1
; CHECK-NEXT: S_ENDPGM 0
%0:vgpr_32 = COPY killed $vgpr0
%1:vgpr_32 = COPY killed $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
index 801092238a10..65feaf23ae2c 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
@@ -51,6 +51,7 @@ body: |
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_getpc1
; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
@@ -75,6 +76,7 @@ body: |
; GFX11-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-lo) @mem + 16, implicit-def $scc, implicit $scc
; GFX11-NEXT: }
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_getpc2
; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
; GFX12-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
@@ -101,6 +103,7 @@ body: |
; GFX11-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_vcc1
; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
; GFX12-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
@@ -119,6 +122,7 @@ body: |
; GFX11-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_vcc2
; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
; GFX12-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
@@ -137,6 +141,7 @@ body: |
; GFX11-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_cndmask_dpp1
; GFX12: $vgpr0 = V_CNDMASK_B32_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, 1, 15, 15, 1, implicit $vcc, implicit $exec
; GFX12-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
@@ -155,6 +160,7 @@ body: |
; GFX11-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_cndmask_dpp2
; GFX12: $vgpr0 = V_CNDMASK_B32_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec
; GFX12-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
@@ -173,6 +179,7 @@ body: |
; GFX11-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_cndmask_dpp4
; GFX12: $vgpr0 = V_CNDMASK_B16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec
; GFX12-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
@@ -191,6 +198,7 @@ body: |
; GFX11-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_addc1
; GFX12: $vgpr1, $vcc = V_ADDC_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec
; GFX12-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
@@ -209,6 +217,7 @@ body: |
; GFX11-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_addc2
; GFX12: $vgpr1 = V_ADDC_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
; GFX12-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
@@ -227,6 +236,7 @@ body: |
; GFX11-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_addc3
; GFX12: $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
; GFX12-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
@@ -245,6 +255,7 @@ body: |
; GFX11-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_addc4
; GFX12: $vgpr0, $sgpr2_sgpr3 = V_ADDC_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec
; GFX12-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
@@ -263,6 +274,7 @@ body: |
; GFX11-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_subb1
; GFX12: $vgpr1, $vcc = V_SUBB_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec
; GFX12-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
@@ -281,6 +293,7 @@ body: |
; GFX11-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_subb2
; GFX12: $vgpr1 = V_SUBB_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
; GFX12-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
@@ -299,6 +312,7 @@ body: |
; GFX11-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_subb3
; GFX12: $vgpr0 = V_SUBB_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
; GFX12-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
@@ -317,6 +331,7 @@ body: |
; GFX11-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_subb4
; GFX12: $vgpr0, $sgpr2_sgpr3 = V_SUBB_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec
; GFX12-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
@@ -335,6 +350,7 @@ body: |
; GFX11-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_subbrev1
; GFX12: $vgpr1, $vcc = V_SUBBREV_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec
; GFX12-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
@@ -353,6 +369,7 @@ body: |
; GFX11-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_subbrev2
; GFX12: $vgpr1 = V_SUBBREV_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
; GFX12-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
@@ -371,6 +388,7 @@ body: |
; GFX11-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_subbrev3
; GFX12: $vgpr0 = V_SUBBREV_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
; GFX12-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
@@ -389,6 +407,7 @@ body: |
; GFX11-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_subbrev4
; GFX12: $vgpr0, $sgpr2_sgpr3 = V_SUBBREV_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec
; GFX12-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
@@ -407,6 +426,7 @@ body: |
; GFX11-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_div_fmas_f32
; GFX12: $vgpr0 = V_DIV_FMAS_F32_e64 0, $vgpr1, 0, $vgpr2, 0, $vgpr3, 0, 0, implicit $mode, implicit $vcc, implicit $exec
; GFX12-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
@@ -425,6 +445,7 @@ body: |
; GFX11-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_div_fmas_f64
; GFX12: $vgpr0_vgpr1 = V_DIV_FMAS_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $vcc, implicit $exec
; GFX12-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc
@@ -444,6 +465,7 @@ body: |
; GFX11-NEXT: $sgpr2 = S_MOV_B32 0
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_subreg1
; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
; GFX12-NEXT: $sgpr2 = S_MOV_B32 0
@@ -463,6 +485,7 @@ body: |
; GFX11-NEXT: $sgpr3 = S_MOV_B32 0
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_subreg2
; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
; GFX12-NEXT: $sgpr3 = S_MOV_B32 0
@@ -483,6 +506,7 @@ body: |
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: $sgpr3 = S_MOV_B32 0
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_subreg3
; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
; GFX12-NEXT: $sgpr2 = S_MOV_B32 0
@@ -505,6 +529,7 @@ body: |
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: $sgpr2 = S_MOV_B32 $vcc_lo
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_subreg4
; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
; GFX12-NEXT: $vcc_lo = S_MOV_B32 0
@@ -527,6 +552,7 @@ body: |
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: $sgpr2 = S_MOV_B32 $vcc_hi
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_subreg5
; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
; GFX12-NEXT: $vcc_hi = S_MOV_B32 0
@@ -550,6 +576,7 @@ body: |
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_waitcnt
; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
; GFX12-NEXT: S_WAITCNT 0
@@ -576,6 +603,7 @@ body: |
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_gap1
; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
; GFX12-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
@@ -603,6 +631,7 @@ body: |
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_gap2
; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
; GFX12-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode
@@ -628,6 +657,7 @@ body: |
; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: mask_hazard_gap3
; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
; GFX12-NEXT: $vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
index 137bd0f5d9f1..4efa1e9353ab 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
@@ -28,6 +28,7 @@ define amdgpu_cs void @max_6_vgprs(ptr addrspace(1) %p) "amdgpu-num-vgpr"="6" {
; CHECK-NEXT: scratch_store_b32 off, v0, off offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v5, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: scratch_load_b32 v0, off, off th:TH_LOAD_LU ; 4-byte Folded Reload
@@ -116,6 +117,7 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp
; CHECK-NEXT: scratch_store_b32 off, v0, off offset:32 ; 4-byte Folded Spill
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v10, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
@@ -174,6 +176,7 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp
; CHECK-NEXT: scratch_store_b32 off, v0, off offset:32 ; 4-byte Folded Spill
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v10, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
@@ -208,6 +211,7 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0
; CHECK-NEXT: scratch_load_b32 v0, off, off th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir b/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir
index 5cbecacc4ae7..08f5550f3b08 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir
@@ -12,8 +12,8 @@ body: |
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK-NEXT: liveins: $sgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef %4.sub0:vreg_96 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: %4.sub1:vreg_96 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_96 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_96 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0
; CHECK-NEXT: $exec = S_MOV_B64_term [[COPY]]
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
@@ -22,11 +22,11 @@ body: |
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: %4.sub0:vreg_96 = V_MUL_F32_e32 %4.sub0, %4.sub0, implicit $mode, implicit $exec
- ; CHECK-NEXT: %4.sub1:vreg_96 = V_MUL_F32_e32 %4.sub1, %4.sub1, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_96 = V_MUL_F32_e32 [[V_MOV_B32_e32_]].sub0, [[V_MOV_B32_e32_]].sub0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_96 = V_MUL_F32_e32 [[V_MOV_B32_e32_]].sub1, [[V_MOV_B32_e32_]].sub1, implicit $mode, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: S_ENDPGM 0, implicit %4
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]]
bb.0:
liveins: $sgpr0
%0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir b/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir
index 93b98df2f7db..dd3572c027c8 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir
@@ -38,6 +38,7 @@ body: |
; MUBUF-NEXT: {{ $}}
; MUBUF-NEXT: bb.2:
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; GFX9-FLATSCR-LABEL: name: vgpr32_save_clobber_scc
; GFX9-FLATSCR: bb.0:
; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -56,6 +57,7 @@ body: |
; GFX9-FLATSCR-NEXT: {{ $}}
; GFX9-FLATSCR-NEXT: bb.2:
; GFX9-FLATSCR-NEXT: S_ENDPGM 0
+ ;
; GFX10-FLATSCR-LABEL: name: vgpr32_save_clobber_scc
; GFX10-FLATSCR: bb.0:
; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -73,6 +75,7 @@ body: |
; GFX10-FLATSCR-NEXT: {{ $}}
; GFX10-FLATSCR-NEXT: bb.2:
; GFX10-FLATSCR-NEXT: S_ENDPGM 0
+ ;
; VMEM-GFX8-LABEL: name: vgpr32_save_clobber_scc
; VMEM-GFX8: bb.0:
; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -135,6 +138,7 @@ body: |
; MUBUF-NEXT: {{ $}}
; MUBUF-NEXT: bb.2:
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; GFX9-FLATSCR-LABEL: name: vgpr64_save_clobber_scc
; GFX9-FLATSCR: bb.0:
; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -153,6 +157,7 @@ body: |
; GFX9-FLATSCR-NEXT: {{ $}}
; GFX9-FLATSCR-NEXT: bb.2:
; GFX9-FLATSCR-NEXT: S_ENDPGM 0
+ ;
; GFX10-FLATSCR-LABEL: name: vgpr64_save_clobber_scc
; GFX10-FLATSCR: bb.0:
; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -170,6 +175,7 @@ body: |
; GFX10-FLATSCR-NEXT: {{ $}}
; GFX10-FLATSCR-NEXT: bb.2:
; GFX10-FLATSCR-NEXT: S_ENDPGM 0
+ ;
; VMEM-GFX8-LABEL: name: vgpr64_save_clobber_scc
; VMEM-GFX8: bb.0:
; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -230,6 +236,7 @@ body: |
; MUBUF-NEXT: {{ $}}
; MUBUF-NEXT: bb.2:
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; GFX9-FLATSCR-LABEL: name: vgpr32_restore_clobber_scc
; GFX9-FLATSCR: bb.0:
; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -247,6 +254,7 @@ body: |
; GFX9-FLATSCR-NEXT: {{ $}}
; GFX9-FLATSCR-NEXT: bb.2:
; GFX9-FLATSCR-NEXT: S_ENDPGM 0
+ ;
; GFX10-FLATSCR-LABEL: name: vgpr32_restore_clobber_scc
; GFX10-FLATSCR: bb.0:
; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -263,6 +271,7 @@ body: |
; GFX10-FLATSCR-NEXT: {{ $}}
; GFX10-FLATSCR-NEXT: bb.2:
; GFX10-FLATSCR-NEXT: S_ENDPGM 0
+ ;
; VMEM-GFX8-LABEL: name: vgpr32_restore_clobber_scc
; VMEM-GFX8: bb.0:
; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -322,6 +331,7 @@ body: |
; MUBUF-NEXT: {{ $}}
; MUBUF-NEXT: bb.2:
; MUBUF-NEXT: S_ENDPGM 0
+ ;
; GFX9-FLATSCR-LABEL: name: vgpr64_restore_clobber_scc
; GFX9-FLATSCR: bb.0:
; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -339,6 +349,7 @@ body: |
; GFX9-FLATSCR-NEXT: {{ $}}
; GFX9-FLATSCR-NEXT: bb.2:
; GFX9-FLATSCR-NEXT: S_ENDPGM 0
+ ;
; GFX10-FLATSCR-LABEL: name: vgpr64_restore_clobber_scc
; GFX10-FLATSCR: bb.0:
; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -355,6 +366,7 @@ body: |
; GFX10-FLATSCR-NEXT: {{ $}}
; GFX10-FLATSCR-NEXT: bb.2:
; GFX10-FLATSCR-NEXT: S_ENDPGM 0
+ ;
; VMEM-GFX8-LABEL: name: vgpr64_restore_clobber_scc
; VMEM-GFX8: bb.0:
; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -418,6 +430,7 @@ body: |
; MUBUF-NEXT: {{ $}}
; MUBUF-NEXT: bb.2:
; MUBUF-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX9-FLATSCR-LABEL: name: vgpr32_restore_clobber_scc_emergency_stack_slot
; GFX9-FLATSCR: bb.0:
; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -439,6 +452,7 @@ body: |
; GFX9-FLATSCR-NEXT: {{ $}}
; GFX9-FLATSCR-NEXT: bb.2:
; GFX9-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX10-FLATSCR-LABEL: name: vgpr32_restore_clobber_scc_emergency_stack_slot
; GFX10-FLATSCR: bb.0:
; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -459,6 +473,7 @@ body: |
; GFX10-FLATSCR-NEXT: {{ $}}
; GFX10-FLATSCR-NEXT: bb.2:
; GFX10-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; VMEM-GFX8-LABEL: name: vgpr32_restore_clobber_scc_emergency_stack_slot
; VMEM-GFX8: bb.0:
; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -528,6 +543,7 @@ body: |
; MUBUF-NEXT: {{ $}}
; MUBUF-NEXT: bb.2:
; MUBUF-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX9-FLATSCR-LABEL: name: vgpr64_restore_clobber_scc_emergency_stack_slot
; GFX9-FLATSCR: bb.0:
; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -549,6 +565,7 @@ body: |
; GFX9-FLATSCR-NEXT: {{ $}}
; GFX9-FLATSCR-NEXT: bb.2:
; GFX9-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX10-FLATSCR-LABEL: name: vgpr64_restore_clobber_scc_emergency_stack_slot
; GFX10-FLATSCR: bb.0:
; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -569,6 +586,7 @@ body: |
; GFX10-FLATSCR-NEXT: {{ $}}
; GFX10-FLATSCR-NEXT: bb.2:
; GFX10-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; VMEM-GFX8-LABEL: name: vgpr64_restore_clobber_scc_emergency_stack_slot
; VMEM-GFX8: bb.0:
; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -640,6 +658,7 @@ body: |
; MUBUF-NEXT: {{ $}}
; MUBUF-NEXT: bb.2:
; MUBUF-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX9-FLATSCR-LABEL: name: vgpr96_restore_clobber_scc_emergency_stack_slot
; GFX9-FLATSCR: bb.0:
; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -661,6 +680,7 @@ body: |
; GFX9-FLATSCR-NEXT: {{ $}}
; GFX9-FLATSCR-NEXT: bb.2:
; GFX9-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX10-FLATSCR-LABEL: name: vgpr96_restore_clobber_scc_emergency_stack_slot
; GFX10-FLATSCR: bb.0:
; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -681,6 +701,7 @@ body: |
; GFX10-FLATSCR-NEXT: {{ $}}
; GFX10-FLATSCR-NEXT: bb.2:
; GFX10-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; VMEM-GFX8-LABEL: name: vgpr96_restore_clobber_scc_emergency_stack_slot
; VMEM-GFX8: bb.0:
; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -751,6 +772,7 @@ body: |
; MUBUF-NEXT: {{ $}}
; MUBUF-NEXT: bb.2:
; MUBUF-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX9-FLATSCR-LABEL: name: vgpr32_save_clobber_scc_emergency_stack_slot
; GFX9-FLATSCR: bb.0:
; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -772,6 +794,7 @@ body: |
; GFX9-FLATSCR-NEXT: {{ $}}
; GFX9-FLATSCR-NEXT: bb.2:
; GFX9-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX10-FLATSCR-LABEL: name: vgpr32_save_clobber_scc_emergency_stack_slot
; GFX10-FLATSCR: bb.0:
; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -792,6 +815,7 @@ body: |
; GFX10-FLATSCR-NEXT: {{ $}}
; GFX10-FLATSCR-NEXT: bb.2:
; GFX10-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; VMEM-GFX8-LABEL: name: vgpr32_save_clobber_scc_emergency_stack_slot
; VMEM-GFX8: bb.0:
; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -861,6 +885,7 @@ body: |
; MUBUF-NEXT: {{ $}}
; MUBUF-NEXT: bb.2:
; MUBUF-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX9-FLATSCR-LABEL: name: vgpr64_save_clobber_scc_emergency_stack_slot
; GFX9-FLATSCR: bb.0:
; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -882,6 +907,7 @@ body: |
; GFX9-FLATSCR-NEXT: {{ $}}
; GFX9-FLATSCR-NEXT: bb.2:
; GFX9-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX10-FLATSCR-LABEL: name: vgpr64_save_clobber_scc_emergency_stack_slot
; GFX10-FLATSCR: bb.0:
; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -902,6 +928,7 @@ body: |
; GFX10-FLATSCR-NEXT: {{ $}}
; GFX10-FLATSCR-NEXT: bb.2:
; GFX10-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; VMEM-GFX8-LABEL: name: vgpr64_save_clobber_scc_emergency_stack_slot
; VMEM-GFX8: bb.0:
; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -973,6 +1000,7 @@ body: |
; MUBUF-NEXT: {{ $}}
; MUBUF-NEXT: bb.2:
; MUBUF-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX9-FLATSCR-LABEL: name: vgpr96_save_clobber_scc_emergency_stack_slot
; GFX9-FLATSCR: bb.0:
; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -994,6 +1022,7 @@ body: |
; GFX9-FLATSCR-NEXT: {{ $}}
; GFX9-FLATSCR-NEXT: bb.2:
; GFX9-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX10-FLATSCR-LABEL: name: vgpr96_save_clobber_scc_emergency_stack_slot
; GFX10-FLATSCR: bb.0:
; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -1014,6 +1043,7 @@ body: |
; GFX10-FLATSCR-NEXT: {{ $}}
; GFX10-FLATSCR-NEXT: bb.2:
; GFX10-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; VMEM-GFX8-LABEL: name: vgpr96_save_clobber_scc_emergency_stack_slot
; VMEM-GFX8: bb.0:
; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -1080,6 +1110,7 @@ body: |
; MUBUF-NEXT: {{ $}}
; MUBUF-NEXT: bb.2:
; MUBUF-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX9-FLATSCR-LABEL: name: mubuf_load_restore_clobber_scc
; GFX9-FLATSCR: bb.0:
; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -1099,6 +1130,7 @@ body: |
; GFX9-FLATSCR-NEXT: {{ $}}
; GFX9-FLATSCR-NEXT: bb.2:
; GFX9-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX10-FLATSCR-LABEL: name: mubuf_load_restore_clobber_scc
; GFX10-FLATSCR: bb.0:
; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -1118,6 +1150,7 @@ body: |
; GFX10-FLATSCR-NEXT: {{ $}}
; GFX10-FLATSCR-NEXT: bb.2:
; GFX10-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; VMEM-GFX8-LABEL: name: mubuf_load_restore_clobber_scc
; VMEM-GFX8: bb.0:
; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -1184,6 +1217,7 @@ body: |
; MUBUF-NEXT: {{ $}}
; MUBUF-NEXT: bb.2:
; MUBUF-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX9-FLATSCR-LABEL: name: mubuf_load_restore_clobber_scc_no_vgprs_emergency_stack_slot
; GFX9-FLATSCR: bb.0:
; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -1207,6 +1241,7 @@ body: |
; GFX9-FLATSCR-NEXT: {{ $}}
; GFX9-FLATSCR-NEXT: bb.2:
; GFX9-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; GFX10-FLATSCR-LABEL: name: mubuf_load_restore_clobber_scc_no_vgprs_emergency_stack_slot
; GFX10-FLATSCR: bb.0:
; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -1230,6 +1265,7 @@ body: |
; GFX10-FLATSCR-NEXT: {{ $}}
; GFX10-FLATSCR-NEXT: bb.2:
; GFX10-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+ ;
; VMEM-GFX8-LABEL: name: mubuf_load_restore_clobber_scc_no_vgprs_emergency_stack_slot
; VMEM-GFX8: bb.0:
; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -1298,6 +1334,7 @@ body: |
; MUBUF-NEXT: liveins: $vgpr0
; MUBUF-NEXT: {{ $}}
; MUBUF-NEXT: S_ENDPGM 0, implicit $vgpr0
+ ;
; GFX9-FLATSCR-LABEL: name: v_mov_clobber_scc
; GFX9-FLATSCR: bb.0:
; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -1318,6 +1355,7 @@ body: |
; GFX9-FLATSCR-NEXT: liveins: $vgpr0
; GFX9-FLATSCR-NEXT: {{ $}}
; GFX9-FLATSCR-NEXT: S_ENDPGM 0, implicit $vgpr0
+ ;
; GFX10-FLATSCR-LABEL: name: v_mov_clobber_scc
; GFX10-FLATSCR: bb.0:
; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
@@ -1338,6 +1376,7 @@ body: |
; GFX10-FLATSCR-NEXT: liveins: $vgpr0
; GFX10-FLATSCR-NEXT: {{ $}}
; GFX10-FLATSCR-NEXT: S_ENDPGM 0, implicit $vgpr0
+ ;
; VMEM-GFX8-LABEL: name: v_mov_clobber_scc
; VMEM-GFX8: bb.0:
; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
diff --git a/llvm/test/CodeGen/AMDGPU/vopd-combine.mir b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir
index 92cdc27b653a..3c1da043bcd6 100644
--- a/llvm/test/CodeGen/AMDGPU/vopd-combine.mir
+++ b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir
@@ -38,12 +38,14 @@ body: |
; SCHED-NEXT: $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ;
; PAIR-GFX11-LABEL: name: vopd_schedule
; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ;
; PAIR-GFX12-LABEL: name: vopd_schedule
; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX12-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -73,12 +75,14 @@ body: |
; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF
; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 10, killed $vgpr3, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
+ ;
; PAIR-GFX11-LABEL: name: vopd_fmamk
; PAIR-GFX11: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 10, killed $vgpr3, killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ;
; PAIR-GFX12-LABEL: name: vopd_fmamk
; PAIR-GFX12: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX12-NEXT: $vgpr0 = IMPLICIT_DEF
@@ -108,6 +112,7 @@ body: |
; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF
; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 10, killed $vgpr4, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
+ ;
; PAIR-LABEL: name: vopd_fmamk_fail
; PAIR: $vgpr1 = IMPLICIT_DEF
; PAIR-NEXT: $vgpr2 = V_XOR_B32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
@@ -142,6 +147,7 @@ body: |
; SCHED-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc
; SCHED-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc
+ ;
; PAIR-GFX11-LABEL: name: vopd_cndmask
; PAIR-GFX11: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF
@@ -153,6 +159,7 @@ body: |
; PAIR-GFX11-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc
; PAIR-GFX11-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc
+ ;
; PAIR-GFX12-LABEL: name: vopd_cndmask
; PAIR-GFX12: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX12-NEXT: $vgpr0 = IMPLICIT_DEF
@@ -192,10 +199,12 @@ body: |
; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF
; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr0, implicit $exec
; SCHED-NEXT: $vgpr3 = V_ADD_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ;
; PAIR-GFX11-LABEL: name: vopd_mov
; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx11 killed $vgpr0, killed $vgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ;
; PAIR-GFX12-LABEL: name: vopd_mov
; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX12-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -218,10 +227,12 @@ body: |
; SCHED-NEXT: $sgpr7 = IMPLICIT_DEF
; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr0, implicit $exec
; SCHED-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr7, implicit $exec
+ ;
; PAIR-GFX11-LABEL: name: vopd_mov_mov
; PAIR-GFX11: $sgpr0 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $sgpr7 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx11 killed $sgpr0, killed $sgpr7, implicit $exec, implicit $exec, implicit $exec
+ ;
; PAIR-GFX12-LABEL: name: vopd_mov_mov
; PAIR-GFX12: $sgpr0 = IMPLICIT_DEF
; PAIR-GFX12-NEXT: $sgpr7 = IMPLICIT_DEF
@@ -247,6 +258,7 @@ body: |
; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF
; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 100, killed $vgpr3, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 99, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
+ ;
; PAIR-LABEL: name: vopd_constants_fail
; PAIR: $vgpr2 = IMPLICIT_DEF
; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF
@@ -276,12 +288,14 @@ body: |
; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF
; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 100, killed $vgpr3, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 4, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
+ ;
; PAIR-GFX11-LABEL: name: vopd_constants_inlinable
; PAIR-GFX11: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 100, killed $vgpr3, 4, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ;
; PAIR-GFX12-LABEL: name: vopd_constants_inlinable
; PAIR-GFX12: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX12-NEXT: $vgpr0 = IMPLICIT_DEF
@@ -312,12 +326,14 @@ body: |
; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF
; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 100, killed $vgpr3, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 100, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
+ ;
; PAIR-GFX11-LABEL: name: vopd_constants_same
; PAIR-GFX11: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 100, killed $vgpr3, 100, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ;
; PAIR-GFX12-LABEL: name: vopd_constants_same
; PAIR-GFX12: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX12-NEXT: $vgpr0 = IMPLICIT_DEF
@@ -345,10 +361,12 @@ body: |
; SCHED-NEXT: $sgpr0 = IMPLICIT_DEF
; SCHED-NEXT: $vgpr1 = V_MOV_B32_e32 981467136, implicit $exec
; SCHED-NEXT: $vgpr2 = V_FMAAK_F32 killed $sgpr0, killed $vgpr0, 981467136, implicit $mode, implicit $exec
+ ;
; PAIR-GFX11-LABEL: name: vopd_mov_fmaak_constants_same
; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $sgpr0 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr1, $vgpr2 = V_DUAL_MOV_B32_e32_X_FMAAK_F32_gfx11 981467136, killed $sgpr0, killed $vgpr0, 981467136, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ;
; PAIR-GFX12-LABEL: name: vopd_mov_fmaak_constants_same
; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX12-NEXT: $sgpr0 = IMPLICIT_DEF
@@ -373,11 +391,13 @@ body: |
; SCHED-NEXT: $vgpr3 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
; SCHED-NEXT: DBG_VALUE $vgpr0, 0, 0
; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec
+ ;
; PAIR-GFX11-LABEL: name: vopd_debug
; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 killed $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: DBG_VALUE $vgpr0, 0, 0
+ ;
; PAIR-GFX12-LABEL: name: vopd_debug
; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX12-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -416,6 +436,7 @@ body: |
; SCHED-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $vcc
; SCHED-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ;
; PAIR-GFX11-LABEL: name: vopd_schedule_unconstrained
; PAIR-GFX11: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF
@@ -431,6 +452,7 @@ body: |
; PAIR-GFX11-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $vcc
; PAIR-GFX11-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ;
; PAIR-GFX12-LABEL: name: vopd_schedule_unconstrained
; PAIR-GFX12: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX12-NEXT: $vgpr3 = IMPLICIT_DEF
@@ -507,6 +529,7 @@ body: |
; SCHED-NEXT: $vgpr33 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $vcc
; SCHED-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ;
; PAIR-GFX11-LABEL: name: vopd_schedule_unconstrained_2
; PAIR-GFX11: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF
@@ -531,6 +554,7 @@ body: |
; PAIR-GFX11-NEXT: $vgpr33 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $vcc
; PAIR-GFX11-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ;
; PAIR-GFX12-LABEL: name: vopd_schedule_unconstrained_2
; PAIR-GFX12: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX12-NEXT: $vgpr3 = IMPLICIT_DEF
@@ -607,11 +631,13 @@ body: |
; SCHED-NEXT: $vgpr3 = V_ADD_F32_e32 killed $vgpr0, killed $vgpr1, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr4 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec
; SCHED-NEXT: $vgpr5 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec
+ ;
; PAIR-GFX11-LABEL: name: vopd_mov_fixup
; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx11 target-flags(amdgpu-abs32-lo) @lds, killed $vgpr0, killed $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr4, $vgpr5 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx11 target-flags(amdgpu-abs32-lo) @lds, target-flags(amdgpu-abs32-lo) @lds, implicit $exec, implicit $exec, implicit $exec
+ ;
; PAIR-GFX12-LABEL: name: vopd_mov_fixup
; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX12-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -635,6 +661,7 @@ body: |
; SCHED-LABEL: name: vopd_mov_fixup_fail
; SCHED: $vgpr0 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec
; SCHED-NEXT: $vgpr1 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds + 4, implicit $exec
+ ;
; PAIR-LABEL: name: vopd_mov_fixup_fail
; PAIR: $vgpr0 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec
; PAIR-NEXT: $vgpr1 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds + 4, implicit $exec
@@ -652,6 +679,7 @@ body: |
; SCHED: $vgpr0 = IMPLICIT_DEF
; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit-def $vgpr2_vgpr3, implicit $exec
; SCHED-NEXT: $vgpr5 = V_ADD_F32_e32 killed $vgpr0, killed $vgpr3, implicit $mode, implicit $exec
+ ;
; PAIR-LABEL: name: vopd_no_combine_dependent_subreg
; PAIR: $vgpr0 = IMPLICIT_DEF
; PAIR-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit-def $vgpr2_vgpr3, implicit $exec
@@ -672,11 +700,13 @@ body: |
; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF
; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
; SCHED-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr5, implicit $exec
+ ;
; PAIR-GFX11-LABEL: name: vopd_mov_mov_same_src_bank
; PAIR-GFX11: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr5 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
; PAIR-GFX11-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr5, implicit $exec
+ ;
; PAIR-GFX12-LABEL: name: vopd_mov_mov_same_src_bank
; PAIR-GFX12: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX12-NEXT: $vgpr5 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll
new file mode 100644
index 000000000000..e6fbe97f8dc0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+
+define amdgpu_ps void @intrinsic_store_system_scope(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX12-LABEL: intrinsic_store_system_scope:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_store_b32 v0, v[1:2], s[0:3], s4 idxen offen scope:SCOPE_SYS
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ call void @llvm.amdgcn.struct.buffer.store.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 24)
+ ret void
+}
+
+define amdgpu_ps void @generic_store_volatile(i32 %val, ptr addrspace(1) %out) {
+; GFX12-LABEL: generic_store_volatile:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_store_b32 v[1:2], v0, off scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ store volatile i32 %val, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir
new file mode 100644
index 000000000000..acf8bd3a6ab5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir
@@ -0,0 +1,43 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=si-memory-legalizer %s -o - | FileCheck -check-prefix=GFX12 %s
+
+---
+name: intrinsic_store_system_scope
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX12-LABEL: name: intrinsic_store_system_scope
+ ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-NEXT: S_WAIT_KMCNT_soft 0
+ ; GFX12-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 24, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX12-NEXT: S_ENDPGM 0
+ BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 24, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ S_ENDPGM 0
+...
+
+---
+name: generic_store_volatile
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX12-LABEL: name: generic_store_volatile
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX12-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX12-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX12-NEXT: S_WAIT_KMCNT_soft 0
+ ; GFX12-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr1_vgpr2, killed renamable $vgpr0, 0, 24, implicit $exec :: (volatile store (s32), addrspace 1)
+ ; GFX12-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX12-NEXT: S_ENDPGM 0
+ GLOBAL_STORE_DWORD killed renamable $vgpr1_vgpr2, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1)
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vinterp.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vinterp.mir
index e764d36b3b29..f382800bfd39 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vinterp.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vinterp.mir
@@ -9,14 +9,16 @@ body: |
bb.0:
liveins: $vgpr0
; GFX11-LABEL: name: waitcnt-vinterp
- ; GFX11: $vgpr1 = LDS_PARAM_LOAD 0, 0, 0, implicit $m0, implicit $exec
- ; GFX11: $vgpr2 = LDS_PARAM_LOAD 0, 1, 0, implicit $m0, implicit $exec
- ; GFX11: $vgpr3 = LDS_PARAM_LOAD 0, 2, 0, implicit $m0, implicit $exec
- ; GFX11: $vgpr4 = LDS_PARAM_LOAD 0, 3, 0, implicit $m0, implicit $exec
- ; GFX11: $vgpr5 = V_INTERP_P10_F16_F32_inreg 0, $vgpr1, 0, $vgpr0, 0, $vgpr1, 0, 0, 2, implicit $m0, implicit $exec, implicit $mode
- ; GFX11: $vgpr6 = V_INTERP_P10_F16_F32_inreg 0, $vgpr2, 0, $vgpr0, 0, $vgpr2, 0, 0, 2, implicit $m0, implicit $exec, implicit $mode
- ; GFX11: $vgpr7 = V_INTERP_P10_F16_F32_inreg 0, $vgpr3, 0, $vgpr0, 0, $vgpr3, 0, 0, 1, implicit $m0, implicit $exec, implicit $mode
- ; GFX11: $vgpr8 = V_INTERP_P10_F16_F32_inreg 0, $vgpr4, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, implicit $m0, implicit $exec, implicit $mode
+ ; GFX11: liveins: $vgpr0
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 0, implicit $m0, implicit $exec
+ ; GFX11-NEXT: $vgpr2 = LDS_PARAM_LOAD 0, 1, 0, implicit $m0, implicit $exec
+ ; GFX11-NEXT: $vgpr3 = LDS_PARAM_LOAD 0, 2, 0, implicit $m0, implicit $exec
+ ; GFX11-NEXT: $vgpr4 = LDS_PARAM_LOAD 0, 3, 0, implicit $m0, implicit $exec
+ ; GFX11-NEXT: $vgpr5 = V_INTERP_P10_F16_F32_inreg 0, $vgpr1, 0, $vgpr0, 0, $vgpr1, 0, 0, 2, implicit $m0, implicit $exec, implicit $mode
+ ; GFX11-NEXT: $vgpr6 = V_INTERP_P10_F16_F32_inreg 0, $vgpr2, 0, $vgpr0, 0, $vgpr2, 0, 0, 2, implicit $m0, implicit $exec, implicit $mode
+ ; GFX11-NEXT: $vgpr7 = V_INTERP_P10_F16_F32_inreg 0, $vgpr3, 0, $vgpr0, 0, $vgpr3, 0, 0, 1, implicit $m0, implicit $exec, implicit $mode
+ ; GFX11-NEXT: $vgpr8 = V_INTERP_P10_F16_F32_inreg 0, $vgpr4, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, implicit $m0, implicit $exec, implicit $mode
$vgpr1 = LDS_PARAM_LOAD 0, 0, 0, implicit $m0, implicit $exec
$vgpr2 = LDS_PARAM_LOAD 0, 1, 0, implicit $m0, implicit $exec
$vgpr3 = LDS_PARAM_LOAD 0, 2, 0, implicit $m0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/wqm-terminators.mir b/llvm/test/CodeGen/AMDGPU/wqm-terminators.mir
index b4df02e47d2a..8d75bb3b1280 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm-terminators.mir
+++ b/llvm/test/CodeGen/AMDGPU/wqm-terminators.mir
@@ -34,12 +34,12 @@ body: |
; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_256 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
; CHECK-NEXT: S_CMP_EQ_U32 [[COPY1]], 0, implicit-def $scc
- ; CHECK-NEXT: undef %5.sub0:vreg_64 = V_MUL_F32_e64 0, [[COPY2]].sub0, 0, [[COPY2]].sub1, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: %5.sub1:vreg_64 = V_MUL_F32_e64 0, [[COPY2]].sub0, 0, [[COPY2]].sub1, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: undef [[V_MUL_F32_e64_:%[0-9]+]].sub0:vreg_64 = V_MUL_F32_e64 0, [[COPY2]].sub0, 0, [[COPY2]].sub1, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MUL_F32_e64_:%[0-9]+]].sub1:vreg_64 = V_MUL_F32_e64 0, [[COPY2]].sub0, 0, [[COPY2]].sub1, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $scc
; CHECK-NEXT: $exec_lo = S_AND_B32 $exec_lo, [[COPY]], implicit-def $scc
; CHECK-NEXT: $scc = COPY [[COPY3]]
- ; CHECK-NEXT: [[IMAGE_SAMPLE_V3_V2_gfx10_:%[0-9]+]]:vreg_96 = IMAGE_SAMPLE_V3_V2_gfx10 %5, [[DEF]], [[DEF1]], 7, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8)
+ ; CHECK-NEXT: [[IMAGE_SAMPLE_V3_V2_gfx10_:%[0-9]+]]:vreg_96 = IMAGE_SAMPLE_V3_V2_gfx10 [[V_MUL_F32_e64_]], [[DEF]], [[DEF1]], 7, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8)
; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
; CHECK-NEXT: S_BRANCH %bb.1
; CHECK-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir b/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir
index 2bc4288884f1..d006aa9ba38e 100644
--- a/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir
+++ b/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir
@@ -89,7 +89,7 @@ body: |
# CHECK: $sp = t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, $r4, $r5, $r6, undef $r7, $r8, $r9, $r10, $r11
# CHECK-NEXT: $r0 = t2BICri $r0, 1, 14 /* CC::al */, $noreg, $noreg
# CHECK-NEXT: $sp = tSUBspi $sp, 34, 14 /* CC::al */, $noreg
-# CHECK-NEXT: VLSTM $sp, 14 /* CC::al */, $noreg, implicit undef $vpr, implicit undef $fpscr, implicit undef $fpscr_nzcv, implicit undef $q0, implicit undef $q1, implicit undef $q2, implicit undef $q3, implicit undef $q4, implicit undef $q5, implicit undef $q6, implicit undef $q7
+# CHECK-NEXT: VLSTM $sp, 14 /* CC::al */, $noreg, 0, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv, implicit undef $vpr, implicit undef $fpscr, implicit undef $fpscr_nzcv, implicit undef $d0, implicit undef $d1, implicit undef $d2, implicit undef $d3, implicit undef $d4, implicit undef $d5, implicit undef $d6, implicit undef $d7, implicit $d8, implicit $d9, implicit $d10, implicit $d11, implicit $d12, implicit $d13, implicit $d14, implicit $d15
# CHECK-NEXT: $r1 = tMOVr $r0, 14 /* CC::al */, $noreg
# CHECK-NEXT: $r2 = tMOVr $r0, 14 /* CC::al */, $noreg
# CHECK-NEXT: $r3 = tMOVr $r0, 14 /* CC::al */, $noreg
@@ -105,7 +105,7 @@ body: |
# CHECK-NEXT: t2MSR_M 3072, $r0, 14 /* CC::al */, $noreg, implicit-def $cpsr
# CHECK-NEXT: tBLXNSr 14 /* CC::al */, $noreg, killed $r0, csr_aapcs, implicit-def $lr, implicit $sp, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $s0
# CHECK-NEXT: $r12 = VMOVRS $s0, 14 /* CC::al */, $noreg
-# CHECK-NEXT: VLLDM $sp, 14 /* CC::al */, $noreg, implicit-def $q0, implicit-def $q1, implicit-def $q2, implicit-def $q3, implicit-def $q4, implicit-def $q5, implicit-def $q6, implicit-def $q7, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv
+# CHECK-NEXT: VLLDM $sp, 14 /* CC::al */, $noreg, 0, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv, implicit-def $d0, implicit-def $d1, implicit-def $d2, implicit-def $d3, implicit-def $d4, implicit-def $d5, implicit-def $d6, implicit-def $d7, implicit-def $d8, implicit-def $d9, implicit-def $d10, implicit-def $d11, implicit-def $d12, implicit-def $d13, implicit-def $d14, implicit-def $d15
# CHECK-NEXT: $s0 = VMOVSR $r12, 14 /* CC::al */, $noreg
# CHECK-NEXT: $sp = tADDspi $sp, 34, 14 /* CC::al */, $noreg
# CHECK-NEXT: $sp = t2LDMIA_UPD $sp, 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11
diff --git a/llvm/test/CodeGen/ARM/vlldm-vlstm-uops.mir b/llvm/test/CodeGen/ARM/vlldm-vlstm-uops.mir
index 8c49a5316741..ad53addcc21a 100644
--- a/llvm/test/CodeGen/ARM/vlldm-vlstm-uops.mir
+++ b/llvm/test/CodeGen/ARM/vlldm-vlstm-uops.mir
@@ -2,7 +2,7 @@
--- |
target triple = "thumbv8m.main-arm-none-eabi"
- define hidden void @foo(ptr nocapture %baz) local_unnamed_addr #0 {
+ define hidden void @foo(void ()* nocapture %baz) local_unnamed_addr #0 {
entry:
%call = call i32 @bar() #0
%tobool = icmp eq i32 %call, 0
@@ -55,14 +55,13 @@ body: |
tBL 14, $noreg, @bar, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $r0
bb.2.land.end:
- liveins: $r4
-
+ liveins: $r4, $vpr, $fpscr, $fpscr_nzcv, $d0, $d1, $d2, $d3, $d4, $d5, $d6, $d7, $d8, $d9, $d10, $d11, $d12, $d13, $d14, $d15
$sp = t2STMDB_UPD $sp, 14, $noreg, $r4, killed $r5, killed $r6, killed $r7, killed $r8, killed $r9, killed $r10, killed $r11
$r4 = t2BICri $r4, 1, 14, $noreg, $noreg
$sp = tSUBspi $sp, 34, 14, $noreg
- VLSTM $sp, 14, $noreg
+ VLSTM $sp, 14, $noreg, 0, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv, implicit $vpr, implicit $fpscr, implicit $fpscr_nzcv, implicit $d0, implicit $d1, implicit $d2, implicit $d3, implicit $d4, implicit $d5, implicit $d6, implicit $d7, implicit $d8, implicit $d9, implicit $d10, implicit $d11, implicit $d12, implicit $d13, implicit $d14, implicit $d15
tBLXNSr 14, $noreg, killed $r4, csr_aapcs, implicit-def $lr, implicit $sp, implicit-def dead $lr, implicit $sp, implicit-def $sp
- VLLDM $sp, 14, $noreg, implicit-def $q0, implicit-def $q1, implicit-def $q2, implicit-def $q3, implicit-def $q4, implicit-def $q5, implicit-def $q6, implicit-def $q7, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv
+ VLLDM $sp, 14, $noreg, 0, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv, implicit-def $d0, implicit-def $d1, implicit-def $d2, implicit-def $d3, implicit-def $d4, implicit-def $d5, implicit-def $d6, implicit-def $d7, implicit-def $d8, implicit-def $d9, implicit-def $d10, implicit-def $d11, implicit-def $d12, implicit-def $d13, implicit-def $d14, implicit-def $d15
$sp = tADDspi $sp, 34, 14, $noreg
$sp = t2LDMIA_UPD $sp, 14, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11
$sp = t2LDMIA_RET $sp, 14, $noreg, def $r4, def $pc
diff --git a/llvm/test/CodeGen/Generic/expand-vp-load-store.ll b/llvm/test/CodeGen/Generic/expand-vp-load-store.ll
index 8984d020f99e..5c6f1e858ce7 100644
--- a/llvm/test/CodeGen/Generic/expand-vp-load-store.ll
+++ b/llvm/test/CodeGen/Generic/expand-vp-load-store.ll
@@ -127,7 +127,7 @@ define <vscale x 1 x i64> @vpload_nxv1i64_allones_mask(ptr %ptr, i32 zeroext %ev
; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]], <vscale x 1 x i64> poison)
; CHECK-NEXT: ret <vscale x 1 x i64> [[TMP3]]
;
- %load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 %evl)
+ %load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
ret <vscale x 1 x i64> %load
}
@@ -140,7 +140,7 @@ define <vscale x 1 x i64> @vpload_nxv1i64_allones_mask_vscale(ptr %ptr) {
;
%vscale = call i32 @llvm.vscale.i32()
%vlmax = mul nuw i32 %vscale, 1
- %load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 %vlmax)
+ %load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %vlmax)
ret <vscale x 1 x i64> %load
}
@@ -179,7 +179,7 @@ define void @vpstore_nxv1i64_allones_mask(<vscale x 1 x i64> %val, ptr %ptr, i32
; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[VAL:%.*]], ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]])
; CHECK-NEXT: ret void
;
- call void @llvm.vp.store.nxv1i64.p0(<vscale x 1 x i64> %val, ptr %ptr, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 %evl)
+ call void @llvm.vp.store.nxv1i64.p0(<vscale x 1 x i64> %val, ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
ret void
}
@@ -192,7 +192,7 @@ define void @vpstore_nxv1i64_allones_mask_vscale(<vscale x 1 x i64> %val, ptr %p
;
%vscale = call i32 @llvm.vscale.i32()
%vlmax = mul nuw i32 %vscale, 1
- call void @llvm.vp.store.nxv1i64.p0(<vscale x 1 x i64> %val, ptr %ptr, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 %vlmax)
+ call void @llvm.vp.store.nxv1i64.p0(<vscale x 1 x i64> %val, ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %vlmax)
ret void
}
diff --git a/llvm/test/CodeGen/Hexagon/post-inc-vec.mir b/llvm/test/CodeGen/Hexagon/post-inc-vec.mir
deleted file mode 100644
index 3788dc3fecd8..000000000000
--- a/llvm/test/CodeGen/Hexagon/post-inc-vec.mir
+++ /dev/null
@@ -1,413 +0,0 @@
-#RUN: llc -march=hexagon -run-pass hexagon-postincopt %s -o - | FileCheck %s
-
-# Test that we do not generate two post-increment vector load/store
-# in the loop.
-# CHECK: J2_loop0r
-# CHECK: V6_vS32b_pi
-# CHECK-NOT: = V6_vL32b_pi
-# CHECK: V6_vL32b_ai
-# CHECK: V6_vL32b_ai
-# CHECK: V6_vS32b_ai
-# CHECK: ENDLOOP0
-
---- |
- ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
- declare <1024 x i1> @llvm.hexagon.V6.pred.scalar2v2.128B(i32) #0
-
- ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
- declare void @llvm.hexagon.V6.vS32b.qpred.ai.128B(<1024 x i1>, ptr, <32 x i32>) #1
-
- ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
- declare <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32>, <32 x i32>) #0
-
- ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
- declare <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32>) #0
-
- ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
- declare <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32>) #0
-
- ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
- declare <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32) #0
-
- ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
- declare <32 x i32> @llvm.hexagon.V6.vasrhbsat.128B(<32 x i32>, <32 x i32>, i32) #0
-
- ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
- declare <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32>, <32 x i32>) #0
-
- ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
- declare <32 x i32> @llvm.hexagon.V6.vasrh.128B(<32 x i32>, i32) #0
-
- ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
- declare <32 x i32> @llvm.hexagon.V6.vavgh.128B(<32 x i32>, <32 x i32>) #0
-
- ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
- declare <64 x i32> @llvm.hexagon.V6.vmpabusv.128B(<64 x i32>, <64 x i32>) #0
-
- ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
- declare void @llvm.assume(i1 noundef) #2
-
- ; Function Attrs: noinline nounwind
- define void @blah(i32 %0, i32 %1, ptr noalias %2, ptr noalias nocapture readonly %3, ptr noalias nocapture readonly %4, ptr nocapture readnone %5, ptr nocapture readnone %6, i32 %7, i32 %8, ptr nocapture readonly %9, ptr nocapture readonly %10) local_unnamed_addr #3 {
- entry:
- %11 = call i32 @llvm.hexagon.S2.extractu(i32 %0, i32 23, i32 9)
- %12 = shl i32 %11, 7
- %mul16.i = mul nsw i32 %12, %1
- %add.i = add nsw i32 %1, 1
- %mul17.i = mul nsw i32 %add.i, %12
- %cmp184.i = icmp slt i32 %mul16.i, %mul17.i
- br i1 %cmp184.i, label %for.body.lr.ph.i, label %for.end.i
-
- for.body.lr.ph.i: ; preds = %entry
- %13 = tail call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> <i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576>, <32 x i32> <i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144>) #5
- %14 = tail call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> zeroinitializer, <32 x i32> zeroinitializer) #5
- %15 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 32) #5
- %cgep = getelementptr i8, ptr %2, i32 %mul16.i
- %cgep8 = getelementptr i8, ptr %4, i32 %mul16.i
- %cgep9 = getelementptr i8, ptr %3, i32 %mul16.i
- br label %for.body.i
-
- for.body.i: ; preds = %for.body.i, %for.body.lr.ph.i
- %lsr.iv6 = phi ptr [ %cgep12, %for.body.i ], [ %cgep9, %for.body.lr.ph.i ]
- %lsr.iv3 = phi ptr [ %cgep11, %for.body.i ], [ %cgep8, %for.body.lr.ph.i ]
- %lsr.iv = phi ptr [ %cgep10, %for.body.i ], [ %cgep, %for.body.lr.ph.i ]
- %elemIdx.05.i = phi i32 [ %mul16.i, %for.body.lr.ph.i ], [ %add19.i, %for.body.i ]
- %16 = load <32 x i32>, ptr %lsr.iv6, align 128
- %17 = load <32 x i32>, ptr %lsr.iv3, align 128
- %18 = tail call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> %17, <32 x i32> %16) #5
- %19 = tail call <64 x i32> @llvm.hexagon.V6.vmpabusv.128B(<64 x i32> %13, <64 x i32> %18) #5
- %20 = tail call <64 x i32> @llvm.hexagon.V6.vmpabusv.128B(<64 x i32> %14, <64 x i32> %18) #5
- %21 = tail call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %19) #5
- %22 = tail call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %20) #5
- %23 = tail call <32 x i32> @llvm.hexagon.V6.vasrh.128B(<32 x i32> %22, i32 7) #5
- %24 = tail call <32 x i32> @llvm.hexagon.V6.vavgh.128B(<32 x i32> %21, <32 x i32> %23) #5
- %25 = tail call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %19) #5
- %26 = tail call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %20) #5
- %27 = tail call <32 x i32> @llvm.hexagon.V6.vasrh.128B(<32 x i32> %26, i32 7) #5
- %28 = tail call <32 x i32> @llvm.hexagon.V6.vavgh.128B(<32 x i32> %25, <32 x i32> %27) #5
- %29 = tail call <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32> %24, <32 x i32> %15) #5
- %30 = tail call <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32> %28, <32 x i32> %15) #5
- %31 = tail call <32 x i32> @llvm.hexagon.V6.vasrhbsat.128B(<32 x i32> %29, <32 x i32> %30, i32 4) #5
- store <32 x i32> %31, ptr %lsr.iv, align 128
- %add19.i = add nsw i32 %elemIdx.05.i, 128
- %cmp18.i = icmp slt i32 %add19.i, %mul17.i
- %cgep10 = getelementptr i8, ptr %lsr.iv, i32 128
- %cgep11 = getelementptr i8, ptr %lsr.iv3, i32 128
- %cgep12 = getelementptr i8, ptr %lsr.iv6, i32 128
- br i1 %cmp18.i, label %for.body.i, label %for.end.i
-
- for.end.i: ; preds = %for.body.i, %entry
- ret void
- }
-
- ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
- declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #4
-
- ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
- declare i32 @llvm.hexagon.S2.extractu(i32, i32 immarg, i32 immarg) #0
-
- attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
- attributes #1 = { nocallback nofree nosync nounwind willreturn memory(write) }
- attributes #2 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
- attributes #3 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv68" "target-features"="+hvx-length128b,+hvxv68,+v68,-long-calls,-small-data" "unsafe-fp-math"="false" "use-soft-float"="false" }
- attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
- attributes #5 = { nounwind }
-
-...
----
-name: blah
-alignment: 16
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-failedISel: false
-tracksRegLiveness: true
-hasWinCFI: false
-callsEHReturn: false
-callsUnwindInit: false
-hasEHCatchret: false
-hasEHScopes: false
-hasEHFunclets: false
-isOutlined: false
-debugInstrRef: false
-failsVerification: false
-tracksDebugUserValues: false
-registers:
- - { id: 0, class: intregs, preferred-register: '' }
- - { id: 1, class: intregs, preferred-register: '' }
- - { id: 2, class: hvxwr, preferred-register: '' }
- - { id: 3, class: hvxwr, preferred-register: '' }
- - { id: 4, class: hvxvr, preferred-register: '' }
- - { id: 5, class: intregs, preferred-register: '' }
- - { id: 6, class: intregs, preferred-register: '' }
- - { id: 7, class: intregs, preferred-register: '' }
- - { id: 8, class: intregs, preferred-register: '' }
- - { id: 9, class: intregs, preferred-register: '' }
- - { id: 10, class: intregs, preferred-register: '' }
- - { id: 11, class: intregs, preferred-register: '' }
- - { id: 12, class: intregs, preferred-register: '' }
- - { id: 13, class: intregs, preferred-register: '' }
- - { id: 14, class: intregs, preferred-register: '' }
- - { id: 15, class: intregs, preferred-register: '' }
- - { id: 16, class: intregs, preferred-register: '' }
- - { id: 17, class: intregs, preferred-register: '' }
- - { id: 18, class: intregs, preferred-register: '' }
- - { id: 19, class: intregs, preferred-register: '' }
- - { id: 20, class: intregs, preferred-register: '' }
- - { id: 21, class: intregs, preferred-register: '' }
- - { id: 22, class: intregs, preferred-register: '' }
- - { id: 23, class: intregs, preferred-register: '' }
- - { id: 24, class: intregs, preferred-register: '' }
- - { id: 25, class: predregs, preferred-register: '' }
- - { id: 26, class: predregs, preferred-register: '' }
- - { id: 27, class: hvxvr, preferred-register: '' }
- - { id: 28, class: intregs, preferred-register: '' }
- - { id: 29, class: hvxvr, preferred-register: '' }
- - { id: 30, class: intregs, preferred-register: '' }
- - { id: 31, class: hvxvr, preferred-register: '' }
- - { id: 32, class: intregs, preferred-register: '' }
- - { id: 33, class: hvxvr, preferred-register: '' }
- - { id: 34, class: hvxvr, preferred-register: '' }
- - { id: 35, class: hvxwr, preferred-register: '' }
- - { id: 36, class: hvxwr, preferred-register: '' }
- - { id: 37, class: hvxwr, preferred-register: '' }
- - { id: 38, class: hvxvr, preferred-register: '' }
- - { id: 39, class: hvxvr, preferred-register: '' }
- - { id: 40, class: intregs, preferred-register: '' }
- - { id: 41, class: hvxvr, preferred-register: '' }
- - { id: 42, class: hvxvr, preferred-register: '' }
- - { id: 43, class: hvxvr, preferred-register: '' }
- - { id: 44, class: hvxvr, preferred-register: '' }
- - { id: 45, class: hvxvr, preferred-register: '' }
- - { id: 46, class: hvxvr, preferred-register: '' }
- - { id: 47, class: hvxvr, preferred-register: '' }
- - { id: 48, class: hvxvr, preferred-register: '' }
- - { id: 49, class: intregslow8, preferred-register: '' }
- - { id: 50, class: hvxvr, preferred-register: '' }
- - { id: 51, class: predregs, preferred-register: '' }
- - { id: 52, class: intregs, preferred-register: '' }
- - { id: 53, class: intregs, preferred-register: '' }
- - { id: 54, class: intregs, preferred-register: '' }
- - { id: 55, class: intregs, preferred-register: '' }
- - { id: 56, class: intregs, preferred-register: '' }
- - { id: 57, class: intregs, preferred-register: '' }
- - { id: 58, class: intregs, preferred-register: '' }
- - { id: 59, class: intregs, preferred-register: '' }
- - { id: 60, class: intregs, preferred-register: '' }
- - { id: 61, class: hvxvr, preferred-register: '' }
- - { id: 62, class: intregs, preferred-register: '' }
- - { id: 63, class: hvxvr, preferred-register: '' }
- - { id: 64, class: intregs, preferred-register: '' }
- - { id: 65, class: hvxwr, preferred-register: '' }
- - { id: 66, class: hvxwr, preferred-register: '' }
- - { id: 67, class: hvxwr, preferred-register: '' }
- - { id: 68, class: hvxvr, preferred-register: '' }
- - { id: 69, class: hvxvr, preferred-register: '' }
- - { id: 70, class: hvxvr, preferred-register: '' }
- - { id: 71, class: hvxvr, preferred-register: '' }
- - { id: 72, class: hvxvr, preferred-register: '' }
- - { id: 73, class: hvxvr, preferred-register: '' }
- - { id: 74, class: hvxvr, preferred-register: '' }
- - { id: 75, class: intregs, preferred-register: '' }
- - { id: 76, class: intregs, preferred-register: '' }
- - { id: 77, class: intregs, preferred-register: '' }
- - { id: 78, class: intregs, preferred-register: '' }
- - { id: 79, class: hvxvr, preferred-register: '' }
- - { id: 80, class: intregs, preferred-register: '' }
- - { id: 81, class: hvxvr, preferred-register: '' }
- - { id: 82, class: intregs, preferred-register: '' }
- - { id: 83, class: hvxwr, preferred-register: '' }
- - { id: 84, class: hvxwr, preferred-register: '' }
- - { id: 85, class: hvxwr, preferred-register: '' }
- - { id: 86, class: hvxvr, preferred-register: '' }
- - { id: 87, class: hvxvr, preferred-register: '' }
- - { id: 88, class: hvxvr, preferred-register: '' }
- - { id: 89, class: hvxvr, preferred-register: '' }
- - { id: 90, class: hvxvr, preferred-register: '' }
- - { id: 91, class: hvxvr, preferred-register: '' }
- - { id: 92, class: hvxvr, preferred-register: '' }
- - { id: 93, class: intregs, preferred-register: '' }
- - { id: 94, class: intregs, preferred-register: '' }
- - { id: 95, class: intregs, preferred-register: '' }
- - { id: 96, class: intregs, preferred-register: '' }
- - { id: 97, class: predregs, preferred-register: '' }
- - { id: 98, class: predregs, preferred-register: '' }
-liveins:
- - { reg: '$r0', virtual-reg: '%16' }
- - { reg: '$r1', virtual-reg: '%17' }
- - { reg: '$r2', virtual-reg: '%18' }
- - { reg: '$r3', virtual-reg: '%19' }
- - { reg: '$r4', virtual-reg: '%20' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 4
- adjustsStack: false
- hasCalls: false
- stackProtector: ''
- functionContext: ''
- maxCallFrameSize: 4294967295
- cvBytesOfCalleeSavedRegisters: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
- hasTailCall: false
- localFrameSize: 0
- savePoint: ''
- restorePoint: ''
-fixedStack:
- - { id: 0, type: default, offset: 24, size: 4, alignment: 8, stack-id: default,
- isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 1, type: default, offset: 20, size: 4, alignment: 4, stack-id: default,
- isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 2, type: default, offset: 16, size: 4, alignment: 8, stack-id: default,
- isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 3, type: default, offset: 12, size: 4, alignment: 4, stack-id: default,
- isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 4, type: default, offset: 8, size: 4, alignment: 8, stack-id: default,
- isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-stack: []
-entry_values: []
-callSites: []
-debugValueSubstitutions: []
-constants: []
-machineFunctionInfo: {}
-body: |
- bb.0.entry:
- successors: %bb.1(0x40000000), %bb.3(0x40000000)
- liveins: $r0, $r1, $r2, $r3, $r4
-
- %20:intregs = COPY $r4
- %19:intregs = COPY $r3
- %18:intregs = COPY $r2
- %17:intregs = COPY $r1
- %16:intregs = COPY $r0
- %22:intregs = S2_extractu %16, 23, 9
- %23:intregs = S2_asl_i_r %22, 7
- %0:intregs = nsw M2_mpyi %23, %17
- %24:intregs = nsw A2_addi %17, 1
- %1:intregs = nsw M2_mpyi %24, %23
- %25:predregs = C2_cmpgt %1, %0
- J2_jumpf %25, %bb.3, implicit-def dead $pc
- J2_jump %bb.1, implicit-def dead $pc
-
- bb.1.for.body.lr.ph.i:
- successors: %bb.4(0x40000000), %bb.6(0x40000000)
-
- %28:intregs = A2_tfrsi 269488144
- %27:hvxvr = V6_lvsplatw %28
- %30:intregs = A2_tfrsi 1077952576
- %29:hvxvr = V6_lvsplatw %30
- %2:hvxwr = REG_SEQUENCE %29, %subreg.vsub_hi, %27, %subreg.vsub_lo
- %31:hvxvr = V6_vd0
- %3:hvxwr = REG_SEQUENCE %31, %subreg.vsub_hi, %31, %subreg.vsub_lo
- %32:intregs = A2_tfrsi 32
- %4:hvxvr = V6_lvsplath %32
- %5:intregs = A2_add %18, %0
- %6:intregs = A2_add %20, %0
- %7:intregs = A2_add %19, %0
- %40:intregs = A2_tfrsi 7
- %49:intregslow8 = A2_tfrsi 4
- %52:intregs = A2_sub %1, %0
- %53:intregs = A2_addi %52, 127
- %54:intregs = S2_lsr_i_r %53, 7
- %55:intregs = COPY %54
- %56:intregs = S2_lsr_i_r %55, 1
- %57:intregs = A2_andir %55, 1
- %97:predregs = C2_cmpgtui %56, 0
- J2_jumpf %97, %bb.6, implicit-def $pc
- J2_jump %bb.4, implicit-def $pc
-
- bb.4:
- successors: %bb.5(0x80000000)
-
- J2_loop0r %bb.5, %56, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
- J2_jump %bb.5, implicit-def $pc
-
- bb.5:
- successors: %bb.5(0x40000000), %bb.6(0x40000000)
-
- %58:intregs = PHI %7, %bb.4, %80, %bb.5
- %59:intregs = PHI %6, %bb.4, %82, %bb.5
- %60:intregs = PHI %5, %bb.4, %93, %bb.5
- %61:hvxvr, %62:intregs = V6_vL32b_pi %58, 128 :: (load (s1024) from %ir.lsr.iv6)
- %63:hvxvr, %64:intregs = V6_vL32b_pi %59, 128 :: (load (s1024) from %ir.lsr.iv3)
- %65:hvxwr = REG_SEQUENCE %63, %subreg.vsub_hi, %61, %subreg.vsub_lo
- %66:hvxwr = V6_vmpabusv %2, %65
- %67:hvxwr = V6_vmpabusv %3, %65
- %68:hvxvr = V6_vasrh %67.vsub_hi, %40
- %69:hvxvr = V6_vavgh %66.vsub_hi, %68
- %70:hvxvr = V6_vasrh %67.vsub_lo, %40
- %71:hvxvr = V6_vavgh %66.vsub_lo, %70
- %72:hvxvr = V6_vaddhsat %69, %4
- %73:hvxvr = V6_vaddhsat %71, %4
- %74:hvxvr = V6_vasrhbsat %72, %73, %49
- %75:intregs = V6_vS32b_pi %60, 128, %74 :: (store (s1024) into %ir.lsr.iv)
- %79:hvxvr, %80:intregs = V6_vL32b_pi %62, 128 :: (load (s1024) from %ir.lsr.iv6 + 128)
- %81:hvxvr, %82:intregs = V6_vL32b_pi %64, 128 :: (load (s1024) from %ir.lsr.iv3 + 128)
- %83:hvxwr = REG_SEQUENCE %81, %subreg.vsub_hi, %79, %subreg.vsub_lo
- %84:hvxwr = V6_vmpabusv %2, %83
- %85:hvxwr = V6_vmpabusv %3, %83
- %86:hvxvr = V6_vasrh %85.vsub_hi, %40
- %87:hvxvr = V6_vavgh %84.vsub_hi, %86
- %88:hvxvr = V6_vasrh %85.vsub_lo, %40
- %89:hvxvr = V6_vavgh %84.vsub_lo, %88
- %90:hvxvr = V6_vaddhsat %87, %4
- %91:hvxvr = V6_vaddhsat %89, %4
- %92:hvxvr = V6_vasrhbsat %90, %91, %49
- %93:intregs = V6_vS32b_pi %75, 128, %92 :: (store (s1024) into %ir.lsr.iv + 128)
- ENDLOOP0 %bb.5, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
- J2_jump %bb.6, implicit-def $pc
-
- bb.6:
- successors: %bb.7(0x40000000), %bb.8(0x40000000)
-
- %94:intregs = PHI %7, %bb.1, %80, %bb.5
- %95:intregs = PHI %6, %bb.1, %82, %bb.5
- %96:intregs = PHI %5, %bb.1, %93, %bb.5
- %98:predregs = C2_cmpgtui %57, 0
- J2_jumpf %98, %bb.8, implicit-def $pc
- J2_jump %bb.7, implicit-def $pc
-
- bb.7:
- successors: %bb.2(0x80000000)
-
- J2_jump %bb.2, implicit-def $pc
-
- bb.2.for.body.i (machine-block-address-taken):
- successors: %bb.8(0x04000000)
-
- %33:hvxvr, %15:intregs = V6_vL32b_pi %94, 128 :: (load (s1024) from %ir.lsr.iv6)
- %34:hvxvr, %14:intregs = V6_vL32b_pi %95, 128 :: (load (s1024) from %ir.lsr.iv3)
- %35:hvxwr = REG_SEQUENCE %34, %subreg.vsub_hi, %33, %subreg.vsub_lo
- %36:hvxwr = V6_vmpabusv %2, %35
- %37:hvxwr = V6_vmpabusv %3, %35
- %41:hvxvr = V6_vasrh %37.vsub_hi, %40
- %42:hvxvr = V6_vavgh %36.vsub_hi, %41
- %45:hvxvr = V6_vasrh %37.vsub_lo, %40
- %46:hvxvr = V6_vavgh %36.vsub_lo, %45
- %47:hvxvr = V6_vaddhsat %42, %4
- %48:hvxvr = V6_vaddhsat %46, %4
- %50:hvxvr = V6_vasrhbsat %47, %48, %49
- %13:intregs = V6_vS32b_pi %96, 128, %50 :: (store (s1024) into %ir.lsr.iv)
- J2_jump %bb.8, implicit-def $pc
-
- bb.8:
- successors: %bb.3(0x80000000)
-
- J2_jump %bb.3, implicit-def $pc
-
- bb.3.for.end.i:
- PS_jmpret $r31, implicit-def dead $pc
-
-...
diff --git a/llvm/test/CodeGen/Hexagon/post_inc_store.mir b/llvm/test/CodeGen/Hexagon/post_inc_store.mir
deleted file mode 100644
index 3e3f51ac9114..000000000000
--- a/llvm/test/CodeGen/Hexagon/post_inc_store.mir
+++ /dev/null
@@ -1,168 +0,0 @@
-#RUN: llc -march=hexagon -run-pass hexagon-postincopt %s -o - | FileCheck %s
-
-# Test that we convert a post-inc load and store to a regular load and post-inc
-# store.
-# CHECK: J2_loop0r
-# CHECK-NOT: = L2_loadruh_pi
-# CHECK: L2_loadruh_io
-# CHECK: S2_storerh_pi
-# CHECK: ENDLOOP0
-
---- |
- ; Function Attrs: nofree norecurse nounwind
- define dso_local void @blam(i32 %arg, ptr nocapture %arg1, i16 signext %arg2) local_unnamed_addr #0 {
- bb:
- %icmp = icmp eq i32 %arg, 0
- br i1 %icmp, label %bb13, label %bb3
-
- bb3: ; preds = %bb, %bb10
- %phi = phi i32 [ %add11, %bb10 ], [ 0, %bb ]
- %mul = mul i32 %phi, %arg
- %cgep = getelementptr i16, ptr %arg1, i32 %mul
- br label %bb4
-
- bb4: ; preds = %bb4, %bb3
- %lsr.iv = phi i32 [ %lsr.iv.next, %bb4 ], [ %arg, %bb3 ]
- %phi5 = phi ptr [ %cgep, %bb3 ], [ %cgep1, %bb4 ]
- %load = load i16, ptr %phi5, align 2
- %add = add i16 %load, %arg2
- store i16 %add, ptr %phi5, align 2
- %lsr.iv.next = add i32 %lsr.iv, -1
- %icmp8 = icmp eq i32 %lsr.iv.next, 0
- %cgep1 = getelementptr i16, ptr %phi5, i32 1
- br i1 %icmp8, label %bb10, label %bb4
-
- bb10: ; preds = %bb4
- %add11 = add nuw i32 %phi, 1
- %icmp12 = icmp eq i32 %add11, %arg
- br i1 %icmp12, label %bb13, label %bb3
-
- bb13: ; preds = %bb10, %bb
- ret void
- }
-
- attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv68" "target-features"="+v68,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-...
----
-name: blam
-alignment: 16
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-failedISel: false
-tracksRegLiveness: true
-hasWinCFI: false
-callsEHReturn: false
-callsUnwindInit: false
-hasEHCatchret: false
-hasEHScopes: false
-hasEHFunclets: false
-isOutlined: false
-debugInstrRef: false
-failsVerification: false
-tracksDebugUserValues: false
-registers:
- - { id: 0, class: intregs, preferred-register: '' }
- - { id: 1, class: intregs, preferred-register: '' }
- - { id: 2, class: intregs, preferred-register: '' }
- - { id: 3, class: intregs, preferred-register: '' }
- - { id: 4, class: intregs, preferred-register: '' }
- - { id: 5, class: intregs, preferred-register: '' }
- - { id: 6, class: intregs, preferred-register: '' }
- - { id: 7, class: intregs, preferred-register: '' }
- - { id: 8, class: intregs, preferred-register: '' }
- - { id: 9, class: intregs, preferred-register: '' }
- - { id: 10, class: intregs, preferred-register: '' }
- - { id: 11, class: intregs, preferred-register: '' }
- - { id: 12, class: predregs, preferred-register: '' }
- - { id: 13, class: intregs, preferred-register: '' }
- - { id: 14, class: intregs, preferred-register: '' }
- - { id: 15, class: intregs, preferred-register: '' }
- - { id: 16, class: predregs, preferred-register: '' }
- - { id: 17, class: predregs, preferred-register: '' }
- - { id: 18, class: predregs, preferred-register: '' }
- - { id: 19, class: predregs, preferred-register: '' }
- - { id: 20, class: intregs, preferred-register: '' }
- - { id: 21, class: intregs, preferred-register: '' }
-liveins:
- - { reg: '$r0', virtual-reg: '%7' }
- - { reg: '$r1', virtual-reg: '%8' }
- - { reg: '$r2', virtual-reg: '%9' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 1
- adjustsStack: false
- hasCalls: false
- stackProtector: ''
- functionContext: ''
- maxCallFrameSize: 4294967295
- cvBytesOfCalleeSavedRegisters: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
- hasTailCall: false
- localFrameSize: 0
- savePoint: ''
- restorePoint: ''
-fixedStack: []
-stack: []
-entry_values: []
-callSites: []
-debugValueSubstitutions: []
-constants: []
-machineFunctionInfo: {}
-body: |
- bb.0.bb:
- successors: %bb.4(0x30000000), %bb.5(0x50000000)
- liveins: $r0, $r1, $r2
-
- %9:intregs = COPY $r2
- %8:intregs = COPY $r1
- %7:intregs = COPY $r0
- %21:intregs = COPY %7
- %20:intregs = COPY %7
- %12:predregs = C2_cmpeqi %7, 0
- J2_jumpt %12, %bb.4, implicit-def $pc
-
- bb.5:
- successors: %bb.1(0x80000000)
-
- %11:intregs = A2_tfrsi 0
- J2_loop1r %bb.1, %21, implicit-def $lc1, implicit-def $sa1
-
- bb.1.bb3 (machine-block-address-taken):
- successors: %bb.2(0x80000000)
-
- %0:intregs = PHI %11, %bb.5, %6, %bb.3
- %13:intregs = M2_mpyi %0, %7
- %1:intregs = S2_addasl_rrri %8, %13, 1
- J2_loop0r %bb.2, %20, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
-
- bb.2.bb4 (machine-block-address-taken):
- successors: %bb.3(0x04000000), %bb.2(0x7c000000)
-
- %3:intregs = PHI %1, %bb.1, %5, %bb.2
- %14:intregs = L2_loadruh_io %3, 0 :: (load (s16) from %ir.phi5)
- %15:intregs = A2_add %14, %9
- %5:intregs = S2_storerh_pi %3, 2, %15 :: (store (s16) into %ir.phi5)
- ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
- J2_jump %bb.3, implicit-def dead $pc
-
- bb.3.bb10:
- successors: %bb.4(0x04000000), %bb.1(0x7c000000)
-
- %6:intregs = nuw A2_addi %0, 1
- ENDLOOP1 %bb.1, implicit-def $pc, implicit-def $lc1, implicit $sa1, implicit $lc1
- J2_jump %bb.4, implicit-def dead $pc
-
- bb.4.bb13:
- PS_jmpret $r31, implicit-def dead $pc
-
-...
diff --git a/llvm/test/CodeGen/Hexagon/postincopt-crash.mir b/llvm/test/CodeGen/Hexagon/postincopt-crash.mir
deleted file mode 100644
index e22053421791..000000000000
--- a/llvm/test/CodeGen/Hexagon/postincopt-crash.mir
+++ /dev/null
@@ -1,58 +0,0 @@
-# RUN: llc -march=hexagon -run-pass=hexagon-postincopt %s -o /dev/null
-# REQUIRES: asserts
-# Test that we do not hit unreachable code dealt with L4_ior_memoph_io.
-
-...
----
-name: foo
-alignment: 4
-tracksRegLiveness: true
-body: |
- bb.0.entry:
- successors: %bb.4(0x30000000), %bb.5(0x50000000)
- liveins: $r0, $r1, $r2
-
- %9:intregs = COPY $r2
- %8:intregs = COPY $r1
- %7:intregs = COPY $r0
- %21:intregs = COPY %7
- %20:intregs = COPY %7
- %12:predregs = C2_cmpeqi %7, 0
- J2_jumpt %12, %bb.4, implicit-def $pc
-
- bb.5:
- successors: %bb.1(0x80000000)
-
- %11:intregs = A2_tfrsi 0
- J2_loop1r %bb.1, %21, implicit-def $lc1, implicit-def $sa1
-
- bb.1:
- successors: %bb.2(0x80000000)
-
- %0:intregs = PHI %11, %bb.5, %6, %bb.3
- %13:intregs = M2_mpyi %0, %7
- %1:intregs = S2_addasl_rrri %8, %13, 1
- J2_loop0r %bb.2, %20, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
-
- bb.2:
- successors: %bb.3(0x04000000), %bb.2(0x7c000000)
-
- %3:intregs = PHI %1, %bb.1, %5, %bb.2
- %14:intregs = L2_loadruh_io %3, 0
- L4_ior_memoph_io %3:intregs, 0, 21
- %15:intregs = A2_add %14, %9
- %5:intregs = S2_storerh_pi %3, 2, %15
- ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
- J2_jump %bb.3, implicit-def dead $pc
-
- bb.3:
- successors: %bb.4(0x04000000), %bb.1(0x7c000000)
-
- %6:intregs = nuw A2_addi %0, 1
- ENDLOOP1 %bb.1, implicit-def $pc, implicit-def $lc1, implicit $sa1, implicit $lc1
- J2_jump %bb.4, implicit-def dead $pc
-
- bb.4:
- PS_jmpret $r31, implicit-def dead $pc
-
-...
diff --git a/llvm/test/CodeGen/Hexagon/postincopt-dcfetch.mir b/llvm/test/CodeGen/Hexagon/postincopt-dcfetch.mir
deleted file mode 100644
index 27d653c99f7b..000000000000
--- a/llvm/test/CodeGen/Hexagon/postincopt-dcfetch.mir
+++ /dev/null
@@ -1,19 +0,0 @@
-# RUN: llc -march=hexagon -run-pass hexagon-postincopt %s -o - | FileCheck %s
-# Check that this doesn't crash.
-# CHECK: Y2_dcfetchbo
-
-name: fred
-tracksRegLiveness: true
-body: |
- bb.0:
- successors: %bb.1
- %0:intregs = IMPLICIT_DEF
-
- bb.1:
- successors: %bb.1
-
- %1:intregs = PHI %0:intregs, %bb.0, %2:intregs, %bb.1
- Y2_dcfetchbo %1:intregs, 0
- %2:intregs = A2_addi %1:intregs, 1
- J2_jump %bb.1, implicit-def dead $pc
-...
diff --git a/llvm/test/CodeGen/Hexagon/valid-offset-loadbsw4.mir b/llvm/test/CodeGen/Hexagon/valid-offset-loadbsw4.mir
deleted file mode 100644
index fca42d547dfb..000000000000
--- a/llvm/test/CodeGen/Hexagon/valid-offset-loadbsw4.mir
+++ /dev/null
@@ -1,32 +0,0 @@
-# RUN: llc -march=hexagon -run-pass hexagon-postincopt -o - %s | FileCheck %s
-# REQUIRES: asserts
-
-# Check that this doesn't crash:
-# CHECK: L2_loadbsw4_io
-
----
-name: fred
-tracksRegLiveness: true
-liveins:
- - { reg: '$r0', virtual-reg: '%0' }
-body: |
- bb.0:
- successors: %bb.1(0x80000000)
- liveins: $r0
-
- %0:intregs = COPY $r0
- %1:intregs = A2_tfrsi 240
- %2:doubleregs = IMPLICIT_DEF
- %3:doubleregs = IMPLICIT_DEF
-
- bb.1:
- successors: %bb.1(0x80000000)
-
- %4:intregs = PHI %1, %bb.0, %5, %bb.1
- %6:doubleregs = L2_loadbsw4_io %4, 0
- %7:doubleregs = M2_vrmac_s0 %2, %6, %3
- S2_storeri_io %0, 0, %7.isub_lo
- %5:intregs = nuw A2_addi %4, 256
- J2_jump %bb.1, implicit-def dead $pc
-
-...
diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-permi.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-permi.ll
index 0d9f9daabc44..92669d2e5895 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-permi.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-permi.ll
@@ -36,3 +36,33 @@ entry:
%res = call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> %va, <32 x i8> %vb, i32 1)
ret <32 x i8> %res
}
+
+define <32 x i8> @lasx_xvpermi_q_204(<32 x i8> %va, <32 x i8> %vb) nounwind {
+; CHECK-LABEL: lasx_xvpermi_q_204:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 0
+; CHECK-NEXT: ret
+entry:
+ %res = call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> %va, <32 x i8> %vb, i32 204)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @lasx_xvpermi_q_221(<32 x i8> %va, <32 x i8> %vb) nounwind {
+; CHECK-LABEL: lasx_xvpermi_q_221:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 17
+; CHECK-NEXT: ret
+entry:
+ %res = call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> %va, <32 x i8> %vb, i32 221)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @lasx_xvpermi_q_255(<32 x i8> %va, <32 x i8> %vb) nounwind {
+; CHECK-LABEL: lasx_xvpermi_q_255:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 51
+; CHECK-NEXT: ret
+entry:
+ %res = call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> %va, <32 x i8> %vb, i32 255)
+ ret <32 x i8> %res
+}
diff --git a/llvm/test/CodeGen/Mips/inlineasm-constraints-softfloat.ll b/llvm/test/CodeGen/Mips/inlineasm-constraints-softfloat.ll
new file mode 100644
index 000000000000..705570f808ce
--- /dev/null
+++ b/llvm/test/CodeGen/Mips/inlineasm-constraints-softfloat.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -march=mips < %s | FileCheck %s --check-prefix=MIPS32
+; RUN: llc -march=mips64 < %s | FileCheck %s --check-prefix=MIPS64
+
+define dso_local void @read_double(ptr nocapture noundef readonly %0) local_unnamed_addr #0 {
+; MIPS32-LABEL: read_double:
+; MIPS32: # %bb.0:
+; MIPS32-NEXT: lw $2, 4($4)
+; MIPS32-NEXT: lw $3, 0($4)
+; MIPS32-NEXT: #APP
+; MIPS32-NEXT: #NO_APP
+; MIPS32-NEXT: jr $ra
+; MIPS32-NEXT: nop
+;
+; MIPS64-LABEL: read_double:
+; MIPS64: # %bb.0:
+; MIPS64-NEXT: ld $2, 0($4)
+; MIPS64-NEXT: #APP
+; MIPS64-NEXT: #NO_APP
+; MIPS64-NEXT: jr $ra
+; MIPS64-NEXT: nop
+ %2 = load double, ptr %0, align 8
+ tail call void asm sideeffect "", "r,~{$1}"(double %2)
+ ret void
+}
+
+define dso_local void @read_float(ptr nocapture noundef readonly %0) local_unnamed_addr #0 {
+; MIPS32-LABEL: read_float:
+; MIPS32: # %bb.0:
+; MIPS32-NEXT: lw $2, 0($4)
+; MIPS32-NEXT: #APP
+; MIPS32-NEXT: #NO_APP
+; MIPS32-NEXT: jr $ra
+; MIPS32-NEXT: nop
+;
+; MIPS64-LABEL: read_float:
+; MIPS64: # %bb.0:
+; MIPS64-NEXT: lw $2, 0($4)
+; MIPS64-NEXT: #APP
+; MIPS64-NEXT: #NO_APP
+; MIPS64-NEXT: jr $ra
+; MIPS64-NEXT: nop
+ %2 = load float, ptr %0, align 8
+ tail call void asm sideeffect "", "r,~{$1}"(float %2)
+ ret void
+}
+
+attributes #0 = { "target-features"="+soft-float" "use-soft-float"="true" }
diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
index 3880ac9f4ec6..9e93ad0043a7 100644
--- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll
+++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
@@ -2812,7 +2812,6 @@ define i16 @utesth_f16i16_mm(half %x) {
; RV64-NEXT: .cfi_offset ra, -8
; RV64-NEXT: call __extendhfsf2
; RV64-NEXT: fcvt.lu.s a0, fa0, rtz
-; RV64-NEXT: sext.w a0, a0
; RV64-NEXT: lui a1, 16
; RV64-NEXT: addiw a1, a1, -1
; RV64-NEXT: bltu a0, a1, .LBB43_2
diff --git a/llvm/test/CodeGen/RISCV/misched-postra-direction.mir b/llvm/test/CodeGen/RISCV/misched-postra-direction.mir
new file mode 100644
index 000000000000..841d0e6d65da
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/misched-postra-direction.mir
@@ -0,0 +1,53 @@
+# RUN: llc -mtriple=riscv64 -mcpu=sifive-x280 -run-pass=postmisched -enable-post-misched -debug-only=machine-scheduler -misched-dump-schedule-trace -misched-postra-direction=topdown -o - %s 2>&1 | FileCheck --check-prefix=TOPDOWN %s
+# RUN: llc -mtriple=riscv64 -mcpu=sifive-x280 -run-pass=postmisched -enable-post-misched -debug-only=machine-scheduler -misched-dump-schedule-trace -misched-postra-direction=bottomup -o - %s 2>&1 | FileCheck --check-prefix=BOTTOMUP %s
+
+# REQUIRES: asserts
+
+---
+name: test
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+
+ renamable $x12 = MUL renamable $x11, renamable $x10
+ renamable $x13 = ADD renamable $x11, renamable $x10
+ renamable $x14 = DIVW killed renamable $x12, killed renamable $x13
+ PseudoRET implicit $x14
+...
+
+# TOPDOWN: *** Final schedule for %bb.0 ***
+# TOPDOWN-NEXT: * Schedule table (TopDown):
+# TOPDOWN-NEXT: i: issue
+# TOPDOWN-NEXT: x: resource booked
+# TOPDOWN-NEXT: Cycle | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 |
+# TOPDOWN-NEXT: SU(0) | i | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+# TOPDOWN-NEXT: SiFive7PipeAB | x | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+# TOPDOWN-NEXT: SiFive7PipeB | x | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+# TOPDOWN-NEXT: SU(1) | i | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+# TOPDOWN-NEXT: SiFive7PipeAB | x | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+# TOPDOWN-NEXT: SU(2) | | | | i | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+# TOPDOWN-NEXT: SiFive7PipeAB | | | | x | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+# TOPDOWN-NEXT: SiFive7PipeB | | | | x | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+# TOPDOWN-NEXT: SiFive7IDiv | | | | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x |
+# TOPDOWN-NEXT: SU(0): renamable $x12 = MUL renamable $x11, renamable $x10
+# TOPDOWN-NEXT: SU(1): renamable $x13 = ADD renamable $x11, renamable $x10
+# TOPDOWN-NEXT: SU(2): renamable $x14 = DIVW renamable $x12, renamable $x13
+
+# BOTTOMUP: *** Final schedule for %bb.0 ***
+# BOTTOMUP-NEXT: * Schedule table (BottomUp):
+# BOTTOMUP-NEXT: i: issue
+# BOTTOMUP-NEXT: x: resource booked
+# BOTTOMUP-NEXT: Cycle | 37 | 36 | 35 | 34 | 33 | 32 | 31 | 30 | 29 | 28 | 27 | 26 | 25 | 24 | 23 | 22 | 21 | 20 | 19 | 18 | 17 | 16 | 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 |
+# BOTTOMUP-NEXT: SU(1) | i | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+# BOTTOMUP-NEXT: SiFive7PipeAB | x | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+# BOTTOMUP-NEXT: SU(0) | i | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+# BOTTOMUP-NEXT: SiFive7PipeAB | x | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+# BOTTOMUP-NEXT: SiFive7PipeB | x | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+# BOTTOMUP-NEXT: SU(2) | | | | i | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+# BOTTOMUP-NEXT: SiFive7PipeAB | | | | x | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+# BOTTOMUP-NEXT: SiFive7PipeB | | | | x | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+# BOTTOMUP-NEXT: SiFive7IDiv | | | | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x |
+# BOTTOMUP-NEXT: SU(1): renamable $x13 = ADD renamable $x11, renamable $x10
+# BOTTOMUP-NEXT: SU(0): renamable $x12 = MUL renamable $x11, renamable $x10
+# BOTTOMUP-NEXT: SU(2): renamable $x14 = DIVW renamable $x12, renamable $x13
diff --git a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll
index 1fe91c721f4d..1d025a2f776f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll
@@ -18,15 +18,15 @@ define void @test(ptr %addr) {
; CHECK-NEXT: add a2, a0, a1
; CHECK-NEXT: vl1re64.v v8, (a2)
; CHECK-NEXT: slli a2, a1, 1
-; CHECK-NEXT: vl1re64.v v9, (a0)
-; CHECK-NEXT: add a0, a0, a2
+; CHECK-NEXT: add a3, a0, a2
+; CHECK-NEXT: vl1re64.v v9, (a3)
; CHECK-NEXT: vl1re64.v v10, (a0)
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs1r.v v9, (a0)
; CHECK-NEXT: add a2, a0, a2
-; CHECK-NEXT: vs1r.v v10, (a2)
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vs1r.v v8, (a0)
+; CHECK-NEXT: vs1r.v v9, (a2)
+; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: vs1r.v v8, (a1)
+; CHECK-NEXT: vs1r.v v10, (a0)
; CHECK-NEXT: csrrs a0, vlenb, zero
; CHECK-NEXT: slli a0, a0, 2
; CHECK-NEXT: add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll
index a9a680d54d58..64031f8a9359 100644
--- a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll
@@ -16,13 +16,13 @@ define <vscale x 1 x double> @test(ptr %addr, i64 %vl) {
; CHECK-NEXT: sub sp, sp, a2
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; CHECK-NEXT: csrrs a2, vlenb, zero
-; CHECK-NEXT: vl1re64.v v8, (a0)
-; CHECK-NEXT: add a0, a0, a2
+; CHECK-NEXT: add a3, a0, a2
+; CHECK-NEXT: vl1re64.v v8, (a3)
; CHECK-NEXT: vl1re64.v v9, (a0)
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs1r.v v8, (a0)
; CHECK-NEXT: add a2, a0, a2
-; CHECK-NEXT: vs1r.v v9, (a2)
+; CHECK-NEXT: vs1r.v v8, (a2)
+; CHECK-NEXT: vs1r.v v9, (a0)
; CHECK-NEXT: vl1re64.v v8, (a2)
; CHECK-NEXT: vl1re64.v v9, (a0)
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll
index 3a73f1729ded..c310274d6850 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll
@@ -606,7 +606,7 @@ define <vscale x 16 x i1> @ctpop_nxv16i32_ult_two(<vscale x 16 x i32> %va) {
; CHECK-ZVBB-NEXT: vmsleu.vi v0, v8, 1
; CHECK-ZVBB-NEXT: ret
%a = call <vscale x 16 x i32> @llvm.ctpop.nxv16i32(<vscale x 16 x i32> %va)
- %cmp = icmp ult <vscale x 16 x i32> %a, shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 2, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+ %cmp = icmp ult <vscale x 16 x i32> %a, splat (i32 2)
ret <vscale x 16 x i1> %cmp
}
@@ -626,7 +626,7 @@ define <vscale x 16 x i1> @ctpop_nxv16i32_ugt_one(<vscale x 16 x i32> %va) {
; CHECK-ZVBB-NEXT: vmsgtu.vi v0, v8, 1
; CHECK-ZVBB-NEXT: ret
%a = call <vscale x 16 x i32> @llvm.ctpop.nxv16i32(<vscale x 16 x i32> %va)
- %cmp = icmp ugt <vscale x 16 x i32> %a, shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+ %cmp = icmp ugt <vscale x 16 x i32> %a, splat (i32 1)
ret <vscale x 16 x i1> %cmp
}
@@ -646,7 +646,7 @@ define <vscale x 16 x i1> @ctpop_nxv16i32_eq_one(<vscale x 16 x i32> %va) {
; CHECK-ZVBB-NEXT: vmseq.vi v0, v8, 1
; CHECK-ZVBB-NEXT: ret
%a = call <vscale x 16 x i32> @llvm.ctpop.nxv16i32(<vscale x 16 x i32> %va)
- %cmp = icmp eq <vscale x 16 x i32> %a, shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+ %cmp = icmp eq <vscale x 16 x i32> %a, splat (i32 1)
ret <vscale x 16 x i1> %cmp
}
@@ -666,7 +666,7 @@ define <vscale x 16 x i1> @ctpop_nxv16i32_ne_one(<vscale x 16 x i32> %va) {
; CHECK-ZVBB-NEXT: vmsne.vi v0, v8, 1
; CHECK-ZVBB-NEXT: ret
%a = call <vscale x 16 x i32> @llvm.ctpop.nxv16i32(<vscale x 16 x i32> %va)
- %cmp = icmp ne <vscale x 16 x i32> %a, shufflevector (<vscale x 16 x i32> insertelement (<vscale x 16 x i32> poison, i32 1, i64 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer)
+ %cmp = icmp ne <vscale x 16 x i32> %a, splat (i32 1)
ret <vscale x 16 x i1> %cmp
}
@@ -1020,7 +1020,7 @@ define <vscale x 8 x i1> @ctpop_nxv8i64_ult_two(<vscale x 8 x i64> %va) {
; CHECK-ZVBB-NEXT: vmsleu.vi v0, v8, 1
; CHECK-ZVBB-NEXT: ret
%a = call <vscale x 8 x i64> @llvm.ctpop.nxv8i64(<vscale x 8 x i64> %va)
- %cmp = icmp ult <vscale x 8 x i64> %a, shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 2, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
+ %cmp = icmp ult <vscale x 8 x i64> %a, splat (i64 2)
ret <vscale x 8 x i1> %cmp
}
@@ -1040,7 +1040,7 @@ define <vscale x 8 x i1> @ctpop_nxv8i64_ugt_one(<vscale x 8 x i64> %va) {
; CHECK-ZVBB-NEXT: vmsgtu.vi v0, v8, 1
; CHECK-ZVBB-NEXT: ret
%a = call <vscale x 8 x i64> @llvm.ctpop.nxv8i64(<vscale x 8 x i64> %va)
- %cmp = icmp ugt <vscale x 8 x i64> %a, shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
+ %cmp = icmp ugt <vscale x 8 x i64> %a, splat (i64 1)
ret <vscale x 8 x i1> %cmp
}
@@ -1060,7 +1060,7 @@ define <vscale x 8 x i1> @ctpop_nxv8i64_eq_one(<vscale x 8 x i64> %va) {
; CHECK-ZVBB-NEXT: vmseq.vi v0, v8, 1
; CHECK-ZVBB-NEXT: ret
%a = call <vscale x 8 x i64> @llvm.ctpop.nxv8i64(<vscale x 8 x i64> %va)
- %cmp = icmp eq <vscale x 8 x i64> %a, shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
+ %cmp = icmp eq <vscale x 8 x i64> %a, splat (i64 1)
ret <vscale x 8 x i1> %cmp
}
@@ -1080,7 +1080,7 @@ define <vscale x 8 x i1> @ctpop_nxv8i64_ne_one(<vscale x 8 x i64> %va) {
; CHECK-ZVBB-NEXT: vmsne.vi v0, v8, 1
; CHECK-ZVBB-NEXT: ret
%a = call <vscale x 8 x i64> @llvm.ctpop.nxv8i64(<vscale x 8 x i64> %va)
- %cmp = icmp ne <vscale x 8 x i64> %a, shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
+ %cmp = icmp ne <vscale x 8 x i64> %a, splat (i64 1)
ret <vscale x 8 x i1> %cmp
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
index a2d02b6bb641..76aa2b913c65 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
@@ -474,7 +474,7 @@ define <vscale x 6 x half> @extract_nxv6f16_nxv12f16_6(<vscale x 12 x half> %in)
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v12, v9, a0
; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v12, v10, a0
; CHECK-NEXT: vmv2r.v v8, v12
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
index c49b1a7ad186..b9c611bf3e54 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
@@ -1,6 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V
-; RUN: llc -mtriple=riscv64 -mattr=+m,+v -riscv-v-vector-bits-max=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-KNOWNVLEN128
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA
+
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA
+
+; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s
define void @extract_v2i8_v4i8_0(ptr %x, ptr %y) {
; CHECK-LABEL: extract_v2i8_v4i8_0:
@@ -63,22 +69,22 @@ define void @extract_v2i8_v8i8_6(ptr %x, ptr %y) {
}
define void @extract_v1i32_v8i32_4(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v1i32_v8i32_4:
-; CHECK-V: # %bb.0:
-; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT: vle32.v v8, (a0)
-; CHECK-V-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-V-NEXT: vslidedown.vi v8, v8, 4
-; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-V-NEXT: vse32.v v8, (a1)
-; CHECK-V-NEXT: ret
+; VLA-LABEL: extract_v1i32_v8i32_4:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT: vle32.v v8, (a0)
+; VLA-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; VLA-NEXT: vslidedown.vi v8, v8, 4
+; VLA-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; VLA-NEXT: vse32.v v8, (a1)
+; VLA-NEXT: ret
;
-; CHECK-KNOWNVLEN128-LABEL: extract_v1i32_v8i32_4:
-; CHECK-KNOWNVLEN128: # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT: vl2re32.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vse32.v v9, (a1)
-; CHECK-KNOWNVLEN128-NEXT: ret
+; VLS-LABEL: extract_v1i32_v8i32_4:
+; VLS: # %bb.0:
+; VLS-NEXT: vl2re32.v v8, (a0)
+; VLS-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; VLS-NEXT: vse32.v v9, (a1)
+; VLS-NEXT: ret
%a = load <8 x i32>, ptr %x
%c = call <1 x i32> @llvm.vector.extract.v1i32.v8i32(<8 x i32> %a, i64 4)
store <1 x i32> %c, ptr %y
@@ -86,24 +92,24 @@ define void @extract_v1i32_v8i32_4(ptr %x, ptr %y) {
}
define void @extract_v1i32_v8i32_5(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v1i32_v8i32_5:
-; CHECK-V: # %bb.0:
-; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT: vle32.v v8, (a0)
-; CHECK-V-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-V-NEXT: vslidedown.vi v8, v8, 5
-; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-V-NEXT: vse32.v v8, (a1)
-; CHECK-V-NEXT: ret
+; VLA-LABEL: extract_v1i32_v8i32_5:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT: vle32.v v8, (a0)
+; VLA-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; VLA-NEXT: vslidedown.vi v8, v8, 5
+; VLA-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; VLA-NEXT: vse32.v v8, (a1)
+; VLA-NEXT: ret
;
-; CHECK-KNOWNVLEN128-LABEL: extract_v1i32_v8i32_5:
-; CHECK-KNOWNVLEN128: # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT: vl2re32.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v9, 1
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vse32.v v8, (a1)
-; CHECK-KNOWNVLEN128-NEXT: ret
+; VLS-LABEL: extract_v1i32_v8i32_5:
+; VLS: # %bb.0:
+; VLS-NEXT: vl2re32.v v8, (a0)
+; VLS-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; VLS-NEXT: vslidedown.vi v8, v9, 1
+; VLS-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; VLS-NEXT: vse32.v v8, (a1)
+; VLS-NEXT: ret
%a = load <8 x i32>, ptr %x
%c = call <1 x i32> @llvm.vector.extract.v1i32.v8i32(<8 x i32> %a, i64 5)
store <1 x i32> %c, ptr %y
@@ -111,20 +117,20 @@ define void @extract_v1i32_v8i32_5(ptr %x, ptr %y) {
}
define void @extract_v2i32_v8i32_0(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i32_v8i32_0:
-; CHECK-V: # %bb.0:
-; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT: vle32.v v8, (a0)
-; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-V-NEXT: vse32.v v8, (a1)
-; CHECK-V-NEXT: ret
+; VLA-LABEL: extract_v2i32_v8i32_0:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT: vle32.v v8, (a0)
+; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT: vse32.v v8, (a1)
+; VLA-NEXT: ret
;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_v8i32_0:
-; CHECK-KNOWNVLEN128: # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT: vl2re32.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vse32.v v8, (a1)
-; CHECK-KNOWNVLEN128-NEXT: ret
+; VLS-LABEL: extract_v2i32_v8i32_0:
+; VLS: # %bb.0:
+; VLS-NEXT: vl2re32.v v8, (a0)
+; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT: vse32.v v8, (a1)
+; VLS-NEXT: ret
%a = load <8 x i32>, ptr %x
%c = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> %a, i64 0)
store <2 x i32> %c, ptr %y
@@ -132,24 +138,24 @@ define void @extract_v2i32_v8i32_0(ptr %x, ptr %y) {
}
define void @extract_v2i32_v8i32_2(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i32_v8i32_2:
-; CHECK-V: # %bb.0:
-; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT: vle32.v v8, (a0)
-; CHECK-V-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-V-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-V-NEXT: vse32.v v8, (a1)
-; CHECK-V-NEXT: ret
+; VLA-LABEL: extract_v2i32_v8i32_2:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT: vle32.v v8, (a0)
+; VLA-NEXT: vsetivli zero, 2, e32, m1, ta, ma
+; VLA-NEXT: vslidedown.vi v8, v8, 2
+; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT: vse32.v v8, (a1)
+; VLA-NEXT: ret
;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_v8i32_2:
-; CHECK-KNOWNVLEN128: # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT: vl2re32.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vse32.v v8, (a1)
-; CHECK-KNOWNVLEN128-NEXT: ret
+; VLS-LABEL: extract_v2i32_v8i32_2:
+; VLS: # %bb.0:
+; VLS-NEXT: vl2re32.v v8, (a0)
+; VLS-NEXT: vsetivli zero, 2, e32, m1, ta, ma
+; VLS-NEXT: vslidedown.vi v8, v8, 2
+; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT: vse32.v v8, (a1)
+; VLS-NEXT: ret
%a = load <8 x i32>, ptr %x
%c = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> %a, i64 2)
store <2 x i32> %c, ptr %y
@@ -157,22 +163,22 @@ define void @extract_v2i32_v8i32_2(ptr %x, ptr %y) {
}
define void @extract_v2i32_v8i32_4(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i32_v8i32_4:
-; CHECK-V: # %bb.0:
-; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT: vle32.v v8, (a0)
-; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, ta, ma
-; CHECK-V-NEXT: vslidedown.vi v8, v8, 4
-; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-V-NEXT: vse32.v v8, (a1)
-; CHECK-V-NEXT: ret
+; VLA-LABEL: extract_v2i32_v8i32_4:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT: vle32.v v8, (a0)
+; VLA-NEXT: vsetivli zero, 2, e32, m2, ta, ma
+; VLA-NEXT: vslidedown.vi v8, v8, 4
+; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT: vse32.v v8, (a1)
+; VLA-NEXT: ret
;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_v8i32_4:
-; CHECK-KNOWNVLEN128: # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT: vl2re32.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vse32.v v9, (a1)
-; CHECK-KNOWNVLEN128-NEXT: ret
+; VLS-LABEL: extract_v2i32_v8i32_4:
+; VLS: # %bb.0:
+; VLS-NEXT: vl2re32.v v8, (a0)
+; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT: vse32.v v9, (a1)
+; VLS-NEXT: ret
%a = load <8 x i32>, ptr %x
%c = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> %a, i64 4)
store <2 x i32> %c, ptr %y
@@ -180,24 +186,24 @@ define void @extract_v2i32_v8i32_4(ptr %x, ptr %y) {
}
define void @extract_v2i32_v8i32_6(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i32_v8i32_6:
-; CHECK-V: # %bb.0:
-; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT: vle32.v v8, (a0)
-; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, ta, ma
-; CHECK-V-NEXT: vslidedown.vi v8, v8, 6
-; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-V-NEXT: vse32.v v8, (a1)
-; CHECK-V-NEXT: ret
+; VLA-LABEL: extract_v2i32_v8i32_6:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT: vle32.v v8, (a0)
+; VLA-NEXT: vsetivli zero, 2, e32, m2, ta, ma
+; VLA-NEXT: vslidedown.vi v8, v8, 6
+; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT: vse32.v v8, (a1)
+; VLA-NEXT: ret
;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_v8i32_6:
-; CHECK-KNOWNVLEN128: # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT: vl2re32.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v9, 2
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vse32.v v8, (a1)
-; CHECK-KNOWNVLEN128-NEXT: ret
+; VLS-LABEL: extract_v2i32_v8i32_6:
+; VLS: # %bb.0:
+; VLS-NEXT: vl2re32.v v8, (a0)
+; VLS-NEXT: vsetivli zero, 2, e32, m1, ta, ma
+; VLS-NEXT: vslidedown.vi v8, v9, 2
+; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT: vse32.v v8, (a1)
+; VLS-NEXT: ret
%a = load <8 x i32>, ptr %x
%c = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> %a, i64 6)
store <2 x i32> %c, ptr %y
@@ -230,59 +236,59 @@ define void @extract_v2i32_nxv16i32_2(<vscale x 16 x i32> %x, ptr %y) {
}
define void @extract_v2i32_nxv16i32_4(<vscale x 16 x i32> %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i32_nxv16i32_4:
-; CHECK-V: # %bb.0:
-; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, ta, ma
-; CHECK-V-NEXT: vslidedown.vi v8, v8, 4
-; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-V-NEXT: vse32.v v8, (a0)
-; CHECK-V-NEXT: ret
+; VLA-LABEL: extract_v2i32_nxv16i32_4:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetivli zero, 2, e32, m2, ta, ma
+; VLA-NEXT: vslidedown.vi v8, v8, 4
+; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT: vse32.v v8, (a0)
+; VLA-NEXT: ret
;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_nxv16i32_4:
-; CHECK-KNOWNVLEN128: # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vse32.v v9, (a0)
-; CHECK-KNOWNVLEN128-NEXT: ret
+; VLS-LABEL: extract_v2i32_nxv16i32_4:
+; VLS: # %bb.0:
+; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT: vse32.v v9, (a0)
+; VLS-NEXT: ret
%c = call <2 x i32> @llvm.vector.extract.v2i32.nxv16i32(<vscale x 16 x i32> %x, i64 4)
store <2 x i32> %c, ptr %y
ret void
}
define void @extract_v2i32_nxv16i32_6(<vscale x 16 x i32> %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i32_nxv16i32_6:
-; CHECK-V: # %bb.0:
-; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, ta, ma
-; CHECK-V-NEXT: vslidedown.vi v8, v8, 6
-; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-V-NEXT: vse32.v v8, (a0)
-; CHECK-V-NEXT: ret
+; VLA-LABEL: extract_v2i32_nxv16i32_6:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetivli zero, 2, e32, m2, ta, ma
+; VLA-NEXT: vslidedown.vi v8, v8, 6
+; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT: vse32.v v8, (a0)
+; VLA-NEXT: ret
;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_nxv16i32_6:
-; CHECK-KNOWNVLEN128: # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v9, 2
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vse32.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT: ret
+; VLS-LABEL: extract_v2i32_nxv16i32_6:
+; VLS: # %bb.0:
+; VLS-NEXT: vsetivli zero, 2, e32, m1, ta, ma
+; VLS-NEXT: vslidedown.vi v8, v9, 2
+; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT: vse32.v v8, (a0)
+; VLS-NEXT: ret
%c = call <2 x i32> @llvm.vector.extract.v2i32.nxv16i32(<vscale x 16 x i32> %x, i64 6)
store <2 x i32> %c, ptr %y
ret void
}
define void @extract_v2i32_nxv16i32_8(<vscale x 16 x i32> %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i32_nxv16i32_8:
-; CHECK-V: # %bb.0:
-; CHECK-V-NEXT: vsetivli zero, 2, e32, m4, ta, ma
-; CHECK-V-NEXT: vslidedown.vi v8, v8, 8
-; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-V-NEXT: vse32.v v8, (a0)
-; CHECK-V-NEXT: ret
+; VLA-LABEL: extract_v2i32_nxv16i32_8:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetivli zero, 2, e32, m4, ta, ma
+; VLA-NEXT: vslidedown.vi v8, v8, 8
+; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT: vse32.v v8, (a0)
+; VLA-NEXT: ret
;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_nxv16i32_8:
-; CHECK-KNOWNVLEN128: # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vse32.v v10, (a0)
-; CHECK-KNOWNVLEN128-NEXT: ret
+; VLS-LABEL: extract_v2i32_nxv16i32_8:
+; VLS: # %bb.0:
+; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT: vse32.v v10, (a0)
+; VLS-NEXT: ret
%c = call <2 x i32> @llvm.vector.extract.v2i32.nxv16i32(<vscale x 16 x i32> %x, i64 8)
store <2 x i32> %c, ptr %y
ret void
@@ -339,40 +345,40 @@ define void @extract_v2i8_nxv2i8_6(<vscale x 2 x i8> %x, ptr %y) {
}
define void @extract_v8i32_nxv16i32_8(<vscale x 16 x i32> %x, ptr %y) {
-; CHECK-V-LABEL: extract_v8i32_nxv16i32_8:
-; CHECK-V: # %bb.0:
-; CHECK-V-NEXT: vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-V-NEXT: vslidedown.vi v8, v8, 8
-; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT: vse32.v v8, (a0)
-; CHECK-V-NEXT: ret
+; VLA-LABEL: extract_v8i32_nxv16i32_8:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetivli zero, 8, e32, m4, ta, ma
+; VLA-NEXT: vslidedown.vi v8, v8, 8
+; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT: vse32.v v8, (a0)
+; VLA-NEXT: ret
;
-; CHECK-KNOWNVLEN128-LABEL: extract_v8i32_nxv16i32_8:
-; CHECK-KNOWNVLEN128: # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT: vs2r.v v10, (a0)
-; CHECK-KNOWNVLEN128-NEXT: ret
+; VLS-LABEL: extract_v8i32_nxv16i32_8:
+; VLS: # %bb.0:
+; VLS-NEXT: vs2r.v v10, (a0)
+; VLS-NEXT: ret
%c = call <8 x i32> @llvm.vector.extract.v8i32.nxv16i32(<vscale x 16 x i32> %x, i64 8)
store <8 x i32> %c, ptr %y
ret void
}
define void @extract_v8i1_v64i1_0(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v8i1_v64i1_0:
-; CHECK-V: # %bb.0:
-; CHECK-V-NEXT: li a2, 64
-; CHECK-V-NEXT: vsetvli zero, a2, e8, m4, ta, ma
-; CHECK-V-NEXT: vlm.v v8, (a0)
-; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT: vsm.v v8, (a1)
-; CHECK-V-NEXT: ret
+; VLA-LABEL: extract_v8i1_v64i1_0:
+; VLA: # %bb.0:
+; VLA-NEXT: li a2, 64
+; VLA-NEXT: vsetvli zero, a2, e8, m4, ta, ma
+; VLA-NEXT: vlm.v v8, (a0)
+; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT: vsm.v v8, (a1)
+; VLA-NEXT: ret
;
-; CHECK-KNOWNVLEN128-LABEL: extract_v8i1_v64i1_0:
-; CHECK-KNOWNVLEN128: # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT: vsetvli a2, zero, e8, m4, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vlm.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a1)
-; CHECK-KNOWNVLEN128-NEXT: ret
+; VLS-LABEL: extract_v8i1_v64i1_0:
+; VLS: # %bb.0:
+; VLS-NEXT: vsetvli a2, zero, e8, m4, ta, ma
+; VLS-NEXT: vlm.v v8, (a0)
+; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT: vsm.v v8, (a1)
+; VLS-NEXT: ret
%a = load <64 x i1>, ptr %x
%c = call <8 x i1> @llvm.vector.extract.v8i1.v64i1(<64 x i1> %a, i64 0)
store <8 x i1> %c, ptr %y
@@ -380,26 +386,26 @@ define void @extract_v8i1_v64i1_0(ptr %x, ptr %y) {
}
define void @extract_v8i1_v64i1_8(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v8i1_v64i1_8:
-; CHECK-V: # %bb.0:
-; CHECK-V-NEXT: li a2, 64
-; CHECK-V-NEXT: vsetvli zero, a2, e8, m4, ta, ma
-; CHECK-V-NEXT: vlm.v v8, (a0)
-; CHECK-V-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-V-NEXT: vslidedown.vi v8, v8, 1
-; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT: vsm.v v8, (a1)
-; CHECK-V-NEXT: ret
+; VLA-LABEL: extract_v8i1_v64i1_8:
+; VLA: # %bb.0:
+; VLA-NEXT: li a2, 64
+; VLA-NEXT: vsetvli zero, a2, e8, m4, ta, ma
+; VLA-NEXT: vlm.v v8, (a0)
+; VLA-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
+; VLA-NEXT: vslidedown.vi v8, v8, 1
+; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT: vsm.v v8, (a1)
+; VLA-NEXT: ret
;
-; CHECK-KNOWNVLEN128-LABEL: extract_v8i1_v64i1_8:
-; CHECK-KNOWNVLEN128: # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT: vsetvli a2, zero, e8, m4, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vlm.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v8, 1
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a1)
-; CHECK-KNOWNVLEN128-NEXT: ret
+; VLS-LABEL: extract_v8i1_v64i1_8:
+; VLS: # %bb.0:
+; VLS-NEXT: vsetvli a2, zero, e8, m4, ta, ma
+; VLS-NEXT: vlm.v v8, (a0)
+; VLS-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
+; VLS-NEXT: vslidedown.vi v8, v8, 1
+; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT: vsm.v v8, (a1)
+; VLS-NEXT: ret
%a = load <64 x i1>, ptr %x
%c = call <8 x i1> @llvm.vector.extract.v8i1.v64i1(<64 x i1> %a, i64 8)
store <8 x i1> %c, ptr %y
@@ -407,26 +413,26 @@ define void @extract_v8i1_v64i1_8(ptr %x, ptr %y) {
}
define void @extract_v8i1_v64i1_48(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v8i1_v64i1_48:
-; CHECK-V: # %bb.0:
-; CHECK-V-NEXT: li a2, 64
-; CHECK-V-NEXT: vsetvli zero, a2, e8, m4, ta, ma
-; CHECK-V-NEXT: vlm.v v8, (a0)
-; CHECK-V-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-V-NEXT: vslidedown.vi v8, v8, 6
-; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT: vsm.v v8, (a1)
-; CHECK-V-NEXT: ret
+; VLA-LABEL: extract_v8i1_v64i1_48:
+; VLA: # %bb.0:
+; VLA-NEXT: li a2, 64
+; VLA-NEXT: vsetvli zero, a2, e8, m4, ta, ma
+; VLA-NEXT: vlm.v v8, (a0)
+; VLA-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
+; VLA-NEXT: vslidedown.vi v8, v8, 6
+; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT: vsm.v v8, (a1)
+; VLA-NEXT: ret
;
-; CHECK-KNOWNVLEN128-LABEL: extract_v8i1_v64i1_48:
-; CHECK-KNOWNVLEN128: # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT: vsetvli a2, zero, e8, m4, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vlm.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v8, 6
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a1)
-; CHECK-KNOWNVLEN128-NEXT: ret
+; VLS-LABEL: extract_v8i1_v64i1_48:
+; VLS: # %bb.0:
+; VLS-NEXT: vsetvli a2, zero, e8, m4, ta, ma
+; VLS-NEXT: vlm.v v8, (a0)
+; VLS-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
+; VLS-NEXT: vslidedown.vi v8, v8, 6
+; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT: vsm.v v8, (a1)
+; VLS-NEXT: ret
%a = load <64 x i1>, ptr %x
%c = call <8 x i1> @llvm.vector.extract.v8i1.v64i1(<64 x i1> %a, i64 48)
store <8 x i1> %c, ptr %y
@@ -508,38 +514,38 @@ define void @extract_v8i1_nxv64i1_192(<vscale x 64 x i1> %x, ptr %y) {
}
define void @extract_v2i1_v64i1_0(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i1_v64i1_0:
-; CHECK-V: # %bb.0:
-; CHECK-V-NEXT: li a2, 64
-; CHECK-V-NEXT: vsetvli zero, a2, e8, m4, ta, ma
-; CHECK-V-NEXT: vlm.v v0, (a0)
-; CHECK-V-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-V-NEXT: vmv.v.i v8, 0
-; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT: vmv.v.i v9, 0
-; CHECK-V-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-V-NEXT: vmv.v.v v9, v8
-; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT: vmsne.vi v8, v9, 0
-; CHECK-V-NEXT: vsm.v v8, (a1)
-; CHECK-V-NEXT: ret
+; VLA-LABEL: extract_v2i1_v64i1_0:
+; VLA: # %bb.0:
+; VLA-NEXT: li a2, 64
+; VLA-NEXT: vsetvli zero, a2, e8, m4, ta, ma
+; VLA-NEXT: vlm.v v0, (a0)
+; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; VLA-NEXT: vmv.v.i v8, 0
+; VLA-NEXT: vmerge.vim v8, v8, 1, v0
+; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT: vmv.v.i v9, 0
+; VLA-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
+; VLA-NEXT: vmv.v.v v9, v8
+; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT: vmsne.vi v8, v9, 0
+; VLA-NEXT: vsm.v v8, (a1)
+; VLA-NEXT: ret
;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_v64i1_0:
-; CHECK-KNOWNVLEN128: # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT: vsetvli a2, zero, e8, m4, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vlm.v v0, (a0)
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0
-; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v9, 0
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-KNOWNVLEN128-NEXT: vmv.v.v v9, v8
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v8, v9, 0
-; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a1)
-; CHECK-KNOWNVLEN128-NEXT: ret
+; VLS-LABEL: extract_v2i1_v64i1_0:
+; VLS: # %bb.0:
+; VLS-NEXT: vsetvli a2, zero, e8, m4, ta, ma
+; VLS-NEXT: vlm.v v0, (a0)
+; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; VLS-NEXT: vmv.v.i v8, 0
+; VLS-NEXT: vmerge.vim v8, v8, 1, v0
+; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT: vmv.v.i v9, 0
+; VLS-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
+; VLS-NEXT: vmv.v.v v9, v8
+; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT: vmsne.vi v8, v9, 0
+; VLS-NEXT: vsm.v v8, (a1)
+; VLS-NEXT: ret
%a = load <64 x i1>, ptr %x
%c = call <2 x i1> @llvm.vector.extract.v2i1.v64i1(<64 x i1> %a, i64 0)
store <2 x i1> %c, ptr %y
@@ -547,48 +553,48 @@ define void @extract_v2i1_v64i1_0(ptr %x, ptr %y) {
}
define void @extract_v2i1_v64i1_2(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i1_v64i1_2:
-; CHECK-V: # %bb.0:
-; CHECK-V-NEXT: li a2, 64
-; CHECK-V-NEXT: vsetvli zero, a2, e8, m4, ta, ma
-; CHECK-V-NEXT: vlm.v v0, (a0)
-; CHECK-V-NEXT: vmv.v.i v8, 0
-; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-V-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; CHECK-V-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-V-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-V-NEXT: vmsne.vi v0, v8, 0
-; CHECK-V-NEXT: vmv.v.i v8, 0
-; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT: vmv.v.i v9, 0
-; CHECK-V-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-V-NEXT: vmv.v.v v9, v8
-; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT: vmsne.vi v8, v9, 0
-; CHECK-V-NEXT: vsm.v v8, (a1)
-; CHECK-V-NEXT: ret
+; VLA-LABEL: extract_v2i1_v64i1_2:
+; VLA: # %bb.0:
+; VLA-NEXT: li a2, 64
+; VLA-NEXT: vsetvli zero, a2, e8, m4, ta, ma
+; VLA-NEXT: vlm.v v0, (a0)
+; VLA-NEXT: vmv.v.i v8, 0
+; VLA-NEXT: vmerge.vim v8, v8, 1, v0
+; VLA-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; VLA-NEXT: vslidedown.vi v8, v8, 2
+; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; VLA-NEXT: vmsne.vi v0, v8, 0
+; VLA-NEXT: vmv.v.i v8, 0
+; VLA-NEXT: vmerge.vim v8, v8, 1, v0
+; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT: vmv.v.i v9, 0
+; VLA-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
+; VLA-NEXT: vmv.v.v v9, v8
+; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT: vmsne.vi v8, v9, 0
+; VLA-NEXT: vsm.v v8, (a1)
+; VLA-NEXT: ret
;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_v64i1_2:
-; CHECK-KNOWNVLEN128: # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT: vsetvli a2, zero, e8, m4, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vlm.v v0, (a0)
-; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0
-; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v0, v8, 0
-; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0
-; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v9, 0
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-KNOWNVLEN128-NEXT: vmv.v.v v9, v8
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v8, v9, 0
-; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a1)
-; CHECK-KNOWNVLEN128-NEXT: ret
+; VLS-LABEL: extract_v2i1_v64i1_2:
+; VLS: # %bb.0:
+; VLS-NEXT: vsetvli a2, zero, e8, m4, ta, ma
+; VLS-NEXT: vlm.v v0, (a0)
+; VLS-NEXT: vmv.v.i v8, 0
+; VLS-NEXT: vmerge.vim v8, v8, 1, v0
+; VLS-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; VLS-NEXT: vslidedown.vi v8, v8, 2
+; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; VLS-NEXT: vmsne.vi v0, v8, 0
+; VLS-NEXT: vmv.v.i v8, 0
+; VLS-NEXT: vmerge.vim v8, v8, 1, v0
+; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT: vmv.v.i v9, 0
+; VLS-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
+; VLS-NEXT: vmv.v.v v9, v8
+; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT: vmsne.vi v8, v9, 0
+; VLS-NEXT: vsm.v v8, (a1)
+; VLS-NEXT: ret
%a = load <64 x i1>, ptr %x
%c = call <2 x i1> @llvm.vector.extract.v2i1.v64i1(<64 x i1> %a, i64 2)
store <2 x i1> %c, ptr %y
@@ -596,49 +602,49 @@ define void @extract_v2i1_v64i1_2(ptr %x, ptr %y) {
}
define void @extract_v2i1_v64i1_42(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i1_v64i1_42:
-; CHECK-V: # %bb.0:
-; CHECK-V-NEXT: li a2, 64
-; CHECK-V-NEXT: vsetvli zero, a2, e8, m4, ta, ma
-; CHECK-V-NEXT: vlm.v v0, (a0)
-; CHECK-V-NEXT: vmv.v.i v8, 0
-; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-V-NEXT: li a0, 42
-; CHECK-V-NEXT: vsetivli zero, 2, e8, m4, ta, ma
-; CHECK-V-NEXT: vslidedown.vx v8, v8, a0
-; CHECK-V-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-V-NEXT: vmsne.vi v0, v8, 0
-; CHECK-V-NEXT: vmv.v.i v8, 0
-; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT: vmv.v.i v9, 0
-; CHECK-V-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-V-NEXT: vmv.v.v v9, v8
-; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT: vmsne.vi v8, v9, 0
-; CHECK-V-NEXT: vsm.v v8, (a1)
-; CHECK-V-NEXT: ret
+; VLA-LABEL: extract_v2i1_v64i1_42:
+; VLA: # %bb.0:
+; VLA-NEXT: li a2, 64
+; VLA-NEXT: vsetvli zero, a2, e8, m4, ta, ma
+; VLA-NEXT: vlm.v v0, (a0)
+; VLA-NEXT: vmv.v.i v8, 0
+; VLA-NEXT: vmerge.vim v8, v8, 1, v0
+; VLA-NEXT: li a0, 42
+; VLA-NEXT: vsetivli zero, 2, e8, m4, ta, ma
+; VLA-NEXT: vslidedown.vx v8, v8, a0
+; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; VLA-NEXT: vmsne.vi v0, v8, 0
+; VLA-NEXT: vmv.v.i v8, 0
+; VLA-NEXT: vmerge.vim v8, v8, 1, v0
+; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT: vmv.v.i v9, 0
+; VLA-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
+; VLA-NEXT: vmv.v.v v9, v8
+; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT: vmsne.vi v8, v9, 0
+; VLA-NEXT: vsm.v v8, (a1)
+; VLA-NEXT: ret
;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_v64i1_42:
-; CHECK-KNOWNVLEN128: # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT: vsetvli a2, zero, e8, m4, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vlm.v v0, (a0)
-; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0
-; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v10, 10
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v0, v8, 0
-; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0
-; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v9, 0
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-KNOWNVLEN128-NEXT: vmv.v.v v9, v8
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v8, v9, 0
-; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a1)
-; CHECK-KNOWNVLEN128-NEXT: ret
+; VLS-LABEL: extract_v2i1_v64i1_42:
+; VLS: # %bb.0:
+; VLS-NEXT: vsetvli a2, zero, e8, m4, ta, ma
+; VLS-NEXT: vlm.v v0, (a0)
+; VLS-NEXT: vmv.v.i v8, 0
+; VLS-NEXT: vmerge.vim v8, v8, 1, v0
+; VLS-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; VLS-NEXT: vslidedown.vi v8, v10, 10
+; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; VLS-NEXT: vmsne.vi v0, v8, 0
+; VLS-NEXT: vmv.v.i v8, 0
+; VLS-NEXT: vmerge.vim v8, v8, 1, v0
+; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT: vmv.v.i v9, 0
+; VLS-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
+; VLS-NEXT: vmv.v.v v9, v8
+; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT: vmsne.vi v8, v9, 0
+; VLS-NEXT: vsm.v v8, (a1)
+; VLS-NEXT: ret
%a = load <64 x i1>, ptr %x
%c = call <2 x i1> @llvm.vector.extract.v2i1.v64i1(<64 x i1> %a, i64 42)
store <2 x i1> %c, ptr %y
@@ -665,45 +671,45 @@ define void @extract_v2i1_nxv2i1_0(<vscale x 2 x i1> %x, ptr %y) {
}
define void @extract_v2i1_nxv2i1_2(<vscale x 2 x i1> %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i1_nxv2i1_2:
-; CHECK-V: # %bb.0:
-; CHECK-V-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-V-NEXT: vmv.v.i v8, 0
-; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-V-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-V-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-V-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-V-NEXT: vmsne.vi v0, v8, 0
-; CHECK-V-NEXT: vmv.v.i v8, 0
-; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT: vmv.v.i v9, 0
-; CHECK-V-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-V-NEXT: vmv.v.v v9, v8
-; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT: vmsne.vi v8, v9, 0
-; CHECK-V-NEXT: vsm.v v8, (a0)
-; CHECK-V-NEXT: ret
+; VLA-LABEL: extract_v2i1_nxv2i1_2:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; VLA-NEXT: vmv.v.i v8, 0
+; VLA-NEXT: vmerge.vim v8, v8, 1, v0
+; VLA-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; VLA-NEXT: vslidedown.vi v8, v8, 2
+; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; VLA-NEXT: vmsne.vi v0, v8, 0
+; VLA-NEXT: vmv.v.i v8, 0
+; VLA-NEXT: vmerge.vim v8, v8, 1, v0
+; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT: vmv.v.i v9, 0
+; VLA-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
+; VLA-NEXT: vmv.v.v v9, v8
+; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT: vmsne.vi v8, v9, 0
+; VLA-NEXT: vsm.v v8, (a0)
+; VLA-NEXT: ret
;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_nxv2i1_2:
-; CHECK-KNOWNVLEN128: # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0
-; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v8, 2
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v0, v8, 0
-; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0
-; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v9, 0
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-KNOWNVLEN128-NEXT: vmv.v.v v9, v8
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v8, v9, 0
-; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT: ret
+; VLS-LABEL: extract_v2i1_nxv2i1_2:
+; VLS: # %bb.0:
+; VLS-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; VLS-NEXT: vmv.v.i v8, 0
+; VLS-NEXT: vmerge.vim v8, v8, 1, v0
+; VLS-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; VLS-NEXT: vslidedown.vi v8, v8, 2
+; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; VLS-NEXT: vmsne.vi v0, v8, 0
+; VLS-NEXT: vmv.v.i v8, 0
+; VLS-NEXT: vmerge.vim v8, v8, 1, v0
+; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT: vmv.v.i v9, 0
+; VLS-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
+; VLS-NEXT: vmv.v.v v9, v8
+; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT: vmsne.vi v8, v9, 0
+; VLS-NEXT: vsm.v v8, (a0)
+; VLS-NEXT: ret
%c = call <2 x i1> @llvm.vector.extract.v2i1.nxv2i1(<vscale x 2 x i1> %x, i64 2)
store <2 x i1> %c, ptr %y
ret void
@@ -754,91 +760,91 @@ define void @extract_v2i1_nxv64i1_2(<vscale x 64 x i1> %x, ptr %y) {
}
define void @extract_v2i1_nxv64i1_42(<vscale x 64 x i1> %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i1_nxv64i1_42:
-; CHECK-V: # %bb.0:
-; CHECK-V-NEXT: vsetvli a1, zero, e8, m8, ta, ma
-; CHECK-V-NEXT: vmv.v.i v8, 0
-; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-V-NEXT: li a1, 42
-; CHECK-V-NEXT: vsetivli zero, 2, e8, m4, ta, ma
-; CHECK-V-NEXT: vslidedown.vx v8, v8, a1
-; CHECK-V-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-V-NEXT: vmsne.vi v0, v8, 0
-; CHECK-V-NEXT: vmv.v.i v8, 0
-; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT: vmv.v.i v9, 0
-; CHECK-V-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-V-NEXT: vmv.v.v v9, v8
-; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT: vmsne.vi v8, v9, 0
-; CHECK-V-NEXT: vsm.v v8, (a0)
-; CHECK-V-NEXT: ret
+; VLA-LABEL: extract_v2i1_nxv64i1_42:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetvli a1, zero, e8, m8, ta, ma
+; VLA-NEXT: vmv.v.i v8, 0
+; VLA-NEXT: vmerge.vim v8, v8, 1, v0
+; VLA-NEXT: li a1, 42
+; VLA-NEXT: vsetivli zero, 2, e8, m4, ta, ma
+; VLA-NEXT: vslidedown.vx v8, v8, a1
+; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; VLA-NEXT: vmsne.vi v0, v8, 0
+; VLA-NEXT: vmv.v.i v8, 0
+; VLA-NEXT: vmerge.vim v8, v8, 1, v0
+; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT: vmv.v.i v9, 0
+; VLA-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
+; VLA-NEXT: vmv.v.v v9, v8
+; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT: vmsne.vi v8, v9, 0
+; VLA-NEXT: vsm.v v8, (a0)
+; VLA-NEXT: ret
;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_nxv64i1_42:
-; CHECK-KNOWNVLEN128: # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT: vsetvli a1, zero, e8, m8, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0
-; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v10, 10
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v0, v8, 0
-; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0
-; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v9, 0
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-KNOWNVLEN128-NEXT: vmv.v.v v9, v8
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v8, v9, 0
-; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT: ret
+; VLS-LABEL: extract_v2i1_nxv64i1_42:
+; VLS: # %bb.0:
+; VLS-NEXT: vsetvli a1, zero, e8, m8, ta, ma
+; VLS-NEXT: vmv.v.i v8, 0
+; VLS-NEXT: vmerge.vim v8, v8, 1, v0
+; VLS-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; VLS-NEXT: vslidedown.vi v8, v10, 10
+; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; VLS-NEXT: vmsne.vi v0, v8, 0
+; VLS-NEXT: vmv.v.i v8, 0
+; VLS-NEXT: vmerge.vim v8, v8, 1, v0
+; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT: vmv.v.i v9, 0
+; VLS-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
+; VLS-NEXT: vmv.v.v v9, v8
+; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT: vmsne.vi v8, v9, 0
+; VLS-NEXT: vsm.v v8, (a0)
+; VLS-NEXT: ret
%c = call <2 x i1> @llvm.vector.extract.v2i1.nxv64i1(<vscale x 64 x i1> %x, i64 42)
store <2 x i1> %c, ptr %y
ret void
}
define void @extract_v2i1_nxv32i1_26(<vscale x 32 x i1> %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i1_nxv32i1_26:
-; CHECK-V: # %bb.0:
-; CHECK-V-NEXT: vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-V-NEXT: vmv.v.i v8, 0
-; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-V-NEXT: vsetivli zero, 2, e8, m2, ta, ma
-; CHECK-V-NEXT: vslidedown.vi v8, v8, 26
-; CHECK-V-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-V-NEXT: vmsne.vi v0, v8, 0
-; CHECK-V-NEXT: vmv.v.i v8, 0
-; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT: vmv.v.i v9, 0
-; CHECK-V-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-V-NEXT: vmv.v.v v9, v8
-; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT: vmsne.vi v8, v9, 0
-; CHECK-V-NEXT: vsm.v v8, (a0)
-; CHECK-V-NEXT: ret
+; VLA-LABEL: extract_v2i1_nxv32i1_26:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; VLA-NEXT: vmv.v.i v8, 0
+; VLA-NEXT: vmerge.vim v8, v8, 1, v0
+; VLA-NEXT: vsetivli zero, 2, e8, m2, ta, ma
+; VLA-NEXT: vslidedown.vi v8, v8, 26
+; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; VLA-NEXT: vmsne.vi v0, v8, 0
+; VLA-NEXT: vmv.v.i v8, 0
+; VLA-NEXT: vmerge.vim v8, v8, 1, v0
+; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT: vmv.v.i v9, 0
+; VLA-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
+; VLA-NEXT: vmv.v.v v9, v8
+; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT: vmsne.vi v8, v9, 0
+; VLA-NEXT: vsm.v v8, (a0)
+; VLA-NEXT: ret
;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_nxv32i1_26:
-; CHECK-KNOWNVLEN128: # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT: vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0
-; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v9, 10
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v0, v8, 0
-; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0
-; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v9, 0
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-KNOWNVLEN128-NEXT: vmv.v.v v9, v8
-; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v8, v9, 0
-; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT: ret
+; VLS-LABEL: extract_v2i1_nxv32i1_26:
+; VLS: # %bb.0:
+; VLS-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; VLS-NEXT: vmv.v.i v8, 0
+; VLS-NEXT: vmerge.vim v8, v8, 1, v0
+; VLS-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; VLS-NEXT: vslidedown.vi v8, v9, 10
+; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; VLS-NEXT: vmsne.vi v0, v8, 0
+; VLS-NEXT: vmv.v.i v8, 0
+; VLS-NEXT: vmerge.vim v8, v8, 1, v0
+; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT: vmv.v.i v9, 0
+; VLS-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
+; VLS-NEXT: vmv.v.v v9, v8
+; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT: vmsne.vi v8, v9, 0
+; VLS-NEXT: vsm.v v8, (a0)
+; VLS-NEXT: ret
%c = call <2 x i1> @llvm.vector.extract.v2i1.nxv32i1(<vscale x 32 x i1> %x, i64 26)
store <2 x i1> %c, ptr %y
ret void
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
index efb1f720f2d0..9f0240c53b21 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -1,9 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV32VLA
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV64VLA
-; RUN: llc -mtriple=riscv32 -mattr=+m,+v -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+m,+v -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV32VLA
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV64VLA
+
+; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS,RV32VLS %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS,RV64VLS %s
define <vscale x 8 x i32> @insert_nxv8i32_v2i32_0(<vscale x 8 x i32> %vec, ptr %svp) {
; CHECK-LABEL: insert_nxv8i32_v2i32_0:
@@ -45,26 +48,40 @@ define <vscale x 8 x i32> @insert_nxv8i32_v2i32_6(<vscale x 8 x i32> %vec, ptr %
}
define <vscale x 8 x i32> @insert_nxv8i32_v8i32_0(<vscale x 8 x i32> %vec, ptr %svp) {
-; CHECK-LABEL: insert_nxv8i32_v8i32_0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vle32.v v12, (a0)
-; CHECK-NEXT: vsetivli zero, 8, e32, m4, tu, ma
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
+; VLA-LABEL: insert_nxv8i32_v8i32_0:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT: vle32.v v12, (a0)
+; VLA-NEXT: vsetivli zero, 8, e32, m4, tu, ma
+; VLA-NEXT: vmv.v.v v8, v12
+; VLA-NEXT: ret
+;
+; VLS-LABEL: insert_nxv8i32_v8i32_0:
+; VLS: # %bb.0:
+; VLS-NEXT: vl2re32.v v12, (a0)
+; VLS-NEXT: vsetivli zero, 8, e32, m4, tu, ma
+; VLS-NEXT: vmv.v.v v8, v12
+; VLS-NEXT: ret
%sv = load <8 x i32>, ptr %svp
%v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 0)
ret <vscale x 8 x i32> %v
}
define <vscale x 8 x i32> @insert_nxv8i32_v8i32_8(<vscale x 8 x i32> %vec, ptr %svp) {
-; CHECK-LABEL: insert_nxv8i32_v8i32_8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vle32.v v12, (a0)
-; CHECK-NEXT: vsetivli zero, 16, e32, m4, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v12, 8
-; CHECK-NEXT: ret
+; VLA-LABEL: insert_nxv8i32_v8i32_8:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT: vle32.v v12, (a0)
+; VLA-NEXT: vsetivli zero, 16, e32, m4, tu, ma
+; VLA-NEXT: vslideup.vi v8, v12, 8
+; VLA-NEXT: ret
+;
+; VLS-LABEL: insert_nxv8i32_v8i32_8:
+; VLS: # %bb.0:
+; VLS-NEXT: vl2re32.v v12, (a0)
+; VLS-NEXT: vsetivli zero, 16, e32, m4, tu, ma
+; VLS-NEXT: vslideup.vi v8, v12, 8
+; VLS-NEXT: ret
%sv = load <8 x i32>, ptr %svp
%v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 8)
ret <vscale x 8 x i32> %v
@@ -82,17 +99,27 @@ define <vscale x 8 x i32> @insert_nxv8i32_undef_v2i32_0(ptr %svp) {
}
define void @insert_v4i32_v2i32_0(ptr %vp, ptr %svp) {
-; CHECK-LABEL: insert_v4i32_v2i32_0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a1)
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vle32.v v9, (a0)
-; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT: vmv.v.v v9, v8
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vse32.v v9, (a0)
-; CHECK-NEXT: ret
+; VLA-LABEL: insert_v4i32_v2i32_0:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT: vle32.v v8, (a1)
+; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT: vle32.v v9, (a0)
+; VLA-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; VLA-NEXT: vmv.v.v v9, v8
+; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT: vse32.v v9, (a0)
+; VLA-NEXT: ret
+;
+; VLS-LABEL: insert_v4i32_v2i32_0:
+; VLS: # %bb.0:
+; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT: vle32.v v8, (a1)
+; VLS-NEXT: vl1re32.v v9, (a0)
+; VLS-NEXT: vsetivli zero, 2, e32, m1, tu, ma
+; VLS-NEXT: vmv.v.v v9, v8
+; VLS-NEXT: vs1r.v v9, (a0)
+; VLS-NEXT: ret
%sv = load <2 x i32>, ptr %svp
%vec = load <4 x i32>, ptr %vp
%v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> %vec, <2 x i32> %sv, i64 0)
@@ -101,15 +128,25 @@ define void @insert_v4i32_v2i32_0(ptr %vp, ptr %svp) {
}
define void @insert_v4i32_v2i32_2(ptr %vp, ptr %svp) {
-; CHECK-LABEL: insert_v4i32_v2i32_2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a1)
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vle32.v v9, (a0)
-; CHECK-NEXT: vslideup.vi v9, v8, 2
-; CHECK-NEXT: vse32.v v9, (a0)
-; CHECK-NEXT: ret
+; VLA-LABEL: insert_v4i32_v2i32_2:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT: vle32.v v8, (a1)
+; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT: vle32.v v9, (a0)
+; VLA-NEXT: vslideup.vi v9, v8, 2
+; VLA-NEXT: vse32.v v9, (a0)
+; VLA-NEXT: ret
+;
+; VLS-LABEL: insert_v4i32_v2i32_2:
+; VLS: # %bb.0:
+; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT: vle32.v v8, (a1)
+; VLS-NEXT: vl1re32.v v9, (a0)
+; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT: vslideup.vi v9, v8, 2
+; VLS-NEXT: vs1r.v v9, (a0)
+; VLS-NEXT: ret
%sv = load <2 x i32>, ptr %svp
%vec = load <4 x i32>, ptr %vp
%v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> %vec, <2 x i32> %sv, i64 2)
@@ -118,13 +155,20 @@ define void @insert_v4i32_v2i32_2(ptr %vp, ptr %svp) {
}
define void @insert_v4i32_undef_v2i32_0(ptr %vp, ptr %svp) {
-; CHECK-LABEL: insert_v4i32_undef_v2i32_0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a1)
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: ret
+; VLA-LABEL: insert_v4i32_undef_v2i32_0:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT: vle32.v v8, (a1)
+; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT: vse32.v v8, (a0)
+; VLA-NEXT: ret
+;
+; VLS-LABEL: insert_v4i32_undef_v2i32_0:
+; VLS: # %bb.0:
+; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT: vle32.v v8, (a1)
+; VLS-NEXT: vs1r.v v8, (a0)
+; VLS-NEXT: ret
%sv = load <2 x i32>, ptr %svp
%v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> undef, <2 x i32> %sv, i64 0)
store <4 x i32> %v, ptr %vp
@@ -132,17 +176,27 @@ define void @insert_v4i32_undef_v2i32_0(ptr %vp, ptr %svp) {
}
define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) {
-; CHECK-LABEL: insert_v8i32_v2i32_0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a1)
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vle32.v v10, (a0)
-; CHECK-NEXT: vsetivli zero, 2, e32, m2, tu, ma
-; CHECK-NEXT: vmv.v.v v10, v8
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vse32.v v10, (a0)
-; CHECK-NEXT: ret
+; VLA-LABEL: insert_v8i32_v2i32_0:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT: vle32.v v8, (a1)
+; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT: vle32.v v10, (a0)
+; VLA-NEXT: vsetivli zero, 2, e32, m2, tu, ma
+; VLA-NEXT: vmv.v.v v10, v8
+; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT: vse32.v v10, (a0)
+; VLA-NEXT: ret
+;
+; VLS-LABEL: insert_v8i32_v2i32_0:
+; VLS: # %bb.0:
+; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT: vle32.v v8, (a1)
+; VLS-NEXT: vl2re32.v v10, (a0)
+; VLS-NEXT: vsetivli zero, 2, e32, m2, tu, ma
+; VLS-NEXT: vmv.v.v v10, v8
+; VLS-NEXT: vs2r.v v10, (a0)
+; VLS-NEXT: ret
%sv = load <2 x i32>, ptr %svp
%vec = load <8 x i32>, ptr %vp
%v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 0)
@@ -151,17 +205,27 @@ define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) {
}
define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
-; CHECK-LABEL: insert_v8i32_v2i32_2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a1)
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vle32.v v10, (a0)
-; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v10, v8, 2
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vse32.v v10, (a0)
-; CHECK-NEXT: ret
+; VLA-LABEL: insert_v8i32_v2i32_2:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT: vle32.v v8, (a1)
+; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT: vle32.v v10, (a0)
+; VLA-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; VLA-NEXT: vslideup.vi v10, v8, 2
+; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT: vse32.v v10, (a0)
+; VLA-NEXT: ret
+;
+; VLS-LABEL: insert_v8i32_v2i32_2:
+; VLS: # %bb.0:
+; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT: vl2re32.v v8, (a0)
+; VLS-NEXT: vle32.v v10, (a1)
+; VLS-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; VLS-NEXT: vslideup.vi v8, v10, 2
+; VLS-NEXT: vs2r.v v8, (a0)
+; VLS-NEXT: ret
%sv = load <2 x i32>, ptr %svp
%vec = load <8 x i32>, ptr %vp
%v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 2)
@@ -170,15 +234,25 @@ define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
}
define void @insert_v8i32_v2i32_6(ptr %vp, ptr %svp) {
-; CHECK-LABEL: insert_v8i32_v2i32_6:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a1)
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vle32.v v10, (a0)
-; CHECK-NEXT: vslideup.vi v10, v8, 6
-; CHECK-NEXT: vse32.v v10, (a0)
-; CHECK-NEXT: ret
+; VLA-LABEL: insert_v8i32_v2i32_6:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT: vle32.v v8, (a1)
+; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT: vle32.v v10, (a0)
+; VLA-NEXT: vslideup.vi v10, v8, 6
+; VLA-NEXT: vse32.v v10, (a0)
+; VLA-NEXT: ret
+;
+; VLS-LABEL: insert_v8i32_v2i32_6:
+; VLS: # %bb.0:
+; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT: vl2re32.v v8, (a0)
+; VLS-NEXT: vle32.v v10, (a1)
+; VLS-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; VLS-NEXT: vslideup.vi v8, v10, 6
+; VLS-NEXT: vs2r.v v8, (a0)
+; VLS-NEXT: ret
%sv = load <2 x i32>, ptr %svp
%vec = load <8 x i32>, ptr %vp
%v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 6)
@@ -187,14 +261,23 @@ define void @insert_v8i32_v2i32_6(ptr %vp, ptr %svp) {
}
define void @insert_v8i32_undef_v2i32_6(ptr %vp, ptr %svp) {
-; CHECK-LABEL: insert_v8i32_undef_v2i32_6:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a1)
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vslideup.vi v10, v8, 6
-; CHECK-NEXT: vse32.v v10, (a0)
-; CHECK-NEXT: ret
+; VLA-LABEL: insert_v8i32_undef_v2i32_6:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT: vle32.v v8, (a1)
+; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT: vslideup.vi v10, v8, 6
+; VLA-NEXT: vse32.v v10, (a0)
+; VLA-NEXT: ret
+;
+; VLS-LABEL: insert_v8i32_undef_v2i32_6:
+; VLS: # %bb.0:
+; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT: vle32.v v8, (a1)
+; VLS-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; VLS-NEXT: vslideup.vi v10, v8, 6
+; VLS-NEXT: vs2r.v v10, (a0)
+; VLS-NEXT: ret
%sv = load <2 x i32>, ptr %svp
%v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> undef, <2 x i32> %sv, i64 6)
store <8 x i32> %v, ptr %vp
@@ -239,18 +322,30 @@ define void @insert_v4i16_v2i16_2(ptr %vp, ptr %svp) {
}
define void @insert_v32i1_v8i1_0(ptr %vp, ptr %svp) {
-; CHECK-LABEL: insert_v32i1_v8i1_0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a2, 32
-; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma
-; CHECK-NEXT: vlm.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vlm.v v9, (a1)
-; CHECK-NEXT: vsetivli zero, 1, e8, mf4, tu, ma
-; CHECK-NEXT: vmv.v.v v8, v9
-; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma
-; CHECK-NEXT: vsm.v v8, (a0)
-; CHECK-NEXT: ret
+; VLA-LABEL: insert_v32i1_v8i1_0:
+; VLA: # %bb.0:
+; VLA-NEXT: li a2, 32
+; VLA-NEXT: vsetvli zero, a2, e8, m2, ta, ma
+; VLA-NEXT: vlm.v v8, (a0)
+; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT: vlm.v v9, (a1)
+; VLA-NEXT: vsetivli zero, 1, e8, mf4, tu, ma
+; VLA-NEXT: vmv.v.v v8, v9
+; VLA-NEXT: vsetvli zero, a2, e8, m2, ta, ma
+; VLA-NEXT: vsm.v v8, (a0)
+; VLA-NEXT: ret
+;
+; VLS-LABEL: insert_v32i1_v8i1_0:
+; VLS: # %bb.0:
+; VLS-NEXT: vsetvli a2, zero, e8, m2, ta, ma
+; VLS-NEXT: vlm.v v8, (a0)
+; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT: vlm.v v9, (a1)
+; VLS-NEXT: vsetivli zero, 1, e8, mf4, tu, ma
+; VLS-NEXT: vmv.v.v v8, v9
+; VLS-NEXT: vsetvli a1, zero, e8, m2, ta, ma
+; VLS-NEXT: vsm.v v8, (a0)
+; VLS-NEXT: ret
%v = load <32 x i1>, ptr %vp
%sv = load <8 x i1>, ptr %svp
%c = call <32 x i1> @llvm.vector.insert.v8i1.v32i1(<32 x i1> %v, <8 x i1> %sv, i64 0)
@@ -259,18 +354,30 @@ define void @insert_v32i1_v8i1_0(ptr %vp, ptr %svp) {
}
define void @insert_v32i1_v8i1_16(ptr %vp, ptr %svp) {
-; CHECK-LABEL: insert_v32i1_v8i1_16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a2, 32
-; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma
-; CHECK-NEXT: vlm.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vlm.v v9, (a1)
-; CHECK-NEXT: vsetivli zero, 3, e8, mf4, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v9, 2
-; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma
-; CHECK-NEXT: vsm.v v8, (a0)
-; CHECK-NEXT: ret
+; VLA-LABEL: insert_v32i1_v8i1_16:
+; VLA: # %bb.0:
+; VLA-NEXT: li a2, 32
+; VLA-NEXT: vsetvli zero, a2, e8, m2, ta, ma
+; VLA-NEXT: vlm.v v8, (a0)
+; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT: vlm.v v9, (a1)
+; VLA-NEXT: vsetivli zero, 3, e8, mf4, tu, ma
+; VLA-NEXT: vslideup.vi v8, v9, 2
+; VLA-NEXT: vsetvli zero, a2, e8, m2, ta, ma
+; VLA-NEXT: vsm.v v8, (a0)
+; VLA-NEXT: ret
+;
+; VLS-LABEL: insert_v32i1_v8i1_16:
+; VLS: # %bb.0:
+; VLS-NEXT: vsetvli a2, zero, e8, m2, ta, ma
+; VLS-NEXT: vlm.v v8, (a0)
+; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT: vlm.v v9, (a1)
+; VLS-NEXT: vsetivli zero, 3, e8, mf4, tu, ma
+; VLS-NEXT: vslideup.vi v8, v9, 2
+; VLS-NEXT: vsetvli a1, zero, e8, m2, ta, ma
+; VLS-NEXT: vsm.v v8, (a0)
+; VLS-NEXT: ret
%v = load <32 x i1>, ptr %vp
%sv = load <8 x i1>, ptr %svp
%c = call <32 x i1> @llvm.vector.insert.v8i1.v32i1(<32 x i1> %v, <8 x i1> %sv, i64 16)
@@ -358,22 +465,36 @@ define <vscale x 2 x i16> @insert_nxv2i16_v2i16_2(<vscale x 2 x i16> %v, ptr %sv
}
define <vscale x 2 x i1> @insert_nxv2i1_v4i1_0(<vscale x 2 x i1> %v, ptr %svp) {
-; CHECK-LABEL: insert_nxv2i1_v4i1_0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT: vlm.v v8, (a0)
-; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vmv.v.i v9, 0
-; CHECK-NEXT: vmerge.vim v9, v9, 1, v0
-; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT: vmv.v.i v10, 0
-; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT: vsetvli zero, zero, e8, mf4, tu, ma
-; CHECK-NEXT: vmv.v.v v9, v8
-; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vmsne.vi v0, v9, 0
-; CHECK-NEXT: ret
+; VLA-LABEL: insert_nxv2i1_v4i1_0:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; VLA-NEXT: vlm.v v8, (a0)
+; VLA-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; VLA-NEXT: vmv.v.i v9, 0
+; VLA-NEXT: vmerge.vim v9, v9, 1, v0
+; VLA-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; VLA-NEXT: vmv.v.i v10, 0
+; VLA-NEXT: vmv1r.v v0, v8
+; VLA-NEXT: vmerge.vim v8, v10, 1, v0
+; VLA-NEXT: vsetvli zero, zero, e8, mf4, tu, ma
+; VLA-NEXT: vmv.v.v v9, v8
+; VLA-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; VLA-NEXT: vmsne.vi v0, v9, 0
+; VLA-NEXT: ret
+;
+; VLS-LABEL: insert_nxv2i1_v4i1_0:
+; VLS: # %bb.0:
+; VLS-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; VLS-NEXT: vlm.v v8, (a0)
+; VLS-NEXT: vmv.v.i v9, 0
+; VLS-NEXT: vmerge.vim v10, v9, 1, v0
+; VLS-NEXT: vmv1r.v v0, v8
+; VLS-NEXT: vmerge.vim v8, v9, 1, v0
+; VLS-NEXT: vsetvli zero, zero, e8, mf4, tu, ma
+; VLS-NEXT: vmv.v.v v10, v8
+; VLS-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; VLS-NEXT: vmsne.vi v0, v10, 0
+; VLS-NEXT: ret
%sv = load <4 x i1>, ptr %svp
%c = call <vscale x 2 x i1> @llvm.vector.insert.v4i1.nxv2i1(<vscale x 2 x i1> %v, <4 x i1> %sv, i64 0)
ret <vscale x 2 x i1> %c
@@ -408,15 +529,24 @@ define <vscale x 8 x i1> @insert_nxv8i1_v8i1_16(<vscale x 8 x i1> %v, ptr %svp)
declare <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64>, <2 x i64>, i64)
define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, ptr %out) {
-; CHECK-LABEL: insert_v2i64_nxv16i64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vle64.v v16, (a1)
-; CHECK-NEXT: vsetivli zero, 6, e64, m8, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v16, 4
-; CHECK-NEXT: vs8r.v v8, (a2)
-; CHECK-NEXT: ret
+; VLA-LABEL: insert_v2i64_nxv16i64:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; VLA-NEXT: vle64.v v8, (a0)
+; VLA-NEXT: vle64.v v16, (a1)
+; VLA-NEXT: vsetivli zero, 6, e64, m8, tu, ma
+; VLA-NEXT: vslideup.vi v8, v16, 4
+; VLA-NEXT: vs8r.v v8, (a2)
+; VLA-NEXT: ret
+;
+; VLS-LABEL: insert_v2i64_nxv16i64:
+; VLS: # %bb.0:
+; VLS-NEXT: vl1re64.v v8, (a0)
+; VLS-NEXT: vl1re64.v v16, (a1)
+; VLS-NEXT: vsetivli zero, 6, e64, m8, tu, ma
+; VLS-NEXT: vslideup.vi v8, v16, 4
+; VLS-NEXT: vs8r.v v8, (a2)
+; VLS-NEXT: ret
%sv0 = load <2 x i64>, ptr %psv0
%sv1 = load <2 x i64>, ptr %psv1
%v0 = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv0, i64 0)
@@ -426,12 +556,18 @@ define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, ptr %out) {
}
define void @insert_v2i64_nxv16i64_lo0(ptr %psv, ptr %out) {
-; CHECK-LABEL: insert_v2i64_nxv16i64_lo0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vs8r.v v8, (a1)
-; CHECK-NEXT: ret
+; VLA-LABEL: insert_v2i64_nxv16i64_lo0:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; VLA-NEXT: vle64.v v8, (a0)
+; VLA-NEXT: vs8r.v v8, (a1)
+; VLA-NEXT: ret
+;
+; VLS-LABEL: insert_v2i64_nxv16i64_lo0:
+; VLS: # %bb.0:
+; VLS-NEXT: vl1re64.v v8, (a0)
+; VLS-NEXT: vs8r.v v8, (a1)
+; VLS-NEXT: ret
%sv = load <2 x i64>, ptr %psv
%v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 0)
store <vscale x 16 x i64> %v, ptr %out
@@ -439,14 +575,22 @@ define void @insert_v2i64_nxv16i64_lo0(ptr %psv, ptr %out) {
}
define void @insert_v2i64_nxv16i64_lo2(ptr %psv, ptr %out) {
-; CHECK-LABEL: insert_v2i64_nxv16i64_lo2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 4, e64, m8, ta, ma
-; CHECK-NEXT: vslideup.vi v16, v8, 2
-; CHECK-NEXT: vs8r.v v16, (a1)
-; CHECK-NEXT: ret
+; VLA-LABEL: insert_v2i64_nxv16i64_lo2:
+; VLA: # %bb.0:
+; VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; VLA-NEXT: vle64.v v8, (a0)
+; VLA-NEXT: vsetivli zero, 4, e64, m8, ta, ma
+; VLA-NEXT: vslideup.vi v16, v8, 2
+; VLA-NEXT: vs8r.v v16, (a1)
+; VLA-NEXT: ret
+;
+; VLS-LABEL: insert_v2i64_nxv16i64_lo2:
+; VLS: # %bb.0:
+; VLS-NEXT: vl1re64.v v8, (a0)
+; VLS-NEXT: vsetivli zero, 4, e64, m8, ta, ma
+; VLS-NEXT: vslideup.vi v16, v8, 2
+; VLS-NEXT: vs8r.v v16, (a1)
+; VLS-NEXT: ret
%sv = load <2 x i64>, ptr %psv
%v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 2)
store <vscale x 16 x i64> %v, ptr %out
@@ -521,6 +665,127 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) {
; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 80
; RV64-NEXT: ret
+; RV32VLA-LABEL: insert_v2i64_nxv16i64_hi:
+; RV32VLA: # %bb.0:
+; RV32VLA-NEXT: addi sp, sp, -80
+; RV32VLA-NEXT: .cfi_def_cfa_offset 80
+; RV32VLA-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; RV32VLA-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; RV32VLA-NEXT: .cfi_offset ra, -4
+; RV32VLA-NEXT: .cfi_offset s0, -8
+; RV32VLA-NEXT: addi s0, sp, 80
+; RV32VLA-NEXT: .cfi_def_cfa s0, 0
+; RV32VLA-NEXT: csrr a2, vlenb
+; RV32VLA-NEXT: slli a2, a2, 4
+; RV32VLA-NEXT: sub sp, sp, a2
+; RV32VLA-NEXT: andi sp, sp, -64
+; RV32VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32VLA-NEXT: vle64.v v8, (a0)
+; RV32VLA-NEXT: addi a0, sp, 128
+; RV32VLA-NEXT: vse64.v v8, (a0)
+; RV32VLA-NEXT: csrr a0, vlenb
+; RV32VLA-NEXT: slli a0, a0, 3
+; RV32VLA-NEXT: addi a2, sp, 64
+; RV32VLA-NEXT: add a3, a2, a0
+; RV32VLA-NEXT: vl8re64.v v8, (a3)
+; RV32VLA-NEXT: vl8re64.v v16, (a2)
+; RV32VLA-NEXT: add a0, a1, a0
+; RV32VLA-NEXT: vs8r.v v8, (a0)
+; RV32VLA-NEXT: vs8r.v v16, (a1)
+; RV32VLA-NEXT: addi sp, s0, -80
+; RV32VLA-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; RV32VLA-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; RV32VLA-NEXT: addi sp, sp, 80
+; RV32VLA-NEXT: ret
+;
+; RV64VLA-LABEL: insert_v2i64_nxv16i64_hi:
+; RV64VLA: # %bb.0:
+; RV64VLA-NEXT: addi sp, sp, -80
+; RV64VLA-NEXT: .cfi_def_cfa_offset 80
+; RV64VLA-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; RV64VLA-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; RV64VLA-NEXT: .cfi_offset ra, -8
+; RV64VLA-NEXT: .cfi_offset s0, -16
+; RV64VLA-NEXT: addi s0, sp, 80
+; RV64VLA-NEXT: .cfi_def_cfa s0, 0
+; RV64VLA-NEXT: csrr a2, vlenb
+; RV64VLA-NEXT: slli a2, a2, 4
+; RV64VLA-NEXT: sub sp, sp, a2
+; RV64VLA-NEXT: andi sp, sp, -64
+; RV64VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64VLA-NEXT: vle64.v v8, (a0)
+; RV64VLA-NEXT: addi a0, sp, 128
+; RV64VLA-NEXT: vse64.v v8, (a0)
+; RV64VLA-NEXT: csrr a0, vlenb
+; RV64VLA-NEXT: slli a0, a0, 3
+; RV64VLA-NEXT: addi a2, sp, 64
+; RV64VLA-NEXT: add a3, a2, a0
+; RV64VLA-NEXT: vl8re64.v v8, (a3)
+; RV64VLA-NEXT: vl8re64.v v16, (a2)
+; RV64VLA-NEXT: add a0, a1, a0
+; RV64VLA-NEXT: vs8r.v v8, (a0)
+; RV64VLA-NEXT: vs8r.v v16, (a1)
+; RV64VLA-NEXT: addi sp, s0, -80
+; RV64VLA-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; RV64VLA-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; RV64VLA-NEXT: addi sp, sp, 80
+; RV64VLA-NEXT: ret
+;
+; RV32VLS-LABEL: insert_v2i64_nxv16i64_hi:
+; RV32VLS: # %bb.0:
+; RV32VLS-NEXT: addi sp, sp, -80
+; RV32VLS-NEXT: .cfi_def_cfa_offset 80
+; RV32VLS-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; RV32VLS-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; RV32VLS-NEXT: .cfi_offset ra, -4
+; RV32VLS-NEXT: .cfi_offset s0, -8
+; RV32VLS-NEXT: addi s0, sp, 80
+; RV32VLS-NEXT: .cfi_def_cfa s0, 0
+; RV32VLS-NEXT: addi sp, sp, -256
+; RV32VLS-NEXT: andi sp, sp, -64
+; RV32VLS-NEXT: vl1re64.v v8, (a0)
+; RV32VLS-NEXT: addi a0, sp, 128
+; RV32VLS-NEXT: vs1r.v v8, (a0)
+; RV32VLS-NEXT: addi a0, sp, 64
+; RV32VLS-NEXT: addi a2, sp, 192
+; RV32VLS-NEXT: vl8re64.v v8, (a2)
+; RV32VLS-NEXT: vl8re64.v v16, (a0)
+; RV32VLS-NEXT: addi a0, a1, 128
+; RV32VLS-NEXT: vs8r.v v8, (a0)
+; RV32VLS-NEXT: vs8r.v v16, (a1)
+; RV32VLS-NEXT: addi sp, s0, -80
+; RV32VLS-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; RV32VLS-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; RV32VLS-NEXT: addi sp, sp, 80
+; RV32VLS-NEXT: ret
+;
+; RV64VLS-LABEL: insert_v2i64_nxv16i64_hi:
+; RV64VLS: # %bb.0:
+; RV64VLS-NEXT: addi sp, sp, -80
+; RV64VLS-NEXT: .cfi_def_cfa_offset 80
+; RV64VLS-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; RV64VLS-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; RV64VLS-NEXT: .cfi_offset ra, -8
+; RV64VLS-NEXT: .cfi_offset s0, -16
+; RV64VLS-NEXT: addi s0, sp, 80
+; RV64VLS-NEXT: .cfi_def_cfa s0, 0
+; RV64VLS-NEXT: addi sp, sp, -256
+; RV64VLS-NEXT: andi sp, sp, -64
+; RV64VLS-NEXT: vl1re64.v v8, (a0)
+; RV64VLS-NEXT: addi a0, sp, 128
+; RV64VLS-NEXT: vs1r.v v8, (a0)
+; RV64VLS-NEXT: addi a0, sp, 192
+; RV64VLS-NEXT: vl8re64.v v8, (a0)
+; RV64VLS-NEXT: addi a0, sp, 64
+; RV64VLS-NEXT: vl8re64.v v16, (a0)
+; RV64VLS-NEXT: addi a0, a1, 128
+; RV64VLS-NEXT: vs8r.v v8, (a0)
+; RV64VLS-NEXT: vs8r.v v16, (a1)
+; RV64VLS-NEXT: addi sp, s0, -80
+; RV64VLS-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; RV64VLS-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; RV64VLS-NEXT: addi sp, sp, 80
+; RV64VLS-NEXT: ret
%sv = load <2 x i64>, ptr %psv
%v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 8)
store <vscale x 16 x i64> %v, ptr %out
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
index 6ef5aa846d6d..ce8827fe4753 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
@@ -1,6 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLA %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLA %s
+
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s
define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: concat_2xv4i32:
@@ -128,31 +131,51 @@ define <16 x i32> @concat_8xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x
}
define <32 x i32> @concat_2xv16i32(<16 x i32> %a, <16 x i32> %b) {
-; CHECK-LABEL: concat_2xv16i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmv4r.v v16, v12
-; CHECK-NEXT: li a0, 32
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v16, 16
-; CHECK-NEXT: ret
+; VLA-LABEL: concat_2xv16i32:
+; VLA: # %bb.0:
+; VLA-NEXT: vmv4r.v v16, v12
+; VLA-NEXT: li a0, 32
+; VLA-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; VLA-NEXT: vslideup.vi v8, v16, 16
+; VLA-NEXT: ret
+;
+; VLS-LABEL: concat_2xv16i32:
+; VLS: # %bb.0:
+; VLS-NEXT: vmv4r.v v16, v12
+; VLS-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; VLS-NEXT: vslideup.vi v8, v16, 16
+; VLS-NEXT: ret
%ab = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <32 x i32> %ab
}
define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
-; CHECK-LABEL: concat_4xv8i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmv2r.v v16, v14
-; CHECK-NEXT: vmv2r.v v24, v12
-; CHECK-NEXT: vmv2r.v v0, v10
-; CHECK-NEXT: vsetivli zero, 16, e32, m8, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v0, 8
-; CHECK-NEXT: vsetivli zero, 24, e32, m8, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v24, 16
-; CHECK-NEXT: li a0, 32
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v16, 24
-; CHECK-NEXT: ret
+; VLA-LABEL: concat_4xv8i32:
+; VLA: # %bb.0:
+; VLA-NEXT: vmv2r.v v16, v14
+; VLA-NEXT: vmv2r.v v24, v12
+; VLA-NEXT: vmv2r.v v0, v10
+; VLA-NEXT: vsetivli zero, 16, e32, m8, tu, ma
+; VLA-NEXT: vslideup.vi v8, v0, 8
+; VLA-NEXT: vsetivli zero, 24, e32, m8, tu, ma
+; VLA-NEXT: vslideup.vi v8, v24, 16
+; VLA-NEXT: li a0, 32
+; VLA-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; VLA-NEXT: vslideup.vi v8, v16, 24
+; VLA-NEXT: ret
+;
+; VLS-LABEL: concat_4xv8i32:
+; VLS: # %bb.0:
+; VLS-NEXT: vmv2r.v v16, v14
+; VLS-NEXT: vmv2r.v v24, v12
+; VLS-NEXT: vmv2r.v v0, v10
+; VLS-NEXT: vsetivli zero, 16, e32, m8, tu, ma
+; VLS-NEXT: vslideup.vi v8, v0, 8
+; VLS-NEXT: vsetivli zero, 24, e32, m8, tu, ma
+; VLS-NEXT: vslideup.vi v8, v24, 16
+; VLS-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; VLS-NEXT: vslideup.vi v8, v16, 24
+; VLS-NEXT: ret
%ab = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%cd = shufflevector <8 x i32> %c, <8 x i32> %d, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%abcd = shufflevector <16 x i32> %ab, <16 x i32> %cd, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -160,82 +183,128 @@ define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x
}
define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h) {
-; CHECK-LABEL: concat_8xv4i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
-; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT: vmv1r.v v16, v15
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 0
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a1, a1, a0
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v16, v14
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v16, v13
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v16, v12
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vmv1r.v v0, v11
-; CHECK-NEXT: vmv1r.v v24, v10
-; CHECK-NEXT: vmv1r.v v16, v9
-; CHECK-NEXT: vsetivli zero, 8, e32, m8, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v16, 4
-; CHECK-NEXT: vsetivli zero, 12, e32, m8, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v24, 8
-; CHECK-NEXT: vsetivli zero, 16, e32, m8, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v0, 12
-; CHECK-NEXT: vsetivli zero, 20, e32, m8, tu, ma
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vslideup.vi v8, v16, 16
-; CHECK-NEXT: vsetivli zero, 24, e32, m8, tu, ma
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vslideup.vi v8, v16, 20
-; CHECK-NEXT: vsetivli zero, 28, e32, m8, tu, ma
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vslideup.vi v8, v16, 24
-; CHECK-NEXT: li a0, 32
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 0
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a1, a1, a0
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vslideup.vi v8, v16, 28
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
+; VLA-LABEL: concat_8xv4i32:
+; VLA: # %bb.0:
+; VLA-NEXT: addi sp, sp, -16
+; VLA-NEXT: .cfi_def_cfa_offset 16
+; VLA-NEXT: csrr a0, vlenb
+; VLA-NEXT: slli a0, a0, 5
+; VLA-NEXT: sub sp, sp, a0
+; VLA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; VLA-NEXT: vmv1r.v v16, v15
+; VLA-NEXT: csrr a0, vlenb
+; VLA-NEXT: li a1, 0
+; VLA-NEXT: slli a0, a0, 3
+; VLA-NEXT: add a1, a1, a0
+; VLA-NEXT: slli a0, a0, 1
+; VLA-NEXT: add a0, a0, a1
+; VLA-NEXT: add a0, sp, a0
+; VLA-NEXT: addi a0, a0, 16
+; VLA-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLA-NEXT: vmv1r.v v16, v14
+; VLA-NEXT: csrr a0, vlenb
+; VLA-NEXT: slli a0, a0, 4
+; VLA-NEXT: add a0, sp, a0
+; VLA-NEXT: addi a0, a0, 16
+; VLA-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLA-NEXT: vmv1r.v v16, v13
+; VLA-NEXT: csrr a0, vlenb
+; VLA-NEXT: slli a0, a0, 3
+; VLA-NEXT: add a0, sp, a0
+; VLA-NEXT: addi a0, a0, 16
+; VLA-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLA-NEXT: vmv1r.v v16, v12
+; VLA-NEXT: addi a0, sp, 16
+; VLA-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLA-NEXT: vmv1r.v v0, v11
+; VLA-NEXT: vmv1r.v v24, v10
+; VLA-NEXT: vmv1r.v v16, v9
+; VLA-NEXT: vsetivli zero, 8, e32, m8, tu, ma
+; VLA-NEXT: vslideup.vi v8, v16, 4
+; VLA-NEXT: vsetivli zero, 12, e32, m8, tu, ma
+; VLA-NEXT: vslideup.vi v8, v24, 8
+; VLA-NEXT: vsetivli zero, 16, e32, m8, tu, ma
+; VLA-NEXT: vslideup.vi v8, v0, 12
+; VLA-NEXT: vsetivli zero, 20, e32, m8, tu, ma
+; VLA-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLA-NEXT: vslideup.vi v8, v16, 16
+; VLA-NEXT: vsetivli zero, 24, e32, m8, tu, ma
+; VLA-NEXT: csrr a0, vlenb
+; VLA-NEXT: slli a0, a0, 3
+; VLA-NEXT: add a0, sp, a0
+; VLA-NEXT: addi a0, a0, 16
+; VLA-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLA-NEXT: vslideup.vi v8, v16, 20
+; VLA-NEXT: vsetivli zero, 28, e32, m8, tu, ma
+; VLA-NEXT: csrr a0, vlenb
+; VLA-NEXT: slli a0, a0, 4
+; VLA-NEXT: add a0, sp, a0
+; VLA-NEXT: addi a0, a0, 16
+; VLA-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLA-NEXT: vslideup.vi v8, v16, 24
+; VLA-NEXT: li a0, 32
+; VLA-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; VLA-NEXT: csrr a0, vlenb
+; VLA-NEXT: li a1, 0
+; VLA-NEXT: slli a0, a0, 3
+; VLA-NEXT: add a1, a1, a0
+; VLA-NEXT: slli a0, a0, 1
+; VLA-NEXT: add a0, a0, a1
+; VLA-NEXT: add a0, sp, a0
+; VLA-NEXT: addi a0, a0, 16
+; VLA-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLA-NEXT: vslideup.vi v8, v16, 28
+; VLA-NEXT: csrr a0, vlenb
+; VLA-NEXT: slli a0, a0, 5
+; VLA-NEXT: add sp, sp, a0
+; VLA-NEXT: addi sp, sp, 16
+; VLA-NEXT: ret
+;
+; VLS-LABEL: concat_8xv4i32:
+; VLS: # %bb.0:
+; VLS-NEXT: addi sp, sp, -16
+; VLS-NEXT: .cfi_def_cfa_offset 16
+; VLS-NEXT: addi sp, sp, -512
+; VLS-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; VLS-NEXT: vmv1r.v v16, v15
+; VLS-NEXT: addi a0, sp, 400
+; VLS-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLS-NEXT: vmv1r.v v16, v14
+; VLS-NEXT: addi a0, sp, 272
+; VLS-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLS-NEXT: vmv1r.v v16, v13
+; VLS-NEXT: addi a0, sp, 144
+; VLS-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLS-NEXT: vmv1r.v v16, v12
+; VLS-NEXT: addi a0, sp, 16
+; VLS-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLS-NEXT: vmv1r.v v0, v11
+; VLS-NEXT: vmv1r.v v24, v10
+; VLS-NEXT: vmv1r.v v16, v9
+; VLS-NEXT: vsetivli zero, 8, e32, m8, tu, ma
+; VLS-NEXT: vslideup.vi v8, v16, 4
+; VLS-NEXT: vsetivli zero, 12, e32, m8, tu, ma
+; VLS-NEXT: vslideup.vi v8, v24, 8
+; VLS-NEXT: vsetivli zero, 16, e32, m8, tu, ma
+; VLS-NEXT: vslideup.vi v8, v0, 12
+; VLS-NEXT: vsetivli zero, 20, e32, m8, tu, ma
+; VLS-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLS-NEXT: vslideup.vi v8, v16, 16
+; VLS-NEXT: vsetivli zero, 24, e32, m8, tu, ma
+; VLS-NEXT: addi a0, sp, 144
+; VLS-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLS-NEXT: vslideup.vi v8, v16, 20
+; VLS-NEXT: vsetivli zero, 28, e32, m8, tu, ma
+; VLS-NEXT: addi a0, sp, 272
+; VLS-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLS-NEXT: vslideup.vi v8, v16, 24
+; VLS-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; VLS-NEXT: addi a0, sp, 400
+; VLS-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLS-NEXT: vslideup.vi v8, v16, 28
+; VLS-NEXT: addi sp, sp, 512
+; VLS-NEXT: addi sp, sp, 16
+; VLS-NEXT: ret
%ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%abcd = shufflevector <8 x i32> %ab, <8 x i32> %cd, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index 208c881294ee..48ce7d623475 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -4827,10 +4827,10 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
; CHECK-NOV-NEXT: mv s0, a0
; CHECK-NOV-NEXT: fmv.w.x fa0, a1
; CHECK-NOV-NEXT: call __extendhfsf2
-; CHECK-NOV-NEXT: fmv.s fs5, fa0
+; CHECK-NOV-NEXT: fmv.s fs6, fa0
; CHECK-NOV-NEXT: fmv.w.x fa0, s7
; CHECK-NOV-NEXT: call __extendhfsf2
-; CHECK-NOV-NEXT: fmv.s fs6, fa0
+; CHECK-NOV-NEXT: fmv.s fs5, fa0
; CHECK-NOV-NEXT: fmv.w.x fa0, s6
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fmv.s fs4, fa0
@@ -4846,54 +4846,36 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
; CHECK-NOV-NEXT: fmv.w.x fa0, s2
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fmv.s fs0, fa0
-; CHECK-NOV-NEXT: fcvt.lu.s s2, fs6, rtz
-; CHECK-NOV-NEXT: fcvt.lu.s a0, fs5, rtz
; CHECK-NOV-NEXT: fmv.w.x fa0, s1
-; CHECK-NOV-NEXT: sext.w s1, a0
+; CHECK-NOV-NEXT: fcvt.lu.s s1, fs6, rtz
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fcvt.lu.s a0, fa0, rtz
-; CHECK-NOV-NEXT: sext.w a0, a0
; CHECK-NOV-NEXT: lui a1, 16
; CHECK-NOV-NEXT: addiw a1, a1, -1
-; CHECK-NOV-NEXT: bltu a0, a1, .LBB43_2
+; CHECK-NOV-NEXT: bgeu a0, a1, .LBB43_10
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: mv a0, a1
+; CHECK-NOV-NEXT: fcvt.lu.s a2, fs5, rtz
+; CHECK-NOV-NEXT: bgeu s1, a1, .LBB43_11
; CHECK-NOV-NEXT: .LBB43_2: # %entry
; CHECK-NOV-NEXT: fcvt.lu.s a3, fs4, rtz
-; CHECK-NOV-NEXT: sext.w a2, s2
-; CHECK-NOV-NEXT: bltu s1, a1, .LBB43_4
-; CHECK-NOV-NEXT: # %bb.3: # %entry
-; CHECK-NOV-NEXT: mv s1, a1
-; CHECK-NOV-NEXT: .LBB43_4: # %entry
+; CHECK-NOV-NEXT: bgeu a2, a1, .LBB43_12
+; CHECK-NOV-NEXT: .LBB43_3: # %entry
; CHECK-NOV-NEXT: fcvt.lu.s a4, fs3, rtz
-; CHECK-NOV-NEXT: sext.w a3, a3
-; CHECK-NOV-NEXT: bltu a2, a1, .LBB43_6
-; CHECK-NOV-NEXT: # %bb.5: # %entry
-; CHECK-NOV-NEXT: mv a2, a1
-; CHECK-NOV-NEXT: .LBB43_6: # %entry
+; CHECK-NOV-NEXT: bgeu a3, a1, .LBB43_13
+; CHECK-NOV-NEXT: .LBB43_4: # %entry
; CHECK-NOV-NEXT: fcvt.lu.s a5, fs2, rtz
-; CHECK-NOV-NEXT: sext.w a4, a4
-; CHECK-NOV-NEXT: bltu a3, a1, .LBB43_8
-; CHECK-NOV-NEXT: # %bb.7: # %entry
-; CHECK-NOV-NEXT: mv a3, a1
-; CHECK-NOV-NEXT: .LBB43_8: # %entry
+; CHECK-NOV-NEXT: bgeu a4, a1, .LBB43_14
+; CHECK-NOV-NEXT: .LBB43_5: # %entry
; CHECK-NOV-NEXT: fcvt.lu.s a6, fs1, rtz
-; CHECK-NOV-NEXT: sext.w a5, a5
-; CHECK-NOV-NEXT: bltu a4, a1, .LBB43_10
-; CHECK-NOV-NEXT: # %bb.9: # %entry
-; CHECK-NOV-NEXT: mv a4, a1
-; CHECK-NOV-NEXT: .LBB43_10: # %entry
-; CHECK-NOV-NEXT: fcvt.lu.s a7, fs0, rtz
-; CHECK-NOV-NEXT: sext.w a6, a6
; CHECK-NOV-NEXT: bgeu a5, a1, .LBB43_15
-; CHECK-NOV-NEXT: # %bb.11: # %entry
-; CHECK-NOV-NEXT: sext.w a7, a7
+; CHECK-NOV-NEXT: .LBB43_6: # %entry
+; CHECK-NOV-NEXT: fcvt.lu.s a7, fs0, rtz
; CHECK-NOV-NEXT: bgeu a6, a1, .LBB43_16
-; CHECK-NOV-NEXT: .LBB43_12: # %entry
-; CHECK-NOV-NEXT: bltu a7, a1, .LBB43_14
-; CHECK-NOV-NEXT: .LBB43_13: # %entry
+; CHECK-NOV-NEXT: .LBB43_7: # %entry
+; CHECK-NOV-NEXT: bltu a7, a1, .LBB43_9
+; CHECK-NOV-NEXT: .LBB43_8: # %entry
; CHECK-NOV-NEXT: mv a7, a1
-; CHECK-NOV-NEXT: .LBB43_14: # %entry
+; CHECK-NOV-NEXT: .LBB43_9: # %entry
; CHECK-NOV-NEXT: sh a7, 14(s0)
; CHECK-NOV-NEXT: sh a6, 12(s0)
; CHECK-NOV-NEXT: sh a5, 10(s0)
@@ -4920,14 +4902,34 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
; CHECK-NOV-NEXT: fld fs6, 0(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: addi sp, sp, 128
; CHECK-NOV-NEXT: ret
+; CHECK-NOV-NEXT: .LBB43_10: # %entry
+; CHECK-NOV-NEXT: mv a0, a1
+; CHECK-NOV-NEXT: fcvt.lu.s a2, fs5, rtz
+; CHECK-NOV-NEXT: bltu s1, a1, .LBB43_2
+; CHECK-NOV-NEXT: .LBB43_11: # %entry
+; CHECK-NOV-NEXT: mv s1, a1
+; CHECK-NOV-NEXT: fcvt.lu.s a3, fs4, rtz
+; CHECK-NOV-NEXT: bltu a2, a1, .LBB43_3
+; CHECK-NOV-NEXT: .LBB43_12: # %entry
+; CHECK-NOV-NEXT: mv a2, a1
+; CHECK-NOV-NEXT: fcvt.lu.s a4, fs3, rtz
+; CHECK-NOV-NEXT: bltu a3, a1, .LBB43_4
+; CHECK-NOV-NEXT: .LBB43_13: # %entry
+; CHECK-NOV-NEXT: mv a3, a1
+; CHECK-NOV-NEXT: fcvt.lu.s a5, fs2, rtz
+; CHECK-NOV-NEXT: bltu a4, a1, .LBB43_5
+; CHECK-NOV-NEXT: .LBB43_14: # %entry
+; CHECK-NOV-NEXT: mv a4, a1
+; CHECK-NOV-NEXT: fcvt.lu.s a6, fs1, rtz
+; CHECK-NOV-NEXT: bltu a5, a1, .LBB43_6
; CHECK-NOV-NEXT: .LBB43_15: # %entry
; CHECK-NOV-NEXT: mv a5, a1
-; CHECK-NOV-NEXT: sext.w a7, a7
-; CHECK-NOV-NEXT: bltu a6, a1, .LBB43_12
+; CHECK-NOV-NEXT: fcvt.lu.s a7, fs0, rtz
+; CHECK-NOV-NEXT: bltu a6, a1, .LBB43_7
; CHECK-NOV-NEXT: .LBB43_16: # %entry
; CHECK-NOV-NEXT: mv a6, a1
-; CHECK-NOV-NEXT: bgeu a7, a1, .LBB43_13
-; CHECK-NOV-NEXT: j .LBB43_14
+; CHECK-NOV-NEXT: bgeu a7, a1, .LBB43_8
+; CHECK-NOV-NEXT: j .LBB43_9
;
; CHECK-V-LABEL: utesth_f16i16_mm:
; CHECK-V: # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll b/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll
index 1d42b6e3937c..bd0fecd28551 100644
--- a/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll
@@ -52,47 +52,27 @@ define i32 @vector_length_i16_fixed(i16 zeroext %tc) {
}
define i32 @vector_length_i32_fixed(i32 zeroext %tc) {
-; RV32-LABEL: vector_length_i32_fixed:
-; RV32: # %bb.0:
-; RV32-NEXT: li a1, 2
-; RV32-NEXT: bltu a0, a1, .LBB4_2
-; RV32-NEXT: # %bb.1:
-; RV32-NEXT: li a0, 2
-; RV32-NEXT: .LBB4_2:
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vector_length_i32_fixed:
-; RV64: # %bb.0:
-; RV64-NEXT: sext.w a0, a0
-; RV64-NEXT: li a1, 2
-; RV64-NEXT: bltu a0, a1, .LBB4_2
-; RV64-NEXT: # %bb.1:
-; RV64-NEXT: li a0, 2
-; RV64-NEXT: .LBB4_2:
-; RV64-NEXT: ret
+; CHECK-LABEL: vector_length_i32_fixed:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 2
+; CHECK-NEXT: bltu a0, a1, .LBB4_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 2
+; CHECK-NEXT: .LBB4_2:
+; CHECK-NEXT: ret
%a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 2, i1 false)
ret i32 %a
}
define i32 @vector_length_XLen_fixed(iXLen zeroext %tc) {
-; RV32-LABEL: vector_length_XLen_fixed:
-; RV32: # %bb.0:
-; RV32-NEXT: li a1, 2
-; RV32-NEXT: bltu a0, a1, .LBB5_2
-; RV32-NEXT: # %bb.1:
-; RV32-NEXT: li a0, 2
-; RV32-NEXT: .LBB5_2:
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vector_length_XLen_fixed:
-; RV64: # %bb.0:
-; RV64-NEXT: sext.w a0, a0
-; RV64-NEXT: li a1, 2
-; RV64-NEXT: bltu a0, a1, .LBB5_2
-; RV64-NEXT: # %bb.1:
-; RV64-NEXT: li a0, 2
-; RV64-NEXT: .LBB5_2:
-; RV64-NEXT: ret
+; CHECK-LABEL: vector_length_XLen_fixed:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 2
+; CHECK-NEXT: bltu a0, a1, .LBB5_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a0, 2
+; CHECK-NEXT: .LBB5_2:
+; CHECK-NEXT: ret
%a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 2, i1 false)
ret i32 %a
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
index 0f3f57a0dec5..b15896580d42 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
@@ -76,7 +76,7 @@ define <vscale x 4 x i8> @insert_nxv1i8_nxv4i8_3(<vscale x 4 x i8> %vec, <vscale
; CHECK-NEXT: slli a1, a0, 1
; CHECK-NEXT: add a1, a1, a0
; CHECK-NEXT: add a0, a1, a0
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a1
; CHECK-NEXT: ret
%v = call <vscale x 4 x i8> @llvm.vector.insert.nxv1i8.nxv4i8(<vscale x 4 x i8> %vec, <vscale x 1 x i8> %subvec, i64 3)
@@ -227,7 +227,7 @@ define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_1(<vscale x 16 x i32> %vec,
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 3
; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, ma
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a0
; CHECK-NEXT: ret
%v = call <vscale x 16 x i32> @llvm.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 1)
@@ -306,7 +306,7 @@ define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_7(<vscale x 16 x i8> %vec, <vsc
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a1, a0, 3
; CHECK-NEXT: sub a1, a0, a1
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v10, a1
; CHECK-NEXT: ret
%v = call <vscale x 16 x i8> @llvm.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 7)
@@ -319,7 +319,7 @@ define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_15(<vscale x 16 x i8> %vec, <vs
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a1, a0, 3
; CHECK-NEXT: sub a1, a0, a1
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-NEXT: vslideup.vx v9, v10, a1
; CHECK-NEXT: ret
%v = call <vscale x 16 x i8> @llvm.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 15)
@@ -344,7 +344,7 @@ define <vscale x 32 x half> @insert_nxv32f16_nxv2f16_2(<vscale x 32 x half> %vec
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 2
; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a0
; CHECK-NEXT: ret
%v = call <vscale x 32 x half> @llvm.vector.insert.nxv2f16.nxv32f16(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec, i64 2)
@@ -357,7 +357,7 @@ define <vscale x 32 x half> @insert_nxv32f16_nxv2f16_26(<vscale x 32 x half> %ve
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 2
; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v14, v16, a0
; CHECK-NEXT: ret
%v = call <vscale x 32 x half> @llvm.vector.insert.nxv2f16.nxv32f16(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec, i64 26)
diff --git a/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll b/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll
index df944fada796..c26532d35595 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll
@@ -58,7 +58,7 @@ define void @strided_store_zero_start(i64 %n, ptr %p) {
; RV64-NEXT: ret
%step = tail call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
%gep = getelementptr inbounds %struct, ptr %p, <vscale x 1 x i64> %step, i32 6
- tail call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x ptr> %gep, i32 8, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
+ tail call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x ptr> %gep, i32 8, <vscale x 1 x i1> splat (i1 true))
ret void
}
@@ -93,7 +93,7 @@ define void @strided_store_offset_start(i64 %n, ptr %p) {
%.splat = shufflevector <vscale x 1 x i64> %.splatinsert, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
%add = add <vscale x 1 x i64> %step, %.splat
%gep = getelementptr inbounds %struct, ptr %p, <vscale x 1 x i64> %add, i32 6
- tail call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x ptr> %gep, i32 8, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
+ tail call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x ptr> %gep, i32 8, <vscale x 1 x i1> splat (i1 true))
ret void
}
@@ -118,7 +118,7 @@ define void @stride_one_store(i64 %n, ptr %p) {
; RV64-NEXT: ret
%step = tail call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
%gep = getelementptr inbounds i64, ptr %p, <vscale x 1 x i64> %step
- tail call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x ptr> %gep, i32 8, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
+ tail call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x ptr> %gep, i32 8, <vscale x 1 x i1> splat (i1 true))
ret void
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr61561.ll b/llvm/test/CodeGen/RISCV/rvv/pr61561.ll
index f27edd361166..c5fd6943e51b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr61561.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr61561.ll
@@ -23,11 +23,11 @@ define <vscale x 4 x i8> @foo(ptr %p) {
; CHECK-NEXT: ret
%i13 = load <vscale x 4 x i16>, ptr %p, align 2
%i14 = zext <vscale x 4 x i16> %i13 to <vscale x 4 x i32>
- %i15 = shl nuw nsw <vscale x 4 x i32> %i14, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
- %i16 = and <vscale x 4 x i32> %i15, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 248, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
- %i17 = mul nuw nsw <vscale x 4 x i32> %i16, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3735, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
- %i18 = add nuw nsw <vscale x 4 x i32> %i17, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 16384, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
- %i21 = lshr <vscale x 4 x i32> %i18, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 15, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %i15 = shl nuw nsw <vscale x 4 x i32> %i14, splat (i32 3)
+ %i16 = and <vscale x 4 x i32> %i15, splat (i32 248)
+ %i17 = mul nuw nsw <vscale x 4 x i32> %i16, splat (i32 3735)
+ %i18 = add nuw nsw <vscale x 4 x i32> %i17, splat (i32 16384)
+ %i21 = lshr <vscale x 4 x i32> %i18, splat (i32 15)
%i22 = trunc <vscale x 4 x i32> %i21 to <vscale x 4 x i8>
ret <vscale x 4 x i8> %i22
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr63459.ll b/llvm/test/CodeGen/RISCV/rvv/pr63459.ll
index c871e2992a5e..5ef8e18bb264 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr63459.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr63459.ll
@@ -14,7 +14,7 @@ define void @snork(ptr %arg, <vscale x 2 x i64> %arg1) {
; CHECK-NEXT: ret
bb:
%getelementptr = getelementptr inbounds <vscale x 2 x i32>, ptr %arg, <vscale x 2 x i64> %arg1
- tail call void @llvm.vp.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x ptr> align 4 %getelementptr, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 4)
+ tail call void @llvm.vp.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> splat (i32 1), <vscale x 2 x ptr> align 4 %getelementptr, <vscale x 2 x i1> splat (i1 true), i32 4)
ret void
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
index 243dc19a2558..1ef63ffa9ee0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
@@ -2235,9 +2235,9 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; ZVFH-NEXT: vmfeq.vv v16, v8, v24, v0.t
; ZVFH-NEXT: add a0, a1, a1
-; ZVFH-NEXT: vsetvli zero, a0, e8, m1, tu, ma
+; ZVFH-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; ZVFH-NEXT: vslideup.vx v16, v1, a1
-; ZVFH-NEXT: vmv1r.v v0, v16
+; ZVFH-NEXT: vmv.v.v v0, v16
; ZVFH-NEXT: csrr a0, vlenb
; ZVFH-NEXT: slli a0, a0, 4
; ZVFH-NEXT: add sp, sp, a0
@@ -2337,7 +2337,7 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
; ZVFHMIN-NEXT: # %bb.3:
; ZVFHMIN-NEXT: mv a2, a5
; ZVFHMIN-NEXT: .LBB85_4:
-; ZVFHMIN-NEXT: vsetvli zero, a0, e8, mf2, tu, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslideup.vx v2, v26, a3
; ZVFHMIN-NEXT: sub a5, a2, a4
; ZVFHMIN-NEXT: sltu a6, a2, a5
@@ -2395,12 +2395,12 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; ZVFHMIN-NEXT: vmv1r.v v0, v1
; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT: vsetvli zero, a0, e8, mf2, tu, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslideup.vx v8, v3, a3
; ZVFHMIN-NEXT: add a0, a1, a1
-; ZVFHMIN-NEXT: vsetvli zero, a0, e8, m1, tu, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; ZVFHMIN-NEXT: vslideup.vx v8, v2, a1
-; ZVFHMIN-NEXT: vmv1r.v v0, v8
+; ZVFHMIN-NEXT: vmv.v.v v0, v8
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: li a1, 34
; ZVFHMIN-NEXT: mul a0, a0, a1
@@ -3637,7 +3637,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
; CHECK-NEXT: slli a0, a1, 1
; CHECK-NEXT: add a0, a0, a1
; CHECK-NEXT: add a1, a0, a1
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
; CHECK-NEXT: vslideup.vx v17, v16, a0
; CHECK-NEXT: vmv1r.v v0, v17
; CHECK-NEXT: csrr a0, vlenb
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll
index e77966d8c43b..aee255196ce2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll
@@ -3387,7 +3387,7 @@ define <vscale x 16 x i1> @fcmp_oeq_vf_nx16f64(<vscale x 16 x double> %va) {
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: srli a0, a0, 3
; RV32-NEXT: add a1, a0, a0
-; RV32-NEXT: vsetvli zero, a1, e8, mf4, tu, ma
+; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
; RV32-NEXT: vslideup.vx v0, v24, a0
; RV32-NEXT: ret
;
@@ -3400,7 +3400,7 @@ define <vscale x 16 x i1> @fcmp_oeq_vf_nx16f64(<vscale x 16 x double> %va) {
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: srli a0, a0, 3
; RV64-NEXT: add a1, a0, a0
-; RV64-NEXT: vsetvli zero, a1, e8, mf4, tu, ma
+; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
; RV64-NEXT: vslideup.vx v0, v24, a0
; RV64-NEXT: ret
;
@@ -3413,7 +3413,7 @@ define <vscale x 16 x i1> @fcmp_oeq_vf_nx16f64(<vscale x 16 x double> %va) {
; ZVFHMIN32-NEXT: csrr a0, vlenb
; ZVFHMIN32-NEXT: srli a0, a0, 3
; ZVFHMIN32-NEXT: add a1, a0, a0
-; ZVFHMIN32-NEXT: vsetvli zero, a1, e8, mf4, tu, ma
+; ZVFHMIN32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
; ZVFHMIN32-NEXT: vslideup.vx v0, v24, a0
; ZVFHMIN32-NEXT: ret
;
@@ -3426,7 +3426,7 @@ define <vscale x 16 x i1> @fcmp_oeq_vf_nx16f64(<vscale x 16 x double> %va) {
; ZVFHMIN64-NEXT: csrr a0, vlenb
; ZVFHMIN64-NEXT: srli a0, a0, 3
; ZVFHMIN64-NEXT: add a1, a0, a0
-; ZVFHMIN64-NEXT: vsetvli zero, a1, e8, mf4, tu, ma
+; ZVFHMIN64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
; ZVFHMIN64-NEXT: vslideup.vx v0, v24, a0
; ZVFHMIN64-NEXT: ret
%vc = fcmp oeq <vscale x 16 x double> %va, zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
index 007afe12b8e4..a23b7c7b6ae9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
@@ -2424,7 +2424,7 @@ define <vscale x 32 x i1> @icmp_eq_vv_nxv32i32(<vscale x 32 x i32> %va, <vscale
; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmseq.vv v16, v8, v24, v0.t
; CHECK-NEXT: add a0, a1, a1
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
; CHECK-NEXT: vslideup.vx v16, v1, a1
; CHECK-NEXT: vmv1r.v v0, v16
; CHECK-NEXT: csrr a0, vlenb
@@ -2459,7 +2459,7 @@ define <vscale x 32 x i1> @icmp_eq_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b,
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t
; CHECK-NEXT: add a0, a2, a2
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
; CHECK-NEXT: vslideup.vx v16, v25, a2
; CHECK-NEXT: vmv1r.v v0, v16
; CHECK-NEXT: ret
@@ -2492,7 +2492,7 @@ define <vscale x 32 x i1> @icmp_eq_vx_swap_nxv32i32(<vscale x 32 x i32> %va, i32
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t
; CHECK-NEXT: add a0, a2, a2
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
; CHECK-NEXT: vslideup.vx v16, v25, a2
; CHECK-NEXT: vmv1r.v v0, v16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll
index a2ac684604b9..5f35a4e50a95 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll
@@ -3235,7 +3235,7 @@ define <vscale x 16 x i1> @icmp_eq_vi_nx16i64(<vscale x 16 x i64> %va) {
; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; CHECK-NEXT: vmseq.vi v24, v16, 0
; CHECK-NEXT: vmseq.vi v0, v8, 0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, ma
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
; CHECK-NEXT: vslideup.vx v0, v24, a0
; CHECK-NEXT: ret
%vc = icmp eq <vscale x 16 x i64> %va, zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
index 2d65c9d178b7..8f02ca653581 100644
--- a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
@@ -743,7 +743,7 @@ define <vscale x 2 x i64> @hi_bits_known_zero() vscale_range(2, 4) {
; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: ret
%step = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
- %and = and <vscale x 2 x i64> %step, shufflevector(<vscale x 2 x i64> insertelement(<vscale x 2 x i64> poison, i64 u0xfffffffffffffff8, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %and = and <vscale x 2 x i64> %step, splat (i64 u0xfffffffffffffff8)
ret <vscale x 2 x i64> %and
}
@@ -758,8 +758,8 @@ define <vscale x 2 x i64> @hi_bits_known_zero_overflow() vscale_range(2, 4) {
; CHECK-NEXT: vand.vi v8, v8, -8
; CHECK-NEXT: ret
%step = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
- %step.mul = mul <vscale x 2 x i64> %step, shufflevector(<vscale x 2 x i64> insertelement(<vscale x 2 x i64> poison, i64 u0xffffffffffffffff, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
- %and = and <vscale x 2 x i64> %step.mul, shufflevector(<vscale x 2 x i64> insertelement(<vscale x 2 x i64> poison, i64 u0xfffffffffffffff8, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %step.mul = mul <vscale x 2 x i64> %step, splat (i64 u0xffffffffffffffff)
+ %and = and <vscale x 2 x i64> %step.mul, splat (i64 u0xfffffffffffffff8)
ret <vscale x 2 x i64> %and
}
@@ -771,7 +771,7 @@ define <vscale x 2 x i64> @lo_bits_known_zero() {
; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: ret
%step = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
- %step.mul = mul <vscale x 2 x i64> %step, shufflevector(<vscale x 2 x i64> insertelement(<vscale x 2 x i64> poison, i64 8, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
- %and = and <vscale x 2 x i64> %step.mul, shufflevector(<vscale x 2 x i64> insertelement(<vscale x 2 x i64> poison, i64 7, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %step.mul = mul <vscale x 2 x i64> %step, splat (i64 8)
+ %and = and <vscale x 2 x i64> %step.mul, splat (i64 7)
ret <vscale x 2 x i64> %and
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
index 54e5d39e2485..6b584cfb22a5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
@@ -16,7 +16,7 @@ define <vscale x 1 x i64> @gather(ptr %a, i32 %len) {
; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACCUM:%.*]] = phi <vscale x 1 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]], i32 3
-; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 1 x i64> @llvm.riscv.masked.strided.load.nxv1i64.p0.i64(<vscale x 1 x i64> undef, ptr [[TMP1]], i64 16, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
+; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 1 x i64> @llvm.riscv.masked.strided.load.nxv1i64.p0.i64(<vscale x 1 x i64> undef, ptr [[TMP1]], i64 16, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
; CHECK-NEXT: [[ACCUM_NEXT]] = add <vscale x 1 x i64> [[ACCUM]], [[GATHER]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP0]]
; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], [[TMP0]]
@@ -38,7 +38,7 @@ vector.body: ; preds = %vector.body, %vecto
%vec.ind = phi <vscale x 1 x i64> [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ]
%accum = phi <vscale x 1 x i64> [ zeroinitializer, %vector.ph ], [ %accum.next, %vector.body ]
%2 = getelementptr inbounds %struct.foo, ptr %a, <vscale x 1 x i64> %vec.ind, i32 3
- %gather = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> %2, i32 8, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i64> undef)
+ %gather = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> %2, i32 8, <vscale x 1 x i1> splat (i1 true), <vscale x 1 x i64> undef)
%accum.next = add <vscale x 1 x i64> %accum, %gather
%index.next = add nuw i64 %index, %0
%vec.ind.next = add <vscale x 1 x i64> %vec.ind, %.splat
@@ -59,7 +59,7 @@ define <vscale x 1 x i64> @gather_disjoint_or(ptr %a, i64 %len) {
; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 1, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACCUM:%.*]] = phi <vscale x 1 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]]
-; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 1 x i64> @llvm.riscv.masked.strided.load.nxv1i64.p0.i64(<vscale x 1 x i64> poison, ptr [[TMP0]], i64 16, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
+; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 1 x i64> @llvm.riscv.masked.strided.load.nxv1i64.p0.i64(<vscale x 1 x i64> poison, ptr [[TMP0]], i64 16, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
; CHECK-NEXT: [[ACCUM_NEXT]] = add <vscale x 1 x i64> [[ACCUM]], [[GATHER]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[VSCALE]]
; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 2
@@ -71,7 +71,7 @@ define <vscale x 1 x i64> @gather_disjoint_or(ptr %a, i64 %len) {
vector.ph:
%vscale = call i64 @llvm.vscale.i64()
%step = tail call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
- %step.mul2 = shl <vscale x 1 x i64> %step, shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+ %step.mul2 = shl <vscale x 1 x i64> %step, splat (i64 1)
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
@@ -80,19 +80,19 @@ vector.body: ; preds = %vector.body, %vecto
%accum = phi <vscale x 1 x i64> [ zeroinitializer, %vector.ph ], [ %accum.next, %vector.body ]
- %vec.ind.or = or disjoint <vscale x 1 x i64> %vec.ind, shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+ %vec.ind.or = or disjoint <vscale x 1 x i64> %vec.ind, splat (i64 1)
%gep = getelementptr i64, ptr %a, <vscale x 1 x i64> %vec.ind.or
%gather = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(
<vscale x 1 x ptr> %gep,
i32 8,
- <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer),
+ <vscale x 1 x i1> splat (i1 true),
<vscale x 1 x i64> poison
)
%accum.next = add <vscale x 1 x i64> %accum, %gather
%index.next = add nuw i64 %index, %vscale
- %vec.ind.next = add <vscale x 1 x i64> %vec.ind, shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 2, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+ %vec.ind.next = add <vscale x 1 x i64> %vec.ind, splat (i64 2)
%exit = icmp ne i64 %index.next, %len
br i1 %exit, label %for.cond.cleanup, label %vector.body
@@ -111,7 +111,7 @@ define void @scatter(ptr %a, i32 %len) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]], i32 3
-; CHECK-NEXT: call void @llvm.riscv.masked.strided.store.nxv1i64.p0.i64(<vscale x 1 x i64> zeroinitializer, ptr [[TMP1]], i64 16, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
+; CHECK-NEXT: call void @llvm.riscv.masked.strided.store.nxv1i64.p0.i64(<vscale x 1 x i64> zeroinitializer, ptr [[TMP1]], i64 16, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP0]]
; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]]
@@ -131,7 +131,7 @@ vector.body: ; preds = %vector.body, %vecto
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.ind = phi <vscale x 1 x i64> [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ]
%2 = getelementptr inbounds %struct.foo, ptr %a, <vscale x 1 x i64> %vec.ind, i32 3
- tail call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x ptr> %2, i32 8, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
+ tail call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x ptr> %2, i32 8, <vscale x 1 x i1> splat (i1 true))
%index.next = add nuw i64 %index, %0
%vec.ind.next = add <vscale x 1 x i64> %vec.ind, %.splat
%3 = icmp ne i64 %index.next, %wide.trip.count
@@ -155,7 +155,7 @@ define <vscale x 1 x i64> @gather_loopless(ptr %p, i64 %stride) {
%x = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(
<vscale x 1 x ptr> %ptrs,
i32 8,
- <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 1, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer),
+ <vscale x 1 x i1> splat (i1 1),
<vscale x 1 x i64> poison
)
ret <vscale x 1 x i64> %x
@@ -175,7 +175,7 @@ define <vscale x 1 x i64> @straightline_offset_add(ptr %p, i64 %offset) {
%x = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(
<vscale x 1 x ptr> %ptrs,
i32 8,
- <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 1, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer),
+ <vscale x 1 x i1> splat (i1 1),
<vscale x 1 x i64> poison
)
ret <vscale x 1 x i64> %x
@@ -188,13 +188,13 @@ define <vscale x 1 x i64> @straightline_offset_disjoint_or(ptr %p, i64 %offset)
; CHECK-NEXT: ret <vscale x 1 x i64> [[X]]
;
%step = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
- %step.shl = shl <vscale x 1 x i64> %step, shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
- %offsetv = or disjoint <vscale x 1 x i64> %step.shl, shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+ %step.shl = shl <vscale x 1 x i64> %step, splat (i64 1)
+ %offsetv = or disjoint <vscale x 1 x i64> %step.shl, splat (i64 1)
%ptrs = getelementptr i32, ptr %p, <vscale x 1 x i64> %offsetv
%x = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(
<vscale x 1 x ptr> %ptrs,
i32 8,
- <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer),
+ <vscale x 1 x i1> splat (i1 true),
<vscale x 1 x i64> poison
)
ret <vscale x 1 x i64> %x
@@ -213,7 +213,7 @@ define <vscale x 1 x i64> @straightline_offset_shl(ptr %p) {
%x = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(
<vscale x 1 x ptr> %ptrs,
i32 8,
- <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 1, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer),
+ <vscale x 1 x i1> splat (i1 1),
<vscale x 1 x i64> poison
)
ret <vscale x 1 x i64> %x
@@ -237,7 +237,7 @@ define <vscale x 1 x i64> @neg_shl_is_not_commutative(ptr %p) {
%x = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(
<vscale x 1 x ptr> %ptrs,
i32 8,
- <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 1, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer),
+ <vscale x 1 x i1> splat (i1 1),
<vscale x 1 x i64> poison
)
ret <vscale x 1 x i64> %x
@@ -258,7 +258,7 @@ define <vscale x 1 x i64> @straightline_offset_shl_nonc(ptr %p, i64 %shift) {
%x = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(
<vscale x 1 x ptr> %ptrs,
i32 8,
- <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 1, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer),
+ <vscale x 1 x i1> splat (i1 1),
<vscale x 1 x i64> poison
)
ret <vscale x 1 x i64> %x
@@ -279,7 +279,7 @@ define void @scatter_loopless(<vscale x 1 x i64> %x, ptr %p, i64 %stride) {
<vscale x 1 x i64> %x,
<vscale x 1 x ptr> %ptrs,
i32 8,
- <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 1, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
+ <vscale x 1 x i1> splat (i1 1)
)
ret void
}
@@ -296,7 +296,7 @@ define void @constant_stride(<vscale x 1 x i64> %x, ptr %p, i64 %stride) {
<vscale x 1 x i64> %x,
<vscale x 1 x ptr> %ptrs,
i32 8,
- <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 1, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
+ <vscale x 1 x i1> splat (i1 1)
)
ret void
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll
new file mode 100644
index 000000000000..a8934bb25571
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=riscv64 -mattr=+v -stop-after=finalize-isel %s -o - | FileCheck %s
+
+; Test makes sure we don't store a pointer in the MachineMemOperand created for
+; these instructions. MachineMemOperand handling can't currently deal with a
+; negative stride that would allow memory before the pointer to be read.
+
+declare <vscale x 1 x i8> @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr, i8, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i8> @strided_vpload_nxv1i8_i8(ptr %ptr, i8 signext %stride, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+ ; CHECK-LABEL: name: strided_vpload_nxv1i8_i8
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $x10, $x11, $v0, $x12
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr = COPY $v0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $x11
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr = COPY $x10
+ ; CHECK-NEXT: $v0 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[PseudoVLSE8_V_MF8_MASK:%[0-9]+]]:vrnov0 = PseudoVLSE8_V_MF8_MASK $noreg, [[COPY3]], [[COPY2]], $v0, [[COPY]], 3 /* e8 */, 1 /* ta, mu */ :: (load unknown-size, align 1)
+ ; CHECK-NEXT: $v8 = COPY [[PseudoVLSE8_V_MF8_MASK]]
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %load = call <vscale x 1 x i8> @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr %ptr, i8 %stride, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %load
+}
+
+declare void @llvm.experimental.vp.strided.store.nxv1i8.p0.i8(<vscale x 1 x i8>, ptr, i8, <vscale x 1 x i1>, i32)
+
+define void @strided_vpstore_nxv1i8_i8(<vscale x 1 x i8> %val, ptr %ptr, i8 signext %stride, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+ ; CHECK-LABEL: name: strided_vpstore_nxv1i8_i8
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $v8, $x10, $x11, $v0, $x12
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr = COPY $v0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $x11
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr = COPY $x10
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vr = COPY $v8
+ ; CHECK-NEXT: $v0 = COPY [[COPY1]]
+ ; CHECK-NEXT: PseudoVSSE8_V_MF8_MASK [[COPY4]], [[COPY3]], [[COPY2]], $v0, [[COPY]], 3 /* e8 */ :: (store unknown-size, align 1)
+ ; CHECK-NEXT: PseudoRET
+ call void @llvm.experimental.vp.strided.store.nxv1i8.p0.i8(<vscale x 1 x i8> %val, ptr %ptr, i8 %stride, <vscale x 1 x i1> %m, i32 %evl)
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll b/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll
index 0901c261af1a..f41a3ec72aed 100644
--- a/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll
@@ -79,18 +79,18 @@ start:
Cond1: ; preds = %start
%v15 = tail call <vscale x 1 x i16> @llvm.experimental.stepvector.nxv1i16()
%v17 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.nxv1i16(<vscale x 8 x i16> poison, <vscale x 1 x i16> %v15, i64 0)
- %vs12.i.i.i = add <vscale x 1 x i16> %v15, shufflevector (<vscale x 1 x i16> insertelement (<vscale x 1 x i16> poison, i16 1, i32 0), <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer)
+ %vs12.i.i.i = add <vscale x 1 x i16> %v15, splat (i16 1)
%v18 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.nxv1i16(<vscale x 8 x i16> poison, <vscale x 1 x i16> %vs12.i.i.i, i64 0)
- %vs16.i.i.i = add <vscale x 1 x i16> %v15, shufflevector (<vscale x 1 x i16> insertelement (<vscale x 1 x i16> poison, i16 3, i32 0), <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer)
+ %vs16.i.i.i = add <vscale x 1 x i16> %v15, splat (i16 3)
%v20 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.nxv1i16(<vscale x 8 x i16> poison, <vscale x 1 x i16> %vs16.i.i.i, i64 0)
br label %UseSR
Cond2: ; preds = %start
%v15.2 = tail call <vscale x 1 x i16> @llvm.experimental.stepvector.nxv1i16()
%v17.2 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.nxv1i16(<vscale x 8 x i16> poison, <vscale x 1 x i16> %v15.2, i64 1)
- %vs12.i.i.i.2 = add <vscale x 1 x i16> %v15.2, shufflevector (<vscale x 1 x i16> insertelement (<vscale x 1 x i16> poison, i16 1, i32 0), <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer)
+ %vs12.i.i.i.2 = add <vscale x 1 x i16> %v15.2, splat (i16 1)
%v18.2 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.nxv1i16(<vscale x 8 x i16> poison, <vscale x 1 x i16> %vs12.i.i.i.2, i64 1)
- %vs16.i.i.i.2 = add <vscale x 1 x i16> %v15.2, shufflevector (<vscale x 1 x i16> insertelement (<vscale x 1 x i16> poison, i16 3, i32 0), <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer)
+ %vs16.i.i.i.2 = add <vscale x 1 x i16> %v15.2, splat (i16 3)
%v20.2 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.nxv1i16(<vscale x 8 x i16> poison, <vscale x 1 x i16> %vs16.i.i.i.2, i64 1)
br label %UseSR
@@ -132,9 +132,9 @@ define internal void @SubRegLivenessUndef() {
loopIR.preheader.i.i:
%v15 = tail call <vscale x 1 x i16> @llvm.experimental.stepvector.nxv1i16()
%v17 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.nxv1i16(<vscale x 8 x i16> poison, <vscale x 1 x i16> %v15, i64 0)
- %vs12.i.i.i = add <vscale x 1 x i16> %v15, shufflevector (<vscale x 1 x i16> insertelement (<vscale x 1 x i16> poison, i16 1, i32 0), <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer)
+ %vs12.i.i.i = add <vscale x 1 x i16> %v15, splat (i16 1)
%v18 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.nxv1i16(<vscale x 8 x i16> poison, <vscale x 1 x i16> %vs12.i.i.i, i64 0)
- %vs16.i.i.i = add <vscale x 1 x i16> %v15, shufflevector (<vscale x 1 x i16> insertelement (<vscale x 1 x i16> poison, i16 3, i32 0), <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer)
+ %vs16.i.i.i = add <vscale x 1 x i16> %v15, splat (i16 3)
%v20 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.nxv1i16(<vscale x 8 x i16> poison, <vscale x 1 x i16> %vs16.i.i.i, i64 0)
br label %loopIR3.i.i
diff --git a/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll
index f076c3c621cd..95866543828f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll
@@ -20,7 +20,7 @@ define <vscale x 1 x i8> @vandn_vv_vp_nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 1 x i8> @llvm.vp.xor.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> shufflevector(<vscale x 1 x i8> insertelement(<vscale x 1 x i8> poison, i8 -1, i32 0), <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 1 x i8> @llvm.vp.xor.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> splat (i8 -1), <vscale x 1 x i1> %mask, i32 %evl)
%x = call <vscale x 1 x i8> @llvm.vp.and.nxv1i8(<vscale x 1 x i8> %not.a, <vscale x 1 x i8> %b, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i8> %x
}
@@ -38,7 +38,7 @@ define <vscale x 1 x i8> @vandn_vv_vp_swapped_nxv1i8(<vscale x 1 x i8> %a, <vsca
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 1 x i8> @llvm.vp.xor.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> shufflevector(<vscale x 1 x i8> insertelement(<vscale x 1 x i8> poison, i8 -1, i32 0), <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 1 x i8> @llvm.vp.xor.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> splat (i8 -1), <vscale x 1 x i1> %mask, i32 %evl)
%x = call <vscale x 1 x i8> @llvm.vp.and.nxv1i8(<vscale x 1 x i8> %b, <vscale x 1 x i8> %not.a, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i8> %x
}
@@ -79,7 +79,7 @@ define <vscale x 2 x i8> @vandn_vv_vp_nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 2 x i8> @llvm.vp.xor.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> shufflevector(<vscale x 2 x i8> insertelement(<vscale x 2 x i8> poison, i8 -1, i32 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 2 x i8> @llvm.vp.xor.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> splat (i8 -1), <vscale x 2 x i1> %mask, i32 %evl)
%x = call <vscale x 2 x i8> @llvm.vp.and.nxv2i8(<vscale x 2 x i8> %not.a, <vscale x 2 x i8> %b, <vscale x 2 x i1> %mask, i32 %evl)
ret <vscale x 2 x i8> %x
}
@@ -97,7 +97,7 @@ define <vscale x 2 x i8> @vandn_vv_vp_swapped_nxv2i8(<vscale x 2 x i8> %a, <vsca
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 2 x i8> @llvm.vp.xor.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> shufflevector(<vscale x 2 x i8> insertelement(<vscale x 2 x i8> poison, i8 -1, i32 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 2 x i8> @llvm.vp.xor.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> splat (i8 -1), <vscale x 2 x i1> %mask, i32 %evl)
%x = call <vscale x 2 x i8> @llvm.vp.and.nxv2i8(<vscale x 2 x i8> %b, <vscale x 2 x i8> %not.a, <vscale x 2 x i1> %mask, i32 %evl)
ret <vscale x 2 x i8> %x
}
@@ -138,7 +138,7 @@ define <vscale x 4 x i8> @vandn_vv_vp_nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 4 x i8> @llvm.vp.xor.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> shufflevector(<vscale x 4 x i8> insertelement(<vscale x 4 x i8> poison, i8 -1, i32 0), <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 4 x i8> @llvm.vp.xor.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> splat (i8 -1), <vscale x 4 x i1> %mask, i32 %evl)
%x = call <vscale x 4 x i8> @llvm.vp.and.nxv4i8(<vscale x 4 x i8> %not.a, <vscale x 4 x i8> %b, <vscale x 4 x i1> %mask, i32 %evl)
ret <vscale x 4 x i8> %x
}
@@ -156,7 +156,7 @@ define <vscale x 4 x i8> @vandn_vv_vp_swapped_nxv4i8(<vscale x 4 x i8> %a, <vsca
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 4 x i8> @llvm.vp.xor.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> shufflevector(<vscale x 4 x i8> insertelement(<vscale x 4 x i8> poison, i8 -1, i32 0), <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 4 x i8> @llvm.vp.xor.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> splat (i8 -1), <vscale x 4 x i1> %mask, i32 %evl)
%x = call <vscale x 4 x i8> @llvm.vp.and.nxv4i8(<vscale x 4 x i8> %b, <vscale x 4 x i8> %not.a, <vscale x 4 x i1> %mask, i32 %evl)
ret <vscale x 4 x i8> %x
}
@@ -197,7 +197,7 @@ define <vscale x 8 x i8> @vandn_vv_vp_nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 8 x i8> @llvm.vp.xor.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> shufflevector(<vscale x 8 x i8> insertelement(<vscale x 8 x i8> poison, i8 -1, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 8 x i8> @llvm.vp.xor.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> splat (i8 -1), <vscale x 8 x i1> %mask, i32 %evl)
%x = call <vscale x 8 x i8> @llvm.vp.and.nxv8i8(<vscale x 8 x i8> %not.a, <vscale x 8 x i8> %b, <vscale x 8 x i1> %mask, i32 %evl)
ret <vscale x 8 x i8> %x
}
@@ -215,7 +215,7 @@ define <vscale x 8 x i8> @vandn_vv_vp_swapped_nxv8i8(<vscale x 8 x i8> %a, <vsca
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 8 x i8> @llvm.vp.xor.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> shufflevector(<vscale x 8 x i8> insertelement(<vscale x 8 x i8> poison, i8 -1, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 8 x i8> @llvm.vp.xor.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> splat (i8 -1), <vscale x 8 x i1> %mask, i32 %evl)
%x = call <vscale x 8 x i8> @llvm.vp.and.nxv8i8(<vscale x 8 x i8> %b, <vscale x 8 x i8> %not.a, <vscale x 8 x i1> %mask, i32 %evl)
ret <vscale x 8 x i8> %x
}
@@ -256,7 +256,7 @@ define <vscale x 16 x i8> @vandn_vv_vp_nxv16i8(<vscale x 16 x i8> %a, <vscale x
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v10, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 16 x i8> @llvm.vp.xor.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> shufflevector(<vscale x 16 x i8> insertelement(<vscale x 16 x i8> poison, i8 -1, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 16 x i8> @llvm.vp.xor.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> splat (i8 -1), <vscale x 16 x i1> %mask, i32 %evl)
%x = call <vscale x 16 x i8> @llvm.vp.and.nxv16i8(<vscale x 16 x i8> %not.a, <vscale x 16 x i8> %b, <vscale x 16 x i1> %mask, i32 %evl)
ret <vscale x 16 x i8> %x
}
@@ -274,7 +274,7 @@ define <vscale x 16 x i8> @vandn_vv_vp_swapped_nxv16i8(<vscale x 16 x i8> %a, <v
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v10, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 16 x i8> @llvm.vp.xor.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> shufflevector(<vscale x 16 x i8> insertelement(<vscale x 16 x i8> poison, i8 -1, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 16 x i8> @llvm.vp.xor.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> splat (i8 -1), <vscale x 16 x i1> %mask, i32 %evl)
%x = call <vscale x 16 x i8> @llvm.vp.and.nxv16i8(<vscale x 16 x i8> %b, <vscale x 16 x i8> %not.a, <vscale x 16 x i1> %mask, i32 %evl)
ret <vscale x 16 x i8> %x
}
@@ -315,7 +315,7 @@ define <vscale x 32 x i8> @vandn_vv_vp_nxv32i8(<vscale x 32 x i8> %a, <vscale x
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e8, m4, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v12, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 32 x i8> @llvm.vp.xor.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> shufflevector(<vscale x 32 x i8> insertelement(<vscale x 32 x i8> poison, i8 -1, i32 0), <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer), <vscale x 32 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 32 x i8> @llvm.vp.xor.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> splat (i8 -1), <vscale x 32 x i1> %mask, i32 %evl)
%x = call <vscale x 32 x i8> @llvm.vp.and.nxv32i8(<vscale x 32 x i8> %not.a, <vscale x 32 x i8> %b, <vscale x 32 x i1> %mask, i32 %evl)
ret <vscale x 32 x i8> %x
}
@@ -333,7 +333,7 @@ define <vscale x 32 x i8> @vandn_vv_vp_swapped_nxv32i8(<vscale x 32 x i8> %a, <v
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e8, m4, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v12, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 32 x i8> @llvm.vp.xor.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> shufflevector(<vscale x 32 x i8> insertelement(<vscale x 32 x i8> poison, i8 -1, i32 0), <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer), <vscale x 32 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 32 x i8> @llvm.vp.xor.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> splat (i8 -1), <vscale x 32 x i1> %mask, i32 %evl)
%x = call <vscale x 32 x i8> @llvm.vp.and.nxv32i8(<vscale x 32 x i8> %b, <vscale x 32 x i8> %not.a, <vscale x 32 x i1> %mask, i32 %evl)
ret <vscale x 32 x i8> %x
}
@@ -374,7 +374,7 @@ define <vscale x 64 x i8> @vandn_vv_vp_nxv64i8(<vscale x 64 x i8> %a, <vscale x
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v16, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 64 x i8> @llvm.vp.xor.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> shufflevector(<vscale x 64 x i8> insertelement(<vscale x 64 x i8> poison, i8 -1, i32 0), <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer), <vscale x 64 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 64 x i8> @llvm.vp.xor.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> splat (i8 -1), <vscale x 64 x i1> %mask, i32 %evl)
%x = call <vscale x 64 x i8> @llvm.vp.and.nxv64i8(<vscale x 64 x i8> %not.a, <vscale x 64 x i8> %b, <vscale x 64 x i1> %mask, i32 %evl)
ret <vscale x 64 x i8> %x
}
@@ -392,7 +392,7 @@ define <vscale x 64 x i8> @vandn_vv_vp_swapped_nxv64i8(<vscale x 64 x i8> %a, <v
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v16, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 64 x i8> @llvm.vp.xor.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> shufflevector(<vscale x 64 x i8> insertelement(<vscale x 64 x i8> poison, i8 -1, i32 0), <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer), <vscale x 64 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 64 x i8> @llvm.vp.xor.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> splat (i8 -1), <vscale x 64 x i1> %mask, i32 %evl)
%x = call <vscale x 64 x i8> @llvm.vp.and.nxv64i8(<vscale x 64 x i8> %b, <vscale x 64 x i8> %not.a, <vscale x 64 x i1> %mask, i32 %evl)
ret <vscale x 64 x i8> %x
}
@@ -433,7 +433,7 @@ define <vscale x 1 x i16> @vandn_vv_vp_nxv1i16(<vscale x 1 x i16> %a, <vscale x
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 1 x i16> @llvm.vp.xor.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> shufflevector(<vscale x 1 x i16> insertelement(<vscale x 1 x i16> poison, i16 -1, i32 0), <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 1 x i16> @llvm.vp.xor.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> splat (i16 -1), <vscale x 1 x i1> %mask, i32 %evl)
%x = call <vscale x 1 x i16> @llvm.vp.and.nxv1i16(<vscale x 1 x i16> %not.a, <vscale x 1 x i16> %b, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i16> %x
}
@@ -451,7 +451,7 @@ define <vscale x 1 x i16> @vandn_vv_vp_swapped_nxv1i16(<vscale x 1 x i16> %a, <v
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 1 x i16> @llvm.vp.xor.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> shufflevector(<vscale x 1 x i16> insertelement(<vscale x 1 x i16> poison, i16 -1, i32 0), <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 1 x i16> @llvm.vp.xor.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> splat (i16 -1), <vscale x 1 x i1> %mask, i32 %evl)
%x = call <vscale x 1 x i16> @llvm.vp.and.nxv1i16(<vscale x 1 x i16> %b, <vscale x 1 x i16> %not.a, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i16> %x
}
@@ -492,7 +492,7 @@ define <vscale x 2 x i16> @vandn_vv_vp_nxv2i16(<vscale x 2 x i16> %a, <vscale x
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 2 x i16> @llvm.vp.xor.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> shufflevector(<vscale x 2 x i16> insertelement(<vscale x 2 x i16> poison, i16 -1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 2 x i16> @llvm.vp.xor.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> splat (i16 -1), <vscale x 2 x i1> %mask, i32 %evl)
%x = call <vscale x 2 x i16> @llvm.vp.and.nxv2i16(<vscale x 2 x i16> %not.a, <vscale x 2 x i16> %b, <vscale x 2 x i1> %mask, i32 %evl)
ret <vscale x 2 x i16> %x
}
@@ -510,7 +510,7 @@ define <vscale x 2 x i16> @vandn_vv_vp_swapped_nxv2i16(<vscale x 2 x i16> %a, <v
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 2 x i16> @llvm.vp.xor.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> shufflevector(<vscale x 2 x i16> insertelement(<vscale x 2 x i16> poison, i16 -1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 2 x i16> @llvm.vp.xor.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> splat (i16 -1), <vscale x 2 x i1> %mask, i32 %evl)
%x = call <vscale x 2 x i16> @llvm.vp.and.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> %not.a, <vscale x 2 x i1> %mask, i32 %evl)
ret <vscale x 2 x i16> %x
}
@@ -551,7 +551,7 @@ define <vscale x 4 x i16> @vandn_vv_vp_nxv4i16(<vscale x 4 x i16> %a, <vscale x
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 4 x i16> @llvm.vp.xor.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> shufflevector(<vscale x 4 x i16> insertelement(<vscale x 4 x i16> poison, i16 -1, i32 0), <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 4 x i16> @llvm.vp.xor.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> splat (i16 -1), <vscale x 4 x i1> %mask, i32 %evl)
%x = call <vscale x 4 x i16> @llvm.vp.and.nxv4i16(<vscale x 4 x i16> %not.a, <vscale x 4 x i16> %b, <vscale x 4 x i1> %mask, i32 %evl)
ret <vscale x 4 x i16> %x
}
@@ -569,7 +569,7 @@ define <vscale x 4 x i16> @vandn_vv_vp_swapped_nxv4i16(<vscale x 4 x i16> %a, <v
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 4 x i16> @llvm.vp.xor.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> shufflevector(<vscale x 4 x i16> insertelement(<vscale x 4 x i16> poison, i16 -1, i32 0), <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 4 x i16> @llvm.vp.xor.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> splat (i16 -1), <vscale x 4 x i1> %mask, i32 %evl)
%x = call <vscale x 4 x i16> @llvm.vp.and.nxv4i16(<vscale x 4 x i16> %b, <vscale x 4 x i16> %not.a, <vscale x 4 x i1> %mask, i32 %evl)
ret <vscale x 4 x i16> %x
}
@@ -610,7 +610,7 @@ define <vscale x 8 x i16> @vandn_vv_vp_nxv8i16(<vscale x 8 x i16> %a, <vscale x
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v10, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 8 x i16> @llvm.vp.xor.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> shufflevector(<vscale x 8 x i16> insertelement(<vscale x 8 x i16> poison, i16 -1, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 8 x i16> @llvm.vp.xor.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> splat (i16 -1), <vscale x 8 x i1> %mask, i32 %evl)
%x = call <vscale x 8 x i16> @llvm.vp.and.nxv8i16(<vscale x 8 x i16> %not.a, <vscale x 8 x i16> %b, <vscale x 8 x i1> %mask, i32 %evl)
ret <vscale x 8 x i16> %x
}
@@ -628,7 +628,7 @@ define <vscale x 8 x i16> @vandn_vv_vp_swapped_nxv8i16(<vscale x 8 x i16> %a, <v
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v10, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 8 x i16> @llvm.vp.xor.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> shufflevector(<vscale x 8 x i16> insertelement(<vscale x 8 x i16> poison, i16 -1, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 8 x i16> @llvm.vp.xor.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> splat (i16 -1), <vscale x 8 x i1> %mask, i32 %evl)
%x = call <vscale x 8 x i16> @llvm.vp.and.nxv8i16(<vscale x 8 x i16> %b, <vscale x 8 x i16> %not.a, <vscale x 8 x i1> %mask, i32 %evl)
ret <vscale x 8 x i16> %x
}
@@ -669,7 +669,7 @@ define <vscale x 16 x i16> @vandn_vv_vp_nxv16i16(<vscale x 16 x i16> %a, <vscale
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v12, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 16 x i16> @llvm.vp.xor.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> shufflevector(<vscale x 16 x i16> insertelement(<vscale x 16 x i16> poison, i16 -1, i32 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 16 x i16> @llvm.vp.xor.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> splat (i16 -1), <vscale x 16 x i1> %mask, i32 %evl)
%x = call <vscale x 16 x i16> @llvm.vp.and.nxv16i16(<vscale x 16 x i16> %not.a, <vscale x 16 x i16> %b, <vscale x 16 x i1> %mask, i32 %evl)
ret <vscale x 16 x i16> %x
}
@@ -687,7 +687,7 @@ define <vscale x 16 x i16> @vandn_vv_vp_swapped_nxv16i16(<vscale x 16 x i16> %a,
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v12, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 16 x i16> @llvm.vp.xor.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> shufflevector(<vscale x 16 x i16> insertelement(<vscale x 16 x i16> poison, i16 -1, i32 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 16 x i16> @llvm.vp.xor.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> splat (i16 -1), <vscale x 16 x i1> %mask, i32 %evl)
%x = call <vscale x 16 x i16> @llvm.vp.and.nxv16i16(<vscale x 16 x i16> %b, <vscale x 16 x i16> %not.a, <vscale x 16 x i1> %mask, i32 %evl)
ret <vscale x 16 x i16> %x
}
@@ -728,7 +728,7 @@ define <vscale x 32 x i16> @vandn_vv_vp_nxv32i16(<vscale x 32 x i16> %a, <vscale
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v16, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 32 x i16> @llvm.vp.xor.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> shufflevector(<vscale x 32 x i16> insertelement(<vscale x 32 x i16> poison, i16 -1, i32 0), <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer), <vscale x 32 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 32 x i16> @llvm.vp.xor.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> splat (i16 -1), <vscale x 32 x i1> %mask, i32 %evl)
%x = call <vscale x 32 x i16> @llvm.vp.and.nxv32i16(<vscale x 32 x i16> %not.a, <vscale x 32 x i16> %b, <vscale x 32 x i1> %mask, i32 %evl)
ret <vscale x 32 x i16> %x
}
@@ -746,7 +746,7 @@ define <vscale x 32 x i16> @vandn_vv_vp_swapped_nxv32i16(<vscale x 32 x i16> %a,
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v16, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 32 x i16> @llvm.vp.xor.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> shufflevector(<vscale x 32 x i16> insertelement(<vscale x 32 x i16> poison, i16 -1, i32 0), <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer), <vscale x 32 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 32 x i16> @llvm.vp.xor.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> splat (i16 -1), <vscale x 32 x i1> %mask, i32 %evl)
%x = call <vscale x 32 x i16> @llvm.vp.and.nxv32i16(<vscale x 32 x i16> %b, <vscale x 32 x i16> %not.a, <vscale x 32 x i1> %mask, i32 %evl)
ret <vscale x 32 x i16> %x
}
@@ -787,7 +787,7 @@ define <vscale x 1 x i32> @vandn_vv_vp_nxv1i32(<vscale x 1 x i32> %a, <vscale x
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 1 x i32> @llvm.vp.xor.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> shufflevector(<vscale x 1 x i32> insertelement(<vscale x 1 x i32> poison, i32 -1, i32 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 1 x i32> @llvm.vp.xor.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> splat (i32 -1), <vscale x 1 x i1> %mask, i32 %evl)
%x = call <vscale x 1 x i32> @llvm.vp.and.nxv1i32(<vscale x 1 x i32> %not.a, <vscale x 1 x i32> %b, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i32> %x
}
@@ -805,7 +805,7 @@ define <vscale x 1 x i32> @vandn_vv_vp_swapped_nxv1i32(<vscale x 1 x i32> %a, <v
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 1 x i32> @llvm.vp.xor.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> shufflevector(<vscale x 1 x i32> insertelement(<vscale x 1 x i32> poison, i32 -1, i32 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 1 x i32> @llvm.vp.xor.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> splat (i32 -1), <vscale x 1 x i1> %mask, i32 %evl)
%x = call <vscale x 1 x i32> @llvm.vp.and.nxv1i32(<vscale x 1 x i32> %b, <vscale x 1 x i32> %not.a, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i32> %x
}
@@ -846,7 +846,7 @@ define <vscale x 2 x i32> @vandn_vv_vp_nxv2i32(<vscale x 2 x i32> %a, <vscale x
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 2 x i32> @llvm.vp.xor.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> shufflevector(<vscale x 2 x i32> insertelement(<vscale x 2 x i32> poison, i32 -1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 2 x i32> @llvm.vp.xor.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> splat (i32 -1), <vscale x 2 x i1> %mask, i32 %evl)
%x = call <vscale x 2 x i32> @llvm.vp.and.nxv2i32(<vscale x 2 x i32> %not.a, <vscale x 2 x i32> %b, <vscale x 2 x i1> %mask, i32 %evl)
ret <vscale x 2 x i32> %x
}
@@ -864,7 +864,7 @@ define <vscale x 2 x i32> @vandn_vv_vp_swapped_nxv2i32(<vscale x 2 x i32> %a, <v
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 2 x i32> @llvm.vp.xor.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> shufflevector(<vscale x 2 x i32> insertelement(<vscale x 2 x i32> poison, i32 -1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 2 x i32> @llvm.vp.xor.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> splat (i32 -1), <vscale x 2 x i1> %mask, i32 %evl)
%x = call <vscale x 2 x i32> @llvm.vp.and.nxv2i32(<vscale x 2 x i32> %b, <vscale x 2 x i32> %not.a, <vscale x 2 x i1> %mask, i32 %evl)
ret <vscale x 2 x i32> %x
}
@@ -905,7 +905,7 @@ define <vscale x 4 x i32> @vandn_vv_vp_nxv4i32(<vscale x 4 x i32> %a, <vscale x
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e32, m2, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v10, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 4 x i32> @llvm.vp.xor.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> shufflevector(<vscale x 4 x i32> insertelement(<vscale x 4 x i32> poison, i32 -1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 4 x i32> @llvm.vp.xor.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 -1), <vscale x 4 x i1> %mask, i32 %evl)
%x = call <vscale x 4 x i32> @llvm.vp.and.nxv4i32(<vscale x 4 x i32> %not.a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %mask, i32 %evl)
ret <vscale x 4 x i32> %x
}
@@ -923,7 +923,7 @@ define <vscale x 4 x i32> @vandn_vv_vp_swapped_nxv4i32(<vscale x 4 x i32> %a, <v
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e32, m2, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v10, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 4 x i32> @llvm.vp.xor.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> shufflevector(<vscale x 4 x i32> insertelement(<vscale x 4 x i32> poison, i32 -1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 4 x i32> @llvm.vp.xor.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 -1), <vscale x 4 x i1> %mask, i32 %evl)
%x = call <vscale x 4 x i32> @llvm.vp.and.nxv4i32(<vscale x 4 x i32> %b, <vscale x 4 x i32> %not.a, <vscale x 4 x i1> %mask, i32 %evl)
ret <vscale x 4 x i32> %x
}
@@ -964,7 +964,7 @@ define <vscale x 8 x i32> @vandn_vv_vp_nxv8i32(<vscale x 8 x i32> %a, <vscale x
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v12, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 8 x i32> @llvm.vp.xor.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> shufflevector(<vscale x 8 x i32> insertelement(<vscale x 8 x i32> poison, i32 -1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 8 x i32> @llvm.vp.xor.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> splat (i32 -1), <vscale x 8 x i1> %mask, i32 %evl)
%x = call <vscale x 8 x i32> @llvm.vp.and.nxv8i32(<vscale x 8 x i32> %not.a, <vscale x 8 x i32> %b, <vscale x 8 x i1> %mask, i32 %evl)
ret <vscale x 8 x i32> %x
}
@@ -982,7 +982,7 @@ define <vscale x 8 x i32> @vandn_vv_vp_swapped_nxv8i32(<vscale x 8 x i32> %a, <v
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v12, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 8 x i32> @llvm.vp.xor.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> shufflevector(<vscale x 8 x i32> insertelement(<vscale x 8 x i32> poison, i32 -1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 8 x i32> @llvm.vp.xor.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> splat (i32 -1), <vscale x 8 x i1> %mask, i32 %evl)
%x = call <vscale x 8 x i32> @llvm.vp.and.nxv8i32(<vscale x 8 x i32> %b, <vscale x 8 x i32> %not.a, <vscale x 8 x i1> %mask, i32 %evl)
ret <vscale x 8 x i32> %x
}
@@ -1023,7 +1023,7 @@ define <vscale x 16 x i32> @vandn_vv_vp_nxv16i32(<vscale x 16 x i32> %a, <vscale
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v16, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 16 x i32> @llvm.vp.xor.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> shufflevector(<vscale x 16 x i32> insertelement(<vscale x 16 x i32> poison, i32 -1, i32 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 16 x i32> @llvm.vp.xor.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> splat (i32 -1), <vscale x 16 x i1> %mask, i32 %evl)
%x = call <vscale x 16 x i32> @llvm.vp.and.nxv16i32(<vscale x 16 x i32> %not.a, <vscale x 16 x i32> %b, <vscale x 16 x i1> %mask, i32 %evl)
ret <vscale x 16 x i32> %x
}
@@ -1041,7 +1041,7 @@ define <vscale x 16 x i32> @vandn_vv_vp_swapped_nxv16i32(<vscale x 16 x i32> %a,
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v16, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 16 x i32> @llvm.vp.xor.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> shufflevector(<vscale x 16 x i32> insertelement(<vscale x 16 x i32> poison, i32 -1, i32 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 16 x i32> @llvm.vp.xor.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> splat (i32 -1), <vscale x 16 x i1> %mask, i32 %evl)
%x = call <vscale x 16 x i32> @llvm.vp.and.nxv16i32(<vscale x 16 x i32> %b, <vscale x 16 x i32> %not.a, <vscale x 16 x i1> %mask, i32 %evl)
ret <vscale x 16 x i32> %x
}
@@ -1082,7 +1082,7 @@ define <vscale x 1 x i64> @vandn_vv_vp_nxv1i64(<vscale x 1 x i64> %a, <vscale x
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 1 x i64> @llvm.vp.xor.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 -1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 1 x i64> @llvm.vp.xor.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> splat (i64 -1), <vscale x 1 x i1> %mask, i32 %evl)
%x = call <vscale x 1 x i64> @llvm.vp.and.nxv1i64(<vscale x 1 x i64> %not.a, <vscale x 1 x i64> %b, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %x
}
@@ -1100,7 +1100,7 @@ define <vscale x 1 x i64> @vandn_vv_vp_swapped_nxv1i64(<vscale x 1 x i64> %a, <v
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 1 x i64> @llvm.vp.xor.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 -1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 1 x i64> @llvm.vp.xor.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> splat (i64 -1), <vscale x 1 x i1> %mask, i32 %evl)
%x = call <vscale x 1 x i64> @llvm.vp.and.nxv1i64(<vscale x 1 x i64> %b, <vscale x 1 x i64> %not.a, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %x
}
@@ -1173,7 +1173,7 @@ define <vscale x 2 x i64> @vandn_vv_vp_nxv2i64(<vscale x 2 x i64> %a, <vscale x
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v10, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 2 x i64> @llvm.vp.xor.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> shufflevector(<vscale x 2 x i64> insertelement(<vscale x 2 x i64> poison, i64 -1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 2 x i64> @llvm.vp.xor.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 -1), <vscale x 2 x i1> %mask, i32 %evl)
%x = call <vscale x 2 x i64> @llvm.vp.and.nxv2i64(<vscale x 2 x i64> %not.a, <vscale x 2 x i64> %b, <vscale x 2 x i1> %mask, i32 %evl)
ret <vscale x 2 x i64> %x
}
@@ -1191,7 +1191,7 @@ define <vscale x 2 x i64> @vandn_vv_vp_swapped_nxv2i64(<vscale x 2 x i64> %a, <v
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v10, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 2 x i64> @llvm.vp.xor.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> shufflevector(<vscale x 2 x i64> insertelement(<vscale x 2 x i64> poison, i64 -1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 2 x i64> @llvm.vp.xor.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 -1), <vscale x 2 x i1> %mask, i32 %evl)
%x = call <vscale x 2 x i64> @llvm.vp.and.nxv2i64(<vscale x 2 x i64> %b, <vscale x 2 x i64> %not.a, <vscale x 2 x i1> %mask, i32 %evl)
ret <vscale x 2 x i64> %x
}
@@ -1264,7 +1264,7 @@ define <vscale x 4 x i64> @vandn_vv_vp_nxv4i64(<vscale x 4 x i64> %a, <vscale x
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v12, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 4 x i64> @llvm.vp.xor.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> shufflevector(<vscale x 4 x i64> insertelement(<vscale x 4 x i64> poison, i64 -1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 4 x i64> @llvm.vp.xor.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> splat (i64 -1), <vscale x 4 x i1> %mask, i32 %evl)
%x = call <vscale x 4 x i64> @llvm.vp.and.nxv4i64(<vscale x 4 x i64> %not.a, <vscale x 4 x i64> %b, <vscale x 4 x i1> %mask, i32 %evl)
ret <vscale x 4 x i64> %x
}
@@ -1282,7 +1282,7 @@ define <vscale x 4 x i64> @vandn_vv_vp_swapped_nxv4i64(<vscale x 4 x i64> %a, <v
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v12, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 4 x i64> @llvm.vp.xor.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> shufflevector(<vscale x 4 x i64> insertelement(<vscale x 4 x i64> poison, i64 -1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 4 x i64> @llvm.vp.xor.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> splat (i64 -1), <vscale x 4 x i1> %mask, i32 %evl)
%x = call <vscale x 4 x i64> @llvm.vp.and.nxv4i64(<vscale x 4 x i64> %b, <vscale x 4 x i64> %not.a, <vscale x 4 x i1> %mask, i32 %evl)
ret <vscale x 4 x i64> %x
}
@@ -1355,7 +1355,7 @@ define <vscale x 8 x i64> @vandn_vv_vp_nxv8i64(<vscale x 8 x i64> %a, <vscale x
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v16, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 8 x i64> @llvm.vp.xor.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> shufflevector(<vscale x 8 x i64> insertelement(<vscale x 8 x i64> poison, i64 -1, i32 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 8 x i64> @llvm.vp.xor.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> splat (i64 -1), <vscale x 8 x i1> %mask, i32 %evl)
%x = call <vscale x 8 x i64> @llvm.vp.and.nxv8i64(<vscale x 8 x i64> %not.a, <vscale x 8 x i64> %b, <vscale x 8 x i1> %mask, i32 %evl)
ret <vscale x 8 x i64> %x
}
@@ -1373,7 +1373,7 @@ define <vscale x 8 x i64> @vandn_vv_vp_swapped_nxv8i64(<vscale x 8 x i64> %a, <v
; CHECK-ZVKB-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-ZVKB-NEXT: vandn.vv v8, v16, v8, v0.t
; CHECK-ZVKB-NEXT: ret
- %not.a = call <vscale x 8 x i64> @llvm.vp.xor.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> shufflevector(<vscale x 8 x i64> insertelement(<vscale x 8 x i64> poison, i64 -1, i32 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> %mask, i32 %evl)
+ %not.a = call <vscale x 8 x i64> @llvm.vp.xor.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> splat (i64 -1), <vscale x 8 x i1> %mask, i32 %evl)
%x = call <vscale x 8 x i64> @llvm.vp.and.nxv8i64(<vscale x 8 x i64> %b, <vscale x 8 x i64> %not.a, <vscale x 8 x i1> %mask, i32 %evl)
ret <vscale x 8 x i64> %x
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
index c23c10205e6e..4c64b1677b36 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
@@ -22,7 +22,7 @@ define void @vector_interleave_store_nxv32i1_nxv16i1(<vscale x 16 x i1> %a, <vsc
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a1, a1, 2
; CHECK-NEXT: add a2, a1, a1
-; CHECK-NEXT: vsetvli zero, a2, e8, mf2, tu, ma
+; CHECK-NEXT: vsetvli zero, a2, e8, mf2, ta, ma
; CHECK-NEXT: vslideup.vx v9, v8, a1
; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
; CHECK-NEXT: vsm.v v9, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
index e84fd1b1a703..1acc0fec8fe5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
@@ -24,7 +24,7 @@ define <vscale x 32 x i1> @vector_interleave_nxv32i1_nxv16i1(<vscale x 16 x i1>
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 2
; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
; CHECK-NEXT: vslideup.vx v0, v8, a0
; CHECK-NEXT: ret
;
@@ -44,7 +44,7 @@ define <vscale x 32 x i1> @vector_interleave_nxv32i1_nxv16i1(<vscale x 16 x i1>
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: srli a0, a0, 2
; ZVBB-NEXT: add a1, a0, a0
-; ZVBB-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
+; ZVBB-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
; ZVBB-NEXT: vslideup.vx v0, v8, a0
; ZVBB-NEXT: ret
%res = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
@@ -376,9 +376,9 @@ define <vscale x 4 x half> @vector_interleave_nxv4f16_nxv2f16(<vscale x 2 x half
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v8, v10, a0
; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v10, v8, a0
-; CHECK-NEXT: vmv1r.v v8, v10
+; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
;
; ZVBB-LABEL: vector_interleave_nxv4f16_nxv2f16:
@@ -391,9 +391,9 @@ define <vscale x 4 x half> @vector_interleave_nxv4f16_nxv2f16(<vscale x 2 x half
; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; ZVBB-NEXT: vslidedown.vx v8, v10, a0
; ZVBB-NEXT: add a1, a0, a0
-; ZVBB-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; ZVBB-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; ZVBB-NEXT: vslideup.vx v10, v8, a0
-; ZVBB-NEXT: vmv1r.v v8, v10
+; ZVBB-NEXT: vmv.v.v v8, v10
; ZVBB-NEXT: ret
%res = call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
ret <vscale x 4 x half> %res
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
index 4440ea56ba90..5cfa98916a2d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
@@ -24,7 +24,7 @@ define <vscale x 2 x float> @vfpext_nxv2f16_nxv2f32_unmasked(<vscale x 2 x half>
; CHECK-NEXT: vfwcvt.f.f.v v9, v8
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %v = call <vscale x 2 x float> @llvm.vp.fpext.nxv2f32.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x float> @llvm.vp.fpext.nxv2f32.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x float> %v
}
@@ -50,7 +50,7 @@ define <vscale x 2 x double> @vfpext_nxv2f16_nxv2f64_unmasked(<vscale x 2 x half
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT: vfwcvt.f.f.v v8, v10
; CHECK-NEXT: ret
- %v = call <vscale x 2 x double> @llvm.vp.fpext.nxv2f64.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x double> @llvm.vp.fpext.nxv2f64.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x double> %v
}
@@ -74,7 +74,7 @@ define <vscale x 2 x double> @vfpext_nxv2f32_nxv2f64_unmasked(<vscale x 2 x floa
; CHECK-NEXT: vfwcvt.f.f.v v10, v8
; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
- %v = call <vscale x 2 x double> @llvm.vp.fpext.nxv2f64.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x double> @llvm.vp.fpext.nxv2f64.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x double> %v
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll
index 8e983f63428a..b888fde7d068 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll
@@ -937,7 +937,7 @@ define <vscale x 32 x i1> @vfptosi_nxv32f16_nxv32i1(<vscale x 32 x half> %va) {
; ZVFHMIN-NEXT: vfncvt.rtz.x.f.w v8, v24
; ZVFHMIN-NEXT: vand.vi v8, v8, 1
; ZVFHMIN-NEXT: vmsne.vi v0, v8, 0
-; ZVFHMIN-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
+; ZVFHMIN-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslideup.vx v0, v16, a0
; ZVFHMIN-NEXT: ret
%evec = fptosi <vscale x 32 x half> %va to <vscale x 32 x i1>
@@ -967,7 +967,7 @@ define <vscale x 32 x i1> @vfptoui_nxv32f16_nxv32i1(<vscale x 32 x half> %va) {
; ZVFHMIN-NEXT: vfncvt.rtz.xu.f.w v8, v24
; ZVFHMIN-NEXT: vand.vi v8, v8, 1
; ZVFHMIN-NEXT: vmsne.vi v0, v8, 0
-; ZVFHMIN-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
+; ZVFHMIN-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslideup.vx v0, v16, a0
; ZVFHMIN-NEXT: ret
%evec = fptoui <vscale x 32 x half> %va to <vscale x 32 x i1>
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll
index 9061c38975e2..e5048eaf9d0c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll
@@ -42,7 +42,7 @@ define <vscale x 2 x i1> @vfptosi_nxv2i1_nxv2f16_unmasked(<vscale x 2 x half> %v
; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v9
; ZVFHMIN-NEXT: vmsne.vi v0, v8, 0
; ZVFHMIN-NEXT: ret
- %v = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i1> %v
}
@@ -66,7 +66,7 @@ define <vscale x 2 x i1> @vfptosi_nxv2i1_nxv2f32_unmasked(<vscale x 2 x float> %
; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i1> %v
}
@@ -91,6 +91,6 @@ define <vscale x 2 x i1> @vfptosi_nxv2i1_nxv2f64_unmasked(<vscale x 2 x double>
; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i1> %v
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
index 9e7d6f92d84e..15c4bf255e6d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
@@ -67,7 +67,7 @@ define <vscale x 2 x i8> @vfptosi_nxv2i8_nxv2f16_unmasked(<vscale x 2 x half> %v
; ZVFHMIN-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
; ZVFHMIN-NEXT: vnsrl.wi v8, v8, 0
; ZVFHMIN-NEXT: ret
- %v = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i8> %v
}
@@ -105,7 +105,7 @@ define <vscale x 2 x i16> @vfptosi_nxv2i16_nxv2f16_unmasked(<vscale x 2 x half>
; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.rtz.x.f.w v8, v9
; ZVFHMIN-NEXT: ret
- %v = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i16> %v
}
@@ -145,7 +145,7 @@ define <vscale x 2 x i32> @vfptosi_nxv2i32_nxv2f16_unmasked(<vscale x 2 x half>
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v9
; ZVFHMIN-NEXT: ret
- %v = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i32> %v
}
@@ -187,7 +187,7 @@ define <vscale x 2 x i64> @vfptosi_nxv2i64_nxv2f16_unmasked(<vscale x 2 x half>
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; ZVFHMIN-NEXT: vfwcvt.rtz.x.f.v v8, v10
; ZVFHMIN-NEXT: ret
- %v = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i64> %v
}
@@ -213,7 +213,7 @@ define <vscale x 2 x i8> @vfptosi_nxv2i8_nxv2f32_unmasked(<vscale x 2 x float> %
; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v9, 0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i8> %v
}
@@ -237,7 +237,7 @@ define <vscale x 2 x i16> @vfptosi_nxv2i16_nxv2f32_unmasked(<vscale x 2 x float>
; CHECK-NEXT: vfncvt.rtz.x.f.w v9, v8
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i16> %v
}
@@ -259,7 +259,7 @@ define <vscale x 2 x i32> @vfptosi_nxv2i32_nxv2f32_unmasked(<vscale x 2 x float>
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i32> %v
}
@@ -283,7 +283,7 @@ define <vscale x 2 x i64> @vfptosi_nxv2i64_nxv2f32_unmasked(<vscale x 2 x float>
; CHECK-NEXT: vfwcvt.rtz.x.f.v v10, v8
; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i64> %v
}
@@ -313,7 +313,7 @@ define <vscale x 2 x i8> @vfptosi_nxv2i8_nxv2f64_unmasked(<vscale x 2 x double>
; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v8, 0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i8> %v
}
@@ -339,7 +339,7 @@ define <vscale x 2 x i16> @vfptosi_nxv2i16_nxv2f64_unmasked(<vscale x 2 x double
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v10, 0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i16> %v
}
@@ -363,7 +363,7 @@ define <vscale x 2 x i32> @vfptosi_nxv2i32_nxv2f64_unmasked(<vscale x 2 x double
; CHECK-NEXT: vfncvt.rtz.x.f.w v10, v8
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i32> %v
}
@@ -385,7 +385,7 @@ define <vscale x 2 x i64> @vfptosi_nxv2i64_nxv2f64_unmasked(<vscale x 2 x double
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i64> %v
}
@@ -466,6 +466,6 @@ define <vscale x 32 x i32> @vfptosi_nxv32i32_nxv32f32_unmasked(<vscale x 32 x fl
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8
; CHECK-NEXT: ret
- %v = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> shufflevector (<vscale x 32 x i1> insertelement (<vscale x 32 x i1> undef, i1 true, i32 0), <vscale x 32 x i1> undef, <vscale x 32 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
ret <vscale x 32 x i32> %v
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll
index 6646171fcd15..4b609d07c1e7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll
@@ -42,7 +42,7 @@ define <vscale x 2 x i1> @vfptoui_nxv2i1_nxv2f16_unmasked(<vscale x 2 x half> %v
; ZVFHMIN-NEXT: vfcvt.rtz.xu.f.v v8, v9
; ZVFHMIN-NEXT: vmsne.vi v0, v8, 0
; ZVFHMIN-NEXT: ret
- %v = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i1> %v
}
@@ -66,7 +66,7 @@ define <vscale x 2 x i1> @vfptoui_nxv2i1_nxv2f32_unmasked(<vscale x 2 x float> %
; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i1> %v
}
@@ -91,6 +91,6 @@ define <vscale x 2 x i1> @vfptoui_nxv2i1_nxv2f64_unmasked(<vscale x 2 x double>
; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i1> %v
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
index 486efbe66a6f..a2591e7dc35f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
@@ -67,7 +67,7 @@ define <vscale x 2 x i8> @vfptoui_nxv2i8_nxv2f16_unmasked(<vscale x 2 x half> %v
; ZVFHMIN-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
; ZVFHMIN-NEXT: vnsrl.wi v8, v8, 0
; ZVFHMIN-NEXT: ret
- %v = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i8> %v
}
@@ -105,7 +105,7 @@ define <vscale x 2 x i16> @vfptoui_nxv2i16_nxv2f16_unmasked(<vscale x 2 x half>
; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.rtz.xu.f.w v8, v9
; ZVFHMIN-NEXT: ret
- %v = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i16> %v
}
@@ -145,7 +145,7 @@ define <vscale x 2 x i32> @vfptoui_nxv2i32_nxv2f16_unmasked(<vscale x 2 x half>
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; ZVFHMIN-NEXT: vfcvt.rtz.xu.f.v v8, v9
; ZVFHMIN-NEXT: ret
- %v = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i32> %v
}
@@ -187,7 +187,7 @@ define <vscale x 2 x i64> @vfptoui_nxv2i64_nxv2f16_unmasked(<vscale x 2 x half>
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; ZVFHMIN-NEXT: vfwcvt.rtz.xu.f.v v8, v10
; ZVFHMIN-NEXT: ret
- %v = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i64> %v
}
@@ -213,7 +213,7 @@ define <vscale x 2 x i8> @vfptoui_nxv2i8_nxv2f32_unmasked(<vscale x 2 x float> %
; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v9, 0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i8> %v
}
@@ -237,7 +237,7 @@ define <vscale x 2 x i16> @vfptoui_nxv2i16_nxv2f32_unmasked(<vscale x 2 x float>
; CHECK-NEXT: vfncvt.rtz.xu.f.w v9, v8
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i16> %v
}
@@ -259,7 +259,7 @@ define <vscale x 2 x i32> @vfptoui_nxv2i32_nxv2f32_unmasked(<vscale x 2 x float>
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i32> %v
}
@@ -283,7 +283,7 @@ define <vscale x 2 x i64> @vfptoui_nxv2i64_nxv2f32_unmasked(<vscale x 2 x float>
; CHECK-NEXT: vfwcvt.rtz.xu.f.v v10, v8
; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i64> %v
}
@@ -313,7 +313,7 @@ define <vscale x 2 x i8> @vfptoui_nxv2i8_nxv2f64_unmasked(<vscale x 2 x double>
; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v8, 0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i8> %v
}
@@ -339,7 +339,7 @@ define <vscale x 2 x i16> @vfptoui_nxv2i16_nxv2f64_unmasked(<vscale x 2 x double
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v10, 0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i16> %v
}
@@ -363,7 +363,7 @@ define <vscale x 2 x i32> @vfptoui_nxv2i32_nxv2f64_unmasked(<vscale x 2 x double
; CHECK-NEXT: vfncvt.rtz.xu.f.w v10, v8
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i32> %v
}
@@ -385,7 +385,7 @@ define <vscale x 2 x i64> @vfptoui_nxv2i64_nxv2f64_unmasked(<vscale x 2 x double
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i64> %v
}
@@ -466,6 +466,6 @@ define <vscale x 32 x i32> @vfptoui_nxv32i32_nxv32f32_unmasked(<vscale x 32 x fl
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8
; CHECK-NEXT: ret
- %v = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> shufflevector (<vscale x 32 x i1> insertelement (<vscale x 32 x i1> undef, i1 true, i32 0), <vscale x 32 x i1> undef, <vscale x 32 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
ret <vscale x 32 x i32> %v
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
index c6554561be33..4e84a31d71b5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
@@ -24,7 +24,7 @@ define <vscale x 2 x half> @vfptrunc_nxv2f16_nxv2f32_unmasked(<vscale x 2 x floa
; CHECK-NEXT: vfncvt.f.f.w v9, v8
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %v = call <vscale x 2 x half> @llvm.vp.fptrunc.nxv2f16.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x half> @llvm.vp.fptrunc.nxv2f16.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x half> %v
}
@@ -50,7 +50,7 @@ define <vscale x 2 x half> @vfptrunc_nxv2f16_nxv2f64_unmasked(<vscale x 2 x doub
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vfncvt.f.f.w v8, v10
; CHECK-NEXT: ret
- %v = call <vscale x 2 x half> @llvm.vp.fptrunc.nxv2f16.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x half> @llvm.vp.fptrunc.nxv2f16.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x half> %v
}
@@ -74,7 +74,7 @@ define <vscale x 2 x float> @vfptrunc_nxv2f32_nxv2f64_unmasked(<vscale x 2 x dou
; CHECK-NEXT: vfncvt.f.f.w v10, v8
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %v = call <vscale x 2 x float> @llvm.vp.fptrunc.nxv2f64.nxv2f32(<vscale x 2 x double> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x float> @llvm.vp.fptrunc.nxv2f64.nxv2f32(<vscale x 2 x double> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x float> %v
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwadd-vp.ll
index 64887da78cb7..1ef0ed858d80 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwadd-vp.ll
@@ -22,8 +22,8 @@ define <vscale x 2 x float> @vfwadd_same_operand(<vscale x 2 x half> %arg, i32 s
; ZVFHMIN-NEXT: vfadd.vv v8, v9, v9
; ZVFHMIN-NEXT: ret
bb:
- %tmp = call <vscale x 2 x float> @llvm.vp.fpext.nxv2f32.nxv2f16(<vscale x 2 x half> %arg, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %vl)
- %tmp2 = call <vscale x 2 x float> @llvm.vp.fadd.nxv2f32(<vscale x 2 x float> %tmp, <vscale x 2 x float> %tmp, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %tmp = call <vscale x 2 x float> @llvm.vp.fpext.nxv2f32.nxv2f16(<vscale x 2 x half> %arg, <vscale x 2 x i1> splat (i1 true), i32 %vl)
+ %tmp2 = call <vscale x 2 x float> @llvm.vp.fadd.nxv2f32(<vscale x 2 x float> %tmp, <vscale x 2 x float> %tmp, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x float> %tmp2
}
@@ -48,9 +48,9 @@ define <vscale x 2 x float> @vfwadd_tu(<vscale x 2 x half> %arg, <vscale x 2 x f
; ZVFHMIN-NEXT: vmv1r.v v8, v9
; ZVFHMIN-NEXT: ret
bb:
- %tmp = call <vscale x 2 x float> @llvm.vp.fpext.nxv2f32.nxv2f16(<vscale x 2 x half> %arg, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %arg2)
- %tmp3 = call <vscale x 2 x float> @llvm.vp.fadd.nxv2f32(<vscale x 2 x float> %arg1, <vscale x 2 x float> %tmp, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %arg2)
- %tmp4 = call <vscale x 2 x float> @llvm.vp.merge.nxv2f32(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x float> %tmp3, <vscale x 2 x float> %arg1, i32 %arg2)
+ %tmp = call <vscale x 2 x float> @llvm.vp.fpext.nxv2f32.nxv2f16(<vscale x 2 x half> %arg, <vscale x 2 x i1> splat (i1 true), i32 %arg2)
+ %tmp3 = call <vscale x 2 x float> @llvm.vp.fadd.nxv2f32(<vscale x 2 x float> %arg1, <vscale x 2 x float> %tmp, <vscale x 2 x i1> splat (i1 true), i32 %arg2)
+ %tmp4 = call <vscale x 2 x float> @llvm.vp.merge.nxv2f32(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x float> %tmp3, <vscale x 2 x float> %arg1, i32 %arg2)
ret <vscale x 2 x float> %tmp4
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vmadd-sdnode.ll
index 07536407ace8..e1988c058fac 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmadd-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmadd-sdnode.ll
@@ -597,7 +597,7 @@ define <vscale x 4 x i32> @combine_mul_add_imm1(<vscale x 4 x i32> %a, <vscale x
; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
; CHECK-NEXT: vmadd.vv v8, v10, v10
; CHECK-NEXT: ret
- %x = add <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %x = add <vscale x 4 x i32> %a, splat (i32 1)
%y = mul <vscale x 4 x i32> %x, %b
ret <vscale x 4 x i32> %y
}
@@ -608,7 +608,7 @@ define <vscale x 4 x i32> @combine_mul_add_imm1_2(<vscale x 4 x i32> %a, <vscale
; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
; CHECK-NEXT: vmadd.vv v8, v10, v10
; CHECK-NEXT: ret
- %x = add <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %x = add <vscale x 4 x i32> %a, splat (i32 1)
%y = mul <vscale x 4 x i32> %b, %x
ret <vscale x 4 x i32> %y
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vnmsub-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vnmsub-sdnode.ll
index a3c896ecca22..186ffb64e590 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vnmsub-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vnmsub-sdnode.ll
@@ -597,7 +597,7 @@ define <vscale x 4 x i32> @combine_mul_sub_imm1(<vscale x 4 x i32> %a, <vscale x
; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
; CHECK-NEXT: vnmsub.vv v8, v10, v10
; CHECK-NEXT: ret
- %x = sub <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %a
+ %x = sub <vscale x 4 x i32> splat (i32 1), %a
%y = mul <vscale x 4 x i32> %x, %b
ret <vscale x 4 x i32> %y
}
@@ -608,7 +608,7 @@ define <vscale x 4 x i32> @combine_mul_sub_imm1_2(<vscale x 4 x i32> %a, <vscale
; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
; CHECK-NEXT: vnmsub.vv v8, v10, v10
; CHECK-NEXT: ret
- %x = sub <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %a
+ %x = sub <vscale x 4 x i32> splat (i32 1), %a
%y = mul <vscale x 4 x i32> %b, %x
ret <vscale x 4 x i32> %y
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
index 2546ec95a007..6d42b15273cf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
@@ -894,7 +894,7 @@ define half @vreduce_ord_fadd_nxv3f16(<vscale x 3 x half> %v, half %s) {
; CHECK-NEXT: lui a2, 1048568
; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, ma
; CHECK-NEXT: vmv.v.x v9, a2
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a1
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v9, fa0
@@ -916,7 +916,7 @@ define half @vreduce_ord_fadd_nxv6f16(<vscale x 6 x half> %v, half %s) {
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 2
; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v9, v10, a0
; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; CHECK-NEXT: vfmv.s.f v10, fa0
@@ -938,11 +938,11 @@ define half @vreduce_ord_fadd_nxv10f16(<vscale x 10 x half> %v, half %s) {
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 2
; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v10, v12, a0
; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
; CHECK-NEXT: vmv.v.v v11, v12
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v11, v12, a0
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT: vfmv.s.f v12, fa0
@@ -982,7 +982,7 @@ define half @vreduce_fadd_nxv3f16(<vscale x 3 x half> %v, half %s) {
; CHECK-NEXT: lui a2, 1048568
; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, ma
; CHECK-NEXT: vmv.v.x v9, a2
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a1
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v9, fa0
@@ -1002,7 +1002,7 @@ define half @vreduce_fadd_nxv6f16(<vscale x 6 x half> %v, half %s) {
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 2
; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v9, v10, a0
; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; CHECK-NEXT: vfmv.s.f v10, fa0
@@ -1025,11 +1025,11 @@ define half @vreduce_fmin_nxv10f16(<vscale x 10 x half> %v) {
; CHECK-NEXT: vlse16.v v12, (a1), zero
; CHECK-NEXT: srli a0, a0, 2
; CHECK-NEXT: add a1, a0, a0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v10, v12, a0
; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
; CHECK-NEXT: vmv.v.v v11, v12
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v11, v12, a0
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT: vfredmin.vs v8, v8, v8
diff --git a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll
index b8a091b24259..16abf2bd28ac 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll
@@ -66,7 +66,7 @@ define <vscale x 1 x i8> @vror_vi_nxv1i8(<vscale x 1 x i8> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 1 x i8> @llvm.fshr.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %a, <vscale x 1 x i8> shufflevector(<vscale x 1 x i8> insertelement(<vscale x 1 x i8> poison, i8 1, i32 0), <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer))
+ %x = call <vscale x 1 x i8> @llvm.fshr.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %a, <vscale x 1 x i8> splat (i8 1))
ret <vscale x 1 x i8> %x
}
@@ -84,7 +84,7 @@ define <vscale x 1 x i8> @vror_vi_rotl_nxv1i8(<vscale x 1 x i8> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 7
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 1 x i8> @llvm.fshl.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %a, <vscale x 1 x i8> shufflevector(<vscale x 1 x i8> insertelement(<vscale x 1 x i8> poison, i8 1, i32 0), <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer))
+ %x = call <vscale x 1 x i8> @llvm.fshl.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %a, <vscale x 1 x i8> splat (i8 1))
ret <vscale x 1 x i8> %x
}
@@ -150,7 +150,7 @@ define <vscale x 2 x i8> @vror_vi_nxv2i8(<vscale x 2 x i8> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 2 x i8> @llvm.fshr.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %a, <vscale x 2 x i8> shufflevector(<vscale x 2 x i8> insertelement(<vscale x 2 x i8> poison, i8 1, i32 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer))
+ %x = call <vscale x 2 x i8> @llvm.fshr.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %a, <vscale x 2 x i8> splat (i8 1))
ret <vscale x 2 x i8> %x
}
@@ -168,7 +168,7 @@ define <vscale x 2 x i8> @vror_vi_rotl_nxv2i8(<vscale x 2 x i8> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 7
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 2 x i8> @llvm.fshl.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %a, <vscale x 2 x i8> shufflevector(<vscale x 2 x i8> insertelement(<vscale x 2 x i8> poison, i8 1, i32 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer))
+ %x = call <vscale x 2 x i8> @llvm.fshl.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %a, <vscale x 2 x i8> splat (i8 1))
ret <vscale x 2 x i8> %x
}
@@ -234,7 +234,7 @@ define <vscale x 4 x i8> @vror_vi_nxv4i8(<vscale x 4 x i8> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 4 x i8> @llvm.fshr.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %a, <vscale x 4 x i8> shufflevector(<vscale x 4 x i8> insertelement(<vscale x 4 x i8> poison, i8 1, i32 0), <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer))
+ %x = call <vscale x 4 x i8> @llvm.fshr.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %a, <vscale x 4 x i8> splat (i8 1))
ret <vscale x 4 x i8> %x
}
@@ -252,7 +252,7 @@ define <vscale x 4 x i8> @vror_vi_rotl_nxv4i8(<vscale x 4 x i8> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 7
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 4 x i8> @llvm.fshl.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %a, <vscale x 4 x i8> shufflevector(<vscale x 4 x i8> insertelement(<vscale x 4 x i8> poison, i8 1, i32 0), <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer))
+ %x = call <vscale x 4 x i8> @llvm.fshl.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %a, <vscale x 4 x i8> splat (i8 1))
ret <vscale x 4 x i8> %x
}
@@ -318,7 +318,7 @@ define <vscale x 8 x i8> @vror_vi_nxv8i8(<vscale x 8 x i8> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 8 x i8> @llvm.fshr.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %a, <vscale x 8 x i8> shufflevector(<vscale x 8 x i8> insertelement(<vscale x 8 x i8> poison, i8 1, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer))
+ %x = call <vscale x 8 x i8> @llvm.fshr.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %a, <vscale x 8 x i8> splat (i8 1))
ret <vscale x 8 x i8> %x
}
@@ -336,7 +336,7 @@ define <vscale x 8 x i8> @vror_vi_rotl_nxv8i8(<vscale x 8 x i8> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 7
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 8 x i8> @llvm.fshl.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %a, <vscale x 8 x i8> shufflevector(<vscale x 8 x i8> insertelement(<vscale x 8 x i8> poison, i8 1, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer))
+ %x = call <vscale x 8 x i8> @llvm.fshl.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %a, <vscale x 8 x i8> splat (i8 1))
ret <vscale x 8 x i8> %x
}
@@ -402,7 +402,7 @@ define <vscale x 16 x i8> @vror_vi_nxv16i8(<vscale x 16 x i8> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, m2, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 16 x i8> @llvm.fshr.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> shufflevector(<vscale x 16 x i8> insertelement(<vscale x 16 x i8> poison, i8 1, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer))
+ %x = call <vscale x 16 x i8> @llvm.fshr.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> splat (i8 1))
ret <vscale x 16 x i8> %x
}
@@ -420,7 +420,7 @@ define <vscale x 16 x i8> @vror_vi_rotl_nxv16i8(<vscale x 16 x i8> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, m2, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 7
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> shufflevector(<vscale x 16 x i8> insertelement(<vscale x 16 x i8> poison, i8 1, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer))
+ %x = call <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> splat (i8 1))
ret <vscale x 16 x i8> %x
}
@@ -486,7 +486,7 @@ define <vscale x 32 x i8> @vror_vi_nxv32i8(<vscale x 32 x i8> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, m4, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 32 x i8> @llvm.fshr.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %a, <vscale x 32 x i8> shufflevector(<vscale x 32 x i8> insertelement(<vscale x 32 x i8> poison, i8 1, i32 0), <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer))
+ %x = call <vscale x 32 x i8> @llvm.fshr.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %a, <vscale x 32 x i8> splat (i8 1))
ret <vscale x 32 x i8> %x
}
@@ -504,7 +504,7 @@ define <vscale x 32 x i8> @vror_vi_rotl_nxv32i8(<vscale x 32 x i8> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, m4, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 7
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 32 x i8> @llvm.fshl.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %a, <vscale x 32 x i8> shufflevector(<vscale x 32 x i8> insertelement(<vscale x 32 x i8> poison, i8 1, i32 0), <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer))
+ %x = call <vscale x 32 x i8> @llvm.fshl.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %a, <vscale x 32 x i8> splat (i8 1))
ret <vscale x 32 x i8> %x
}
@@ -570,7 +570,7 @@ define <vscale x 64 x i8> @vror_vi_nxv64i8(<vscale x 64 x i8> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, m8, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 64 x i8> @llvm.fshr.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %a, <vscale x 64 x i8> shufflevector(<vscale x 64 x i8> insertelement(<vscale x 64 x i8> poison, i8 1, i32 0), <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer))
+ %x = call <vscale x 64 x i8> @llvm.fshr.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %a, <vscale x 64 x i8> splat (i8 1))
ret <vscale x 64 x i8> %x
}
@@ -588,7 +588,7 @@ define <vscale x 64 x i8> @vror_vi_rotl_nxv64i8(<vscale x 64 x i8> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, m8, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 7
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 64 x i8> @llvm.fshl.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %a, <vscale x 64 x i8> shufflevector(<vscale x 64 x i8> insertelement(<vscale x 64 x i8> poison, i8 1, i32 0), <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer))
+ %x = call <vscale x 64 x i8> @llvm.fshl.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %a, <vscale x 64 x i8> splat (i8 1))
ret <vscale x 64 x i8> %x
}
@@ -654,7 +654,7 @@ define <vscale x 1 x i16> @vror_vi_nxv1i16(<vscale x 1 x i16> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 1 x i16> @llvm.fshr.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %a, <vscale x 1 x i16> shufflevector(<vscale x 1 x i16> insertelement(<vscale x 1 x i16> poison, i16 1, i32 0), <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer))
+ %x = call <vscale x 1 x i16> @llvm.fshr.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %a, <vscale x 1 x i16> splat (i16 1))
ret <vscale x 1 x i16> %x
}
@@ -672,7 +672,7 @@ define <vscale x 1 x i16> @vror_vi_rotl_nxv1i16(<vscale x 1 x i16> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 15
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 1 x i16> @llvm.fshl.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %a, <vscale x 1 x i16> shufflevector(<vscale x 1 x i16> insertelement(<vscale x 1 x i16> poison, i16 1, i32 0), <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer))
+ %x = call <vscale x 1 x i16> @llvm.fshl.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %a, <vscale x 1 x i16> splat (i16 1))
ret <vscale x 1 x i16> %x
}
@@ -738,7 +738,7 @@ define <vscale x 2 x i16> @vror_vi_nxv2i16(<vscale x 2 x i16> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 2 x i16> @llvm.fshr.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %a, <vscale x 2 x i16> shufflevector(<vscale x 2 x i16> insertelement(<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer))
+ %x = call <vscale x 2 x i16> @llvm.fshr.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %a, <vscale x 2 x i16> splat (i16 1))
ret <vscale x 2 x i16> %x
}
@@ -756,7 +756,7 @@ define <vscale x 2 x i16> @vror_vi_rotl_nxv2i16(<vscale x 2 x i16> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 15
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 2 x i16> @llvm.fshl.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %a, <vscale x 2 x i16> shufflevector(<vscale x 2 x i16> insertelement(<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer))
+ %x = call <vscale x 2 x i16> @llvm.fshl.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %a, <vscale x 2 x i16> splat (i16 1))
ret <vscale x 2 x i16> %x
}
@@ -822,7 +822,7 @@ define <vscale x 4 x i16> @vror_vi_nxv4i16(<vscale x 4 x i16> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 4 x i16> @llvm.fshr.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %a, <vscale x 4 x i16> shufflevector(<vscale x 4 x i16> insertelement(<vscale x 4 x i16> poison, i16 1, i32 0), <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer))
+ %x = call <vscale x 4 x i16> @llvm.fshr.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %a, <vscale x 4 x i16> splat (i16 1))
ret <vscale x 4 x i16> %x
}
@@ -840,7 +840,7 @@ define <vscale x 4 x i16> @vror_vi_rotl_nxv4i16(<vscale x 4 x i16> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 15
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 4 x i16> @llvm.fshl.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %a, <vscale x 4 x i16> shufflevector(<vscale x 4 x i16> insertelement(<vscale x 4 x i16> poison, i16 1, i32 0), <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer))
+ %x = call <vscale x 4 x i16> @llvm.fshl.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %a, <vscale x 4 x i16> splat (i16 1))
ret <vscale x 4 x i16> %x
}
@@ -906,7 +906,7 @@ define <vscale x 8 x i16> @vror_vi_nxv8i16(<vscale x 8 x i16> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 8 x i16> @llvm.fshr.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 8 x i16> shufflevector(<vscale x 8 x i16> insertelement(<vscale x 8 x i16> poison, i16 1, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer))
+ %x = call <vscale x 8 x i16> @llvm.fshr.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 8 x i16> splat (i16 1))
ret <vscale x 8 x i16> %x
}
@@ -924,7 +924,7 @@ define <vscale x 8 x i16> @vror_vi_rotl_nxv8i16(<vscale x 8 x i16> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 15
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 8 x i16> shufflevector(<vscale x 8 x i16> insertelement(<vscale x 8 x i16> poison, i16 1, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer))
+ %x = call <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 8 x i16> splat (i16 1))
ret <vscale x 8 x i16> %x
}
@@ -990,7 +990,7 @@ define <vscale x 16 x i16> @vror_vi_nxv16i16(<vscale x 16 x i16> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 16 x i16> @llvm.fshr.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %a, <vscale x 16 x i16> shufflevector(<vscale x 16 x i16> insertelement(<vscale x 16 x i16> poison, i16 1, i32 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer))
+ %x = call <vscale x 16 x i16> @llvm.fshr.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %a, <vscale x 16 x i16> splat (i16 1))
ret <vscale x 16 x i16> %x
}
@@ -1008,7 +1008,7 @@ define <vscale x 16 x i16> @vror_vi_rotl_nxv16i16(<vscale x 16 x i16> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 15
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 16 x i16> @llvm.fshl.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %a, <vscale x 16 x i16> shufflevector(<vscale x 16 x i16> insertelement(<vscale x 16 x i16> poison, i16 1, i32 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer))
+ %x = call <vscale x 16 x i16> @llvm.fshl.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %a, <vscale x 16 x i16> splat (i16 1))
ret <vscale x 16 x i16> %x
}
@@ -1074,7 +1074,7 @@ define <vscale x 32 x i16> @vror_vi_nxv32i16(<vscale x 32 x i16> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, m8, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 32 x i16> @llvm.fshr.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %a, <vscale x 32 x i16> shufflevector(<vscale x 32 x i16> insertelement(<vscale x 32 x i16> poison, i16 1, i32 0), <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer))
+ %x = call <vscale x 32 x i16> @llvm.fshr.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %a, <vscale x 32 x i16> splat (i16 1))
ret <vscale x 32 x i16> %x
}
@@ -1092,7 +1092,7 @@ define <vscale x 32 x i16> @vror_vi_rotl_nxv32i16(<vscale x 32 x i16> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, m8, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 15
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 32 x i16> @llvm.fshl.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %a, <vscale x 32 x i16> shufflevector(<vscale x 32 x i16> insertelement(<vscale x 32 x i16> poison, i16 1, i32 0), <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer))
+ %x = call <vscale x 32 x i16> @llvm.fshl.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %a, <vscale x 32 x i16> splat (i16 1))
ret <vscale x 32 x i16> %x
}
@@ -1171,7 +1171,7 @@ define <vscale x 1 x i32> @vror_vi_nxv1i32(<vscale x 1 x i32> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 1 x i32> @llvm.fshr.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %a, <vscale x 1 x i32> shufflevector(<vscale x 1 x i32> insertelement(<vscale x 1 x i32> poison, i32 1, i32 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer))
+ %x = call <vscale x 1 x i32> @llvm.fshr.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %a, <vscale x 1 x i32> splat (i32 1))
ret <vscale x 1 x i32> %x
}
@@ -1189,7 +1189,7 @@ define <vscale x 1 x i32> @vror_vi_rotl_nxv1i32(<vscale x 1 x i32> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 31
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 1 x i32> @llvm.fshl.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %a, <vscale x 1 x i32> shufflevector(<vscale x 1 x i32> insertelement(<vscale x 1 x i32> poison, i32 1, i32 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer))
+ %x = call <vscale x 1 x i32> @llvm.fshl.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %a, <vscale x 1 x i32> splat (i32 1))
ret <vscale x 1 x i32> %x
}
@@ -1268,7 +1268,7 @@ define <vscale x 2 x i32> @vror_vi_nxv2i32(<vscale x 2 x i32> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e32, m1, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 2 x i32> @llvm.fshr.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %a, <vscale x 2 x i32> shufflevector(<vscale x 2 x i32> insertelement(<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer))
+ %x = call <vscale x 2 x i32> @llvm.fshr.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %a, <vscale x 2 x i32> splat (i32 1))
ret <vscale x 2 x i32> %x
}
@@ -1286,7 +1286,7 @@ define <vscale x 2 x i32> @vror_vi_rotl_nxv2i32(<vscale x 2 x i32> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e32, m1, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 31
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 2 x i32> @llvm.fshl.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %a, <vscale x 2 x i32> shufflevector(<vscale x 2 x i32> insertelement(<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer))
+ %x = call <vscale x 2 x i32> @llvm.fshl.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %a, <vscale x 2 x i32> splat (i32 1))
ret <vscale x 2 x i32> %x
}
@@ -1365,7 +1365,7 @@ define <vscale x 4 x i32> @vror_vi_nxv4i32(<vscale x 4 x i32> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e32, m2, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> shufflevector(<vscale x 4 x i32> insertelement(<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
+ %x = call <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 1))
ret <vscale x 4 x i32> %x
}
@@ -1383,7 +1383,7 @@ define <vscale x 4 x i32> @vror_vi_rotl_nxv4i32(<vscale x 4 x i32> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e32, m2, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 31
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> shufflevector(<vscale x 4 x i32> insertelement(<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
+ %x = call <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 1))
ret <vscale x 4 x i32> %x
}
@@ -1462,7 +1462,7 @@ define <vscale x 8 x i32> @vror_vi_nxv8i32(<vscale x 8 x i32> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e32, m4, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 8 x i32> @llvm.fshr.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %a, <vscale x 8 x i32> shufflevector(<vscale x 8 x i32> insertelement(<vscale x 8 x i32> poison, i32 1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer))
+ %x = call <vscale x 8 x i32> @llvm.fshr.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %a, <vscale x 8 x i32> splat (i32 1))
ret <vscale x 8 x i32> %x
}
@@ -1480,7 +1480,7 @@ define <vscale x 8 x i32> @vror_vi_rotl_nxv8i32(<vscale x 8 x i32> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e32, m4, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 31
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 8 x i32> @llvm.fshl.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %a, <vscale x 8 x i32> shufflevector(<vscale x 8 x i32> insertelement(<vscale x 8 x i32> poison, i32 1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer))
+ %x = call <vscale x 8 x i32> @llvm.fshl.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %a, <vscale x 8 x i32> splat (i32 1))
ret <vscale x 8 x i32> %x
}
@@ -1559,7 +1559,7 @@ define <vscale x 16 x i32> @vror_vi_nxv16i32(<vscale x 16 x i32> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 16 x i32> @llvm.fshr.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %a, <vscale x 16 x i32> shufflevector(<vscale x 16 x i32> insertelement(<vscale x 16 x i32> poison, i32 1, i32 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer))
+ %x = call <vscale x 16 x i32> @llvm.fshr.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %a, <vscale x 16 x i32> splat (i32 1))
ret <vscale x 16 x i32> %x
}
@@ -1577,7 +1577,7 @@ define <vscale x 16 x i32> @vror_vi_rotl_nxv16i32(<vscale x 16 x i32> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 31
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 16 x i32> @llvm.fshl.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %a, <vscale x 16 x i32> shufflevector(<vscale x 16 x i32> insertelement(<vscale x 16 x i32> poison, i32 1, i32 0), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer))
+ %x = call <vscale x 16 x i32> @llvm.fshl.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %a, <vscale x 16 x i32> splat (i32 1))
ret <vscale x 16 x i32> %x
}
@@ -1657,7 +1657,7 @@ define <vscale x 1 x i64> @vror_vi_nxv1i64(<vscale x 1 x i64> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 1 x i64> @llvm.fshr.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %a, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer))
+ %x = call <vscale x 1 x i64> @llvm.fshr.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %a, <vscale x 1 x i64> splat (i64 1))
ret <vscale x 1 x i64> %x
}
@@ -1676,7 +1676,7 @@ define <vscale x 1 x i64> @vror_vi_rotl_nxv1i64(<vscale x 1 x i64> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 63
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 1 x i64> @llvm.fshl.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %a, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer))
+ %x = call <vscale x 1 x i64> @llvm.fshl.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %a, <vscale x 1 x i64> splat (i64 1))
ret <vscale x 1 x i64> %x
}
@@ -1756,7 +1756,7 @@ define <vscale x 2 x i64> @vror_vi_nxv2i64(<vscale x 2 x i64> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e64, m2, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> shufflevector(<vscale x 2 x i64> insertelement(<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer))
+ %x = call <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 1))
ret <vscale x 2 x i64> %x
}
@@ -1775,7 +1775,7 @@ define <vscale x 2 x i64> @vror_vi_rotl_nxv2i64(<vscale x 2 x i64> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e64, m2, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 63
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> shufflevector(<vscale x 2 x i64> insertelement(<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer))
+ %x = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 1))
ret <vscale x 2 x i64> %x
}
@@ -1855,7 +1855,7 @@ define <vscale x 4 x i64> @vror_vi_nxv4i64(<vscale x 4 x i64> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 4 x i64> @llvm.fshr.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %a, <vscale x 4 x i64> shufflevector(<vscale x 4 x i64> insertelement(<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer))
+ %x = call <vscale x 4 x i64> @llvm.fshr.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %a, <vscale x 4 x i64> splat (i64 1))
ret <vscale x 4 x i64> %x
}
@@ -1874,7 +1874,7 @@ define <vscale x 4 x i64> @vror_vi_rotl_nxv4i64(<vscale x 4 x i64> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 63
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 4 x i64> @llvm.fshl.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %a, <vscale x 4 x i64> shufflevector(<vscale x 4 x i64> insertelement(<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer))
+ %x = call <vscale x 4 x i64> @llvm.fshl.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %a, <vscale x 4 x i64> splat (i64 1))
ret <vscale x 4 x i64> %x
}
@@ -1954,7 +1954,7 @@ define <vscale x 8 x i64> @vror_vi_nxv8i64(<vscale x 8 x i64> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 8 x i64> @llvm.fshr.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %a, <vscale x 8 x i64> shufflevector(<vscale x 8 x i64> insertelement(<vscale x 8 x i64> poison, i64 1, i32 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer))
+ %x = call <vscale x 8 x i64> @llvm.fshr.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %a, <vscale x 8 x i64> splat (i64 1))
ret <vscale x 8 x i64> %x
}
@@ -1973,6 +1973,6 @@ define <vscale x 8 x i64> @vror_vi_rotl_nxv8i64(<vscale x 8 x i64> %a) {
; CHECK-ZVKB-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-ZVKB-NEXT: vror.vi v8, v8, 63
; CHECK-ZVKB-NEXT: ret
- %x = call <vscale x 8 x i64> @llvm.fshl.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %a, <vscale x 8 x i64> shufflevector(<vscale x 8 x i64> insertelement(<vscale x 8 x i64> poison, i64 1, i32 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer))
+ %x = call <vscale x 8 x i64> @llvm.fshl.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %a, <vscale x 8 x i64> splat (i64 1))
ret <vscale x 8 x i64> %x
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
index 1b568bf8801b..0d52dd794fd5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
@@ -762,7 +762,7 @@ define <vscale x 2 x i1> @select_one(<vscale x 2 x i1> %x, <vscale x 2 x i1> %y,
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
; CHECK-NEXT: vmorn.mm v0, v8, v0
; CHECK-NEXT: ret
- %a = call <vscale x 2 x i1> @llvm.vp.select.nxv2i1(<vscale x 2 x i1> %x, <vscale x 2 x i1> %y, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %a = call <vscale x 2 x i1> @llvm.vp.select.nxv2i1(<vscale x 2 x i1> %x, <vscale x 2 x i1> %y, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i1> %a
}
@@ -782,7 +782,7 @@ define <vscale x 2 x i1> @select_x_one(<vscale x 2 x i1> %x, <vscale x 2 x i1> %
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
; CHECK-NEXT: vmorn.mm v0, v8, v0
; CHECK-NEXT: ret
- %a = call <vscale x 2 x i1> @llvm.vp.select.nxv2i1(<vscale x 2 x i1> %x, <vscale x 2 x i1> %y, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %a = call <vscale x 2 x i1> @llvm.vp.select.nxv2i1(<vscale x 2 x i1> %x, <vscale x 2 x i1> %y, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x i1> %a
}
@@ -802,7 +802,7 @@ define <vscale x 2 x i1> @select_one_x(<vscale x 2 x i1> %x, <vscale x 2 x i1> %
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
; CHECK-NEXT: vmor.mm v0, v0, v8
; CHECK-NEXT: ret
- %a = call <vscale x 2 x i1> @llvm.vp.select.nxv2i1(<vscale x 2 x i1> %x, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %y, i32 %evl)
+ %a = call <vscale x 2 x i1> @llvm.vp.select.nxv2i1(<vscale x 2 x i1> %x, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i1> %y, i32 %evl)
ret <vscale x 2 x i1> %a
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsext-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vsext-vp-mask.ll
index 66f9e8dc9c5f..04aed5d81db9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsext-vp-mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsext-vp-mask.ll
@@ -22,7 +22,7 @@ define <vscale x 2 x i16> @vsext_nxv2i1_nxv2i16_unmasked(<vscale x 2 x i1> %a, i
; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i16.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i16.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i16> %v
}
@@ -46,7 +46,7 @@ define <vscale x 2 x i32> @vsext_nxv2i1_nxv2i32_unmasked(<vscale x 2 x i1> %a, i
; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i32> %v
}
@@ -70,6 +70,6 @@ define <vscale x 2 x i64> @vsext_nxv2i1_nxv2i64_unmasked(<vscale x 2 x i1> %a, i
; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i64> %v
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
index 8aaa74c8e21f..834e7dd85aea 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
@@ -22,7 +22,7 @@ define <vscale x 2 x i16> @vsext_nxv2i8_nxv2i16_unmasked(<vscale x 2 x i8> %a, i
; CHECK-NEXT: vsext.vf2 v9, v8
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i16.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i16.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i16> %v
}
@@ -46,7 +46,7 @@ define <vscale x 2 x i32> @vsext_nxv2i8_nxv2i32_unmasked(<vscale x 2 x i8> %a, i
; CHECK-NEXT: vsext.vf4 v9, v8
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i32> %v
}
@@ -70,7 +70,7 @@ define <vscale x 2 x i64> @vsext_nxv2i8_nxv2i64_unmasked(<vscale x 2 x i8> %a, i
; CHECK-NEXT: vsext.vf8 v10, v8
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i64> %v
}
@@ -94,7 +94,7 @@ define <vscale x 2 x i32> @vsext_nxv2i16_nxv2i32_unmasked(<vscale x 2 x i16> %a,
; CHECK-NEXT: vsext.vf2 v9, v8
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i32> %v
}
@@ -118,7 +118,7 @@ define <vscale x 2 x i64> @vsext_nxv2i16_nxv2i64_unmasked(<vscale x 2 x i16> %a,
; CHECK-NEXT: vsext.vf4 v10, v8
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i64> %v
}
@@ -142,7 +142,7 @@ define <vscale x 2 x i64> @vsext_nxv2i32_nxv2i64_unmasked(<vscale x 2 x i32> %a,
; CHECK-NEXT: vsext.vf2 v10, v8
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i64> %v
}
@@ -195,6 +195,6 @@ define <vscale x 32 x i32> @vsext_nxv32i8_nxv32i32_unmasked(<vscale x 32 x i8> %
; CHECK-NEXT: vsext.vf4 v24, v8
; CHECK-NEXT: vmv.v.v v8, v24
; CHECK-NEXT: ret
- %v = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i1> shufflevector (<vscale x 32 x i1> insertelement (<vscale x 32 x i1> undef, i1 true, i32 0), <vscale x 32 x i1> undef, <vscale x 32 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i1> splat (i1 true), i32 %vl)
ret <vscale x 32 x i32> %v
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp-mask.ll
index 43451f446b37..6e09ceefb729 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp-mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp-mask.ll
@@ -25,7 +25,7 @@ define <vscale x 2 x half> @vsitofp_nxv2f16_nxv2i1_unmasked(<vscale x 2 x i1> %v
; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
; CHECK-NEXT: vfcvt.f.x.v v8, v8
; CHECK-NEXT: ret
- %v = call <vscale x 2 x half> @llvm.vp.sitofp.nxv2f16.nxv2i1(<vscale x 2 x i1> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x half> @llvm.vp.sitofp.nxv2f16.nxv2i1(<vscale x 2 x i1> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x half> %v
}
@@ -52,7 +52,7 @@ define <vscale x 2 x float> @vsitofp_nxv2f32_nxv2i1_unmasked(<vscale x 2 x i1> %
; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
; CHECK-NEXT: vfcvt.f.x.v v8, v8
; CHECK-NEXT: ret
- %v = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i1(<vscale x 2 x i1> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i1(<vscale x 2 x i1> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x float> %v
}
@@ -79,6 +79,6 @@ define <vscale x 2 x double> @vsitofp_nxv2f64_nxv2i1_unmasked(<vscale x 2 x i1>
; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
; CHECK-NEXT: vfcvt.f.x.v v8, v8
; CHECK-NEXT: ret
- %v = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i1(<vscale x 2 x i1> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i1(<vscale x 2 x i1> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x double> %v
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
index 62848ea2279a..016a43784733 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
@@ -69,7 +69,7 @@ define <vscale x 2 x half> @vsitofp_nxv2f16_nxv2i8_unmasked(<vscale x 2 x i8> %v
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: ret
- %v = call <vscale x 2 x half> @llvm.vp.sitofp.nxv2f16.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x half> @llvm.vp.sitofp.nxv2f16.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x half> %v
}
@@ -107,7 +107,7 @@ define <vscale x 2 x half> @vsitofp_nxv2f16_nxv2i16_unmasked(<vscale x 2 x i16>
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
- %v = call <vscale x 2 x half> @llvm.vp.sitofp.nxv2f16.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x half> @llvm.vp.sitofp.nxv2f16.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x half> %v
}
@@ -147,7 +147,7 @@ define <vscale x 2 x half> @vsitofp_nxv2f16_nxv2i32_unmasked(<vscale x 2 x i32>
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
- %v = call <vscale x 2 x half> @llvm.vp.sitofp.nxv2f16.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x half> @llvm.vp.sitofp.nxv2f16.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x half> %v
}
@@ -189,7 +189,7 @@ define <vscale x 2 x half> @vsitofp_nxv2f16_nxv2i64_unmasked(<vscale x 2 x i64>
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: ret
- %v = call <vscale x 2 x half> @llvm.vp.sitofp.nxv2f16.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x half> @llvm.vp.sitofp.nxv2f16.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x half> %v
}
@@ -213,7 +213,7 @@ define <vscale x 2 x float> @vsitofp_nxv2f32_nxv2i8_unmasked(<vscale x 2 x i8> %
; CHECK-NEXT: vsext.vf2 v9, v8
; CHECK-NEXT: vfwcvt.f.x.v v8, v9
; CHECK-NEXT: ret
- %v = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x float> %v
}
@@ -237,7 +237,7 @@ define <vscale x 2 x float> @vsitofp_nxv2f32_nxv2i16_unmasked(<vscale x 2 x i16>
; CHECK-NEXT: vfwcvt.f.x.v v9, v8
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %v = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x float> %v
}
@@ -259,7 +259,7 @@ define <vscale x 2 x float> @vsitofp_nxv2f32_nxv2i32_unmasked(<vscale x 2 x i32>
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vfcvt.f.x.v v8, v8
; CHECK-NEXT: ret
- %v = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x float> %v
}
@@ -283,7 +283,7 @@ define <vscale x 2 x float> @vsitofp_nxv2f32_nxv2i64_unmasked(<vscale x 2 x i64>
; CHECK-NEXT: vfncvt.f.x.w v10, v8
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %v = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x float> %v
}
@@ -307,7 +307,7 @@ define <vscale x 2 x double> @vsitofp_nxv2f64_nxv2i8_unmasked(<vscale x 2 x i8>
; CHECK-NEXT: vsext.vf4 v10, v8
; CHECK-NEXT: vfwcvt.f.x.v v8, v10
; CHECK-NEXT: ret
- %v = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x double> %v
}
@@ -331,7 +331,7 @@ define <vscale x 2 x double> @vsitofp_nxv2f64_nxv2i16_unmasked(<vscale x 2 x i16
; CHECK-NEXT: vsext.vf2 v10, v8
; CHECK-NEXT: vfwcvt.f.x.v v8, v10
; CHECK-NEXT: ret
- %v = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x double> %v
}
@@ -355,7 +355,7 @@ define <vscale x 2 x double> @vsitofp_nxv2f64_nxv2i32_unmasked(<vscale x 2 x i32
; CHECK-NEXT: vfwcvt.f.x.v v10, v8
; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
- %v = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x double> %v
}
@@ -377,7 +377,7 @@ define <vscale x 2 x double> @vsitofp_nxv2f64_nxv2i64_unmasked(<vscale x 2 x i64
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vfcvt.f.x.v v8, v8
; CHECK-NEXT: ret
- %v = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x double> %v
}
@@ -486,6 +486,6 @@ define <vscale x 32 x float> @vsitofp_nxv32f32_nxv32i32_unmasked(<vscale x 32 x
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT: vfcvt.f.x.v v8, v8
; CHECK-NEXT: ret
- %v = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> shufflevector (<vscale x 32 x i1> insertelement (<vscale x 32 x i1> undef, i1 true, i32 0), <vscale x 32 x i1> undef, <vscale x 32 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
ret <vscale x 32 x float> %v
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp-mask.ll
index 4eb80e36001a..ad8097631acd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp-mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp-mask.ll
@@ -22,7 +22,7 @@ define <vscale x 2 x i1> @vtrunc_nxv2i1_nxv2i16_unmasked(<vscale x 2 x i16> %a,
; CHECK-NEXT: vand.vi v8, v8, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i1> %v
}
@@ -46,7 +46,7 @@ define <vscale x 2 x i1> @vtrunc_nxv2i1_nxv2i32_unmasked(<vscale x 2 x i32> %a,
; CHECK-NEXT: vand.vi v8, v8, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i1> %v
}
@@ -71,6 +71,6 @@ define <vscale x 2 x i1> @vtrunc_nxv2i1_nxv2i64_unmasked(<vscale x 2 x i64> %a,
; CHECK-NEXT: vand.vi v8, v8, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i1> %v
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
index a624a42b3873..a7b4d6616b7b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
@@ -44,7 +44,7 @@ define <vscale x 2 x i8> @vtrunc_nxv2i8_nxv2i16_unmasked(<vscale x 2 x i16> %a,
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v8, 0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i8> %v
}
@@ -70,7 +70,7 @@ define <vscale x 2 x i8> @vtrunc_nxv2i8_nxv2i32_unmasked(<vscale x 2 x i32> %a,
; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v8, 0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i8> %v
}
@@ -100,7 +100,7 @@ define <vscale x 2 x i8> @vtrunc_nxv2i8_nxv2i64_unmasked(<vscale x 2 x i64> %a,
; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v8, 0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i8> %v
}
@@ -122,7 +122,7 @@ define <vscale x 2 x i16> @vtrunc_nxv2i16_nxv2i32_unmasked(<vscale x 2 x i32> %a
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v8, 0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i16> %v
}
@@ -148,7 +148,7 @@ define <vscale x 2 x i16> @vtrunc_nxv2i16_nxv2i64_unmasked(<vscale x 2 x i64> %a
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v10, 0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i16> %v
}
@@ -205,7 +205,7 @@ define <vscale x 2 x i32> @vtrunc_nxv2i32_nxv2i64_unmasked(<vscale x 2 x i64> %a
; CHECK-NEXT: vnsrl.wi v10, v8, 0
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i32> @llvm.vp.trunc.nxv2i32.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i32> @llvm.vp.trunc.nxv2i32.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i32> %v
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp-mask.ll
index 128bb80971ac..cf4bb161ea75 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp-mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp-mask.ll
@@ -25,7 +25,7 @@ define <vscale x 2 x half> @vuitofp_nxv2f16_nxv2i1_unmasked(<vscale x 2 x i1> %v
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: vfcvt.f.xu.v v8, v8
; CHECK-NEXT: ret
- %v = call <vscale x 2 x half> @llvm.vp.uitofp.nxv2f16.nxv2i1(<vscale x 2 x i1> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x half> @llvm.vp.uitofp.nxv2f16.nxv2i1(<vscale x 2 x i1> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x half> %v
}
@@ -52,7 +52,7 @@ define <vscale x 2 x float> @vuitofp_nxv2f32_nxv2i1_unmasked(<vscale x 2 x i1> %
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: vfcvt.f.xu.v v8, v8
; CHECK-NEXT: ret
- %v = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i1(<vscale x 2 x i1> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i1(<vscale x 2 x i1> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x float> %v
}
@@ -79,6 +79,6 @@ define <vscale x 2 x double> @vuitofp_nxv2f64_nxv2i1_unmasked(<vscale x 2 x i1>
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: vfcvt.f.xu.v v8, v8
; CHECK-NEXT: ret
- %v = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i1(<vscale x 2 x i1> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i1(<vscale x 2 x i1> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x double> %v
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
index 8ca27484d69f..668d9373b81d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
@@ -69,7 +69,7 @@ define <vscale x 2 x half> @vuitofp_nxv2f16_nxv2i8_unmasked(<vscale x 2 x i8> %v
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: ret
- %v = call <vscale x 2 x half> @llvm.vp.uitofp.nxv2f16.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x half> @llvm.vp.uitofp.nxv2f16.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x half> %v
}
@@ -107,7 +107,7 @@ define <vscale x 2 x half> @vuitofp_nxv2f16_nxv2i16_unmasked(<vscale x 2 x i16>
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
- %v = call <vscale x 2 x half> @llvm.vp.uitofp.nxv2f16.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x half> @llvm.vp.uitofp.nxv2f16.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x half> %v
}
@@ -147,7 +147,7 @@ define <vscale x 2 x half> @vuitofp_nxv2f16_nxv2i32_unmasked(<vscale x 2 x i32>
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
; ZVFHMIN-NEXT: ret
- %v = call <vscale x 2 x half> @llvm.vp.uitofp.nxv2f16.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x half> @llvm.vp.uitofp.nxv2f16.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x half> %v
}
@@ -189,7 +189,7 @@ define <vscale x 2 x half> @vuitofp_nxv2f16_nxv2i64_unmasked(<vscale x 2 x i64>
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
; ZVFHMIN-NEXT: ret
- %v = call <vscale x 2 x half> @llvm.vp.uitofp.nxv2f16.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x half> @llvm.vp.uitofp.nxv2f16.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x half> %v
}
@@ -213,7 +213,7 @@ define <vscale x 2 x float> @vuitofp_nxv2f32_nxv2i8_unmasked(<vscale x 2 x i8> %
; CHECK-NEXT: vzext.vf2 v9, v8
; CHECK-NEXT: vfwcvt.f.xu.v v8, v9
; CHECK-NEXT: ret
- %v = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x float> %v
}
@@ -237,7 +237,7 @@ define <vscale x 2 x float> @vuitofp_nxv2f32_nxv2i16_unmasked(<vscale x 2 x i16>
; CHECK-NEXT: vfwcvt.f.xu.v v9, v8
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %v = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x float> %v
}
@@ -259,7 +259,7 @@ define <vscale x 2 x float> @vuitofp_nxv2f32_nxv2i32_unmasked(<vscale x 2 x i32>
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vfcvt.f.xu.v v8, v8
; CHECK-NEXT: ret
- %v = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x float> %v
}
@@ -283,7 +283,7 @@ define <vscale x 2 x float> @vuitofp_nxv2f32_nxv2i64_unmasked(<vscale x 2 x i64>
; CHECK-NEXT: vfncvt.f.xu.w v10, v8
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %v = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x float> %v
}
@@ -307,7 +307,7 @@ define <vscale x 2 x double> @vuitofp_nxv2f64_nxv2i8_unmasked(<vscale x 2 x i8>
; CHECK-NEXT: vzext.vf4 v10, v8
; CHECK-NEXT: vfwcvt.f.xu.v v8, v10
; CHECK-NEXT: ret
- %v = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x double> %v
}
@@ -331,7 +331,7 @@ define <vscale x 2 x double> @vuitofp_nxv2f64_nxv2i16_unmasked(<vscale x 2 x i16
; CHECK-NEXT: vzext.vf2 v10, v8
; CHECK-NEXT: vfwcvt.f.xu.v v8, v10
; CHECK-NEXT: ret
- %v = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x double> %v
}
@@ -355,7 +355,7 @@ define <vscale x 2 x double> @vuitofp_nxv2f64_nxv2i32_unmasked(<vscale x 2 x i32
; CHECK-NEXT: vfwcvt.f.xu.v v10, v8
; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
- %v = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x double> %v
}
@@ -377,7 +377,7 @@ define <vscale x 2 x double> @vuitofp_nxv2f64_nxv2i64_unmasked(<vscale x 2 x i64
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vfcvt.f.xu.v v8, v8
; CHECK-NEXT: ret
- %v = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
ret <vscale x 2 x double> %v
}
@@ -486,6 +486,6 @@ define <vscale x 32 x float> @vuitofp_nxv32f32_nxv32i32_unmasked(<vscale x 32 x
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT: vfcvt.f.xu.v v8, v8
; CHECK-NEXT: ret
- %v = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> shufflevector (<vscale x 32 x i1> insertelement (<vscale x 32 x i1> undef, i1 true, i32 0), <vscale x 32 x i1> undef, <vscale x 32 x i32> zeroinitializer), i32 %evl)
+ %v = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
ret <vscale x 32 x float> %v
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwadd-mask-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwadd-mask-sdnode.ll
index ad7ad991e082..02af09f028fc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwadd-mask-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwadd-mask-sdnode.ll
@@ -12,7 +12,7 @@ define <vscale x 8 x i64> @vwadd_wv_mask_v8i32(<vscale x 8 x i32> %x, <vscale x
; CHECK-NEXT: vwadd.wv v16, v16, v8, v0.t
; CHECK-NEXT: vmv8r.v v8, v16
; CHECK-NEXT: ret
- %mask = icmp slt <vscale x 8 x i32> %x, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 42, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ %mask = icmp slt <vscale x 8 x i32> %x, splat (i32 42)
%a = select <vscale x 8 x i1> %mask, <vscale x 8 x i32> %x, <vscale x 8 x i32> zeroinitializer
%sa = sext <vscale x 8 x i32> %a to <vscale x 8 x i64>
%ret = add <vscale x 8 x i64> %sa, %y
@@ -29,7 +29,7 @@ define <vscale x 8 x i64> @vwaddu_wv_mask_v8i32(<vscale x 8 x i32> %x, <vscale x
; CHECK-NEXT: vwaddu.wv v16, v16, v8, v0.t
; CHECK-NEXT: vmv8r.v v8, v16
; CHECK-NEXT: ret
- %mask = icmp slt <vscale x 8 x i32> %x, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 42, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ %mask = icmp slt <vscale x 8 x i32> %x, splat (i32 42)
%a = select <vscale x 8 x i1> %mask, <vscale x 8 x i32> %x, <vscale x 8 x i32> zeroinitializer
%sa = zext <vscale x 8 x i32> %a to <vscale x 8 x i64>
%ret = add <vscale x 8 x i64> %sa, %y
@@ -47,7 +47,7 @@ define <vscale x 8 x i64> @vwaddu_vv_mask_v8i32(<vscale x 8 x i32> %x, <vscale x
; CHECK-NEXT: vwaddu.vv v16, v8, v12
; CHECK-NEXT: vmv8r.v v8, v16
; CHECK-NEXT: ret
- %mask = icmp slt <vscale x 8 x i32> %x, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 42, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ %mask = icmp slt <vscale x 8 x i32> %x, splat (i32 42)
%a = select <vscale x 8 x i1> %mask, <vscale x 8 x i32> %x, <vscale x 8 x i32> zeroinitializer
%sa = zext <vscale x 8 x i32> %a to <vscale x 8 x i64>
%sy = zext <vscale x 8 x i32> %y to <vscale x 8 x i64>
@@ -65,7 +65,7 @@ define <vscale x 8 x i64> @vwadd_wv_mask_v8i32_commutative(<vscale x 8 x i32> %x
; CHECK-NEXT: vwadd.wv v16, v16, v8, v0.t
; CHECK-NEXT: vmv8r.v v8, v16
; CHECK-NEXT: ret
- %mask = icmp slt <vscale x 8 x i32> %x, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 42, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ %mask = icmp slt <vscale x 8 x i32> %x, splat (i32 42)
%a = select <vscale x 8 x i1> %mask, <vscale x 8 x i32> %x, <vscale x 8 x i32> zeroinitializer
%sa = sext <vscale x 8 x i32> %a to <vscale x 8 x i64>
%ret = add <vscale x 8 x i64> %y, %sa
@@ -82,8 +82,8 @@ define <vscale x 8 x i64> @vwadd_wv_mask_v8i32_nonzero(<vscale x 8 x i32> %x, <v
; CHECK-NEXT: vmerge.vvm v24, v12, v8, v0
; CHECK-NEXT: vwadd.wv v8, v16, v24
; CHECK-NEXT: ret
- %mask = icmp slt <vscale x 8 x i32> %x, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 42, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
- %a = select <vscale x 8 x i1> %mask, <vscale x 8 x i32> %x, <vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ %mask = icmp slt <vscale x 8 x i32> %x, splat (i32 42)
+ %a = select <vscale x 8 x i1> %mask, <vscale x 8 x i32> %x, <vscale x 8 x i32> splat (i32 1)
%sa = sext <vscale x 8 x i32> %a to <vscale x 8 x i64>
%ret = add <vscale x 8 x i64> %sa, %y
ret <vscale x 8 x i64> %ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vwadd-vp.ll
index c3ffee6969d7..a0b7726d3cb5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwadd-vp.ll
@@ -13,9 +13,9 @@ define <vscale x 2 x i32> @vwadd_tu(<vscale x 2 x i8> %arg, <vscale x 2 x i32> %
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
bb:
- %tmp = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i8(<vscale x 2 x i8> %arg, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %arg2)
- %tmp3 = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %arg1, <vscale x 2 x i32> %tmp, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %arg2)
- %tmp4 = call <vscale x 2 x i32> @llvm.vp.merge.nxv2i32(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> %tmp3, <vscale x 2 x i32> %arg1, i32 %arg2)
+ %tmp = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i8(<vscale x 2 x i8> %arg, <vscale x 2 x i1> splat (i1 true), i32 %arg2)
+ %tmp3 = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %arg1, <vscale x 2 x i32> %tmp, <vscale x 2 x i1> splat (i1 true), i32 %arg2)
+ %tmp4 = call <vscale x 2 x i32> @llvm.vp.merge.nxv2i32(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i32> %tmp3, <vscale x 2 x i32> %arg1, i32 %arg2)
ret <vscale x 2 x i32> %tmp4
}
@@ -31,9 +31,9 @@ define <vscale x 2 x i32> @vwaddu_tu(<vscale x 2 x i8> %arg, <vscale x 2 x i32>
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
bb:
- %tmp = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> %arg, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %arg2)
- %tmp3 = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %arg1, <vscale x 2 x i32> %tmp, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 %arg2)
- %tmp4 = call <vscale x 2 x i32> @llvm.vp.merge.nxv2i32(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> %tmp3, <vscale x 2 x i32> %arg1, i32 %arg2)
+ %tmp = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> %arg, <vscale x 2 x i1> splat (i1 true), i32 %arg2)
+ %tmp3 = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %arg1, <vscale x 2 x i32> %tmp, <vscale x 2 x i1> splat (i1 true), i32 %arg2)
+ %tmp4 = call <vscale x 2 x i32> @llvm.vp.merge.nxv2i32(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i32> %tmp3, <vscale x 2 x i32> %arg1, i32 %arg2)
ret <vscale x 2 x i32> %tmp4
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll
index 8a0af38f724c..770bb566c764 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll
@@ -236,7 +236,7 @@ define <vscale x 2 x i64> @vwsll_vi_nxv2i64(<vscale x 2 x i32> %a) {
; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <vscale x 2 x i32> %a to <vscale x 2 x i64>
- %z = shl <vscale x 2 x i64> %x, shufflevector(<vscale x 2 x i64> insertelement(<vscale x 2 x i64> poison, i64 2, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %z = shl <vscale x 2 x i64> %x, splat (i64 2)
ret <vscale x 2 x i64> %z
}
@@ -444,7 +444,7 @@ define <vscale x 4 x i32> @vwsll_vi_nxv4i32(<vscale x 4 x i16> %a) {
; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <vscale x 4 x i16> %a to <vscale x 4 x i32>
- %z = shl <vscale x 4 x i32> %x, shufflevector(<vscale x 4 x i32> insertelement(<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %z = shl <vscale x 4 x i32> %x, splat (i32 2)
ret <vscale x 4 x i32> %z
}
@@ -624,6 +624,6 @@ define <vscale x 8 x i16> @vwsll_vi_nxv8i16(<vscale x 8 x i8> %a) {
; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <vscale x 8 x i8> %a to <vscale x 8 x i16>
- %z = shl <vscale x 8 x i16> %x, shufflevector(<vscale x 8 x i16> insertelement(<vscale x 8 x i16> poison, i16 2, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %z = shl <vscale x 8 x i16> %x, splat (i16 2)
ret <vscale x 8 x i16> %z
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsll-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vwsll-vp.ll
index e7ac8ee17564..bb3076b3a945 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwsll-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwsll-vp.ll
@@ -263,7 +263,7 @@ define <vscale x 2 x i64> @vwsll_vi_nxv2i64(<vscale x 2 x i32> %a, <vscale x 2 x
; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <vscale x 2 x i32> %a to <vscale x 2 x i64>
- %z = call <vscale x 2 x i64> @llvm.vp.shl.nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> shufflevector(<vscale x 2 x i64> insertelement(<vscale x 2 x i64> poison, i64 2, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
+ %z = call <vscale x 2 x i64> @llvm.vp.shl.nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> splat (i64 2), <vscale x 2 x i1> %m, i32 %vl)
ret <vscale x 2 x i64> %z
}
@@ -497,7 +497,7 @@ define <vscale x 4 x i32> @vwsll_vi_nxv4i32(<vscale x 4 x i16> %a, <vscale x 4 x
; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <vscale x 4 x i16> %a to <vscale x 4 x i32>
- %z = call <vscale x 4 x i32> @llvm.vp.shl.nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> shufflevector(<vscale x 4 x i32> insertelement(<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> %m, i32 %vl)
+ %z = call <vscale x 4 x i32> @llvm.vp.shl.nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> splat (i32 2), <vscale x 4 x i1> %m, i32 %vl)
ret <vscale x 4 x i32> %z
}
@@ -703,6 +703,6 @@ define <vscale x 8 x i16> @vwsll_vi_nxv8i16(<vscale x 8 x i8> %a, <vscale x 8 x
; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <vscale x 8 x i8> %a to <vscale x 8 x i16>
- %z = call <vscale x 8 x i16> @llvm.vp.shl.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> shufflevector(<vscale x 8 x i16> insertelement(<vscale x 8 x i16> poison, i16 2, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> %m, i32 %vl)
+ %z = call <vscale x 8 x i16> @llvm.vp.shl.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> splat (i16 2), <vscale x 8 x i1> %m, i32 %vl)
ret <vscale x 8 x i16> %z
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll
index 0cc0063c1d41..04ece9d94880 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll
@@ -12,7 +12,7 @@ define <vscale x 8 x i64> @vwsub_wv_mask_v8i32(<vscale x 8 x i32> %x, <vscale x
; CHECK-NEXT: vwsub.wv v16, v16, v8, v0.t
; CHECK-NEXT: vmv8r.v v8, v16
; CHECK-NEXT: ret
- %mask = icmp slt <vscale x 8 x i32> %x, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 42, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ %mask = icmp slt <vscale x 8 x i32> %x, splat (i32 42)
%a = select <vscale x 8 x i1> %mask, <vscale x 8 x i32> %x, <vscale x 8 x i32> zeroinitializer
%sa = sext <vscale x 8 x i32> %a to <vscale x 8 x i64>
%ret = sub <vscale x 8 x i64> %y, %sa
@@ -29,7 +29,7 @@ define <vscale x 8 x i64> @vwsubu_wv_mask_v8i32(<vscale x 8 x i32> %x, <vscale x
; CHECK-NEXT: vwsubu.wv v16, v16, v8, v0.t
; CHECK-NEXT: vmv8r.v v8, v16
; CHECK-NEXT: ret
- %mask = icmp slt <vscale x 8 x i32> %x, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 42, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ %mask = icmp slt <vscale x 8 x i32> %x, splat (i32 42)
%a = select <vscale x 8 x i1> %mask, <vscale x 8 x i32> %x, <vscale x 8 x i32> zeroinitializer
%sa = zext <vscale x 8 x i32> %a to <vscale x 8 x i64>
%ret = sub <vscale x 8 x i64> %y, %sa
@@ -47,7 +47,7 @@ define <vscale x 8 x i64> @vwsubu_vv_mask_v8i32(<vscale x 8 x i32> %x, <vscale x
; CHECK-NEXT: vwsubu.vv v16, v12, v8
; CHECK-NEXT: vmv8r.v v8, v16
; CHECK-NEXT: ret
- %mask = icmp slt <vscale x 8 x i32> %x, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 42, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ %mask = icmp slt <vscale x 8 x i32> %x, splat (i32 42)
%a = select <vscale x 8 x i1> %mask, <vscale x 8 x i32> %x, <vscale x 8 x i32> zeroinitializer
%sa = zext <vscale x 8 x i32> %a to <vscale x 8 x i64>
%sy = zext <vscale x 8 x i32> %y to <vscale x 8 x i64>
@@ -65,8 +65,8 @@ define <vscale x 8 x i64> @vwsub_wv_mask_v8i32_nonzero(<vscale x 8 x i32> %x, <v
; CHECK-NEXT: vmerge.vvm v24, v12, v8, v0
; CHECK-NEXT: vwsub.wv v8, v16, v24
; CHECK-NEXT: ret
- %mask = icmp slt <vscale x 8 x i32> %x, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 42, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
- %a = select <vscale x 8 x i1> %mask, <vscale x 8 x i32> %x, <vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+ %mask = icmp slt <vscale x 8 x i32> %x, splat (i32 42)
+ %a = select <vscale x 8 x i1> %mask, <vscale x 8 x i32> %x, <vscale x 8 x i32> splat (i32 1)
%sa = sext <vscale x 8 x i32> %a to <vscale x 8 x i64>
%ret = sub <vscale x 8 x i64> %y, %sa
ret <vscale x 8 x i64> %ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vzext-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vzext-vp-mask.ll
index 41668d806ec7..e14236c0258c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vzext-vp-mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vzext-vp-mask.ll
@@ -22,7 +22,7 @@ define <vscale x 2 x i16> @vzext_nxv2i1_nxv2i16_unmasked(<vscale x 2 x i1> %a, i
; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i16> %v
}
@@ -46,7 +46,7 @@ define <vscale x 2 x i32> @vzext_nxv2i1_nxv2i32_unmasked(<vscale x 2 x i1> %a, i
; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i32> %v
}
@@ -70,6 +70,6 @@ define <vscale x 2 x i64> @vzext_nxv2i1_nxv2i64_unmasked(<vscale x 2 x i1> %a, i
; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i64> %v
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
index 365c221c9b9f..400f89b1ef77 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
@@ -22,7 +22,7 @@ define <vscale x 2 x i16> @vzext_nxv2i8_nxv2i16_unmasked(<vscale x 2 x i8> %a, i
; CHECK-NEXT: vzext.vf2 v9, v8
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i16> %v
}
@@ -46,7 +46,7 @@ define <vscale x 2 x i32> @vzext_nxv2i8_nxv2i32_unmasked(<vscale x 2 x i8> %a, i
; CHECK-NEXT: vzext.vf4 v9, v8
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i32> %v
}
@@ -70,7 +70,7 @@ define <vscale x 2 x i64> @vzext_nxv2i8_nxv2i64_unmasked(<vscale x 2 x i8> %a, i
; CHECK-NEXT: vzext.vf8 v10, v8
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i64> %v
}
@@ -94,7 +94,7 @@ define <vscale x 2 x i32> @vzext_nxv2i16_nxv2i32_unmasked(<vscale x 2 x i16> %a,
; CHECK-NEXT: vzext.vf2 v9, v8
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i32> %v
}
@@ -118,7 +118,7 @@ define <vscale x 2 x i64> @vzext_nxv2i16_nxv2i64_unmasked(<vscale x 2 x i16> %a,
; CHECK-NEXT: vzext.vf4 v10, v8
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i64> %v
}
@@ -142,7 +142,7 @@ define <vscale x 2 x i64> @vzext_nxv2i32_nxv2i64_unmasked(<vscale x 2 x i32> %a,
; CHECK-NEXT: vzext.vf2 v10, v8
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %v = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i1> splat (i1 true), i32 %vl)
ret <vscale x 2 x i64> %v
}
@@ -195,6 +195,6 @@ define <vscale x 32 x i32> @vzext_nxv32i8_nxv32i32_unmasked(<vscale x 32 x i8> %
; CHECK-NEXT: vzext.vf4 v24, v8
; CHECK-NEXT: vmv.v.v v8, v24
; CHECK-NEXT: ret
- %v = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i1> shufflevector (<vscale x 32 x i1> insertelement (<vscale x 32 x i1> undef, i1 true, i32 0), <vscale x 32 x i1> undef, <vscale x 32 x i32> zeroinitializer), i32 %vl)
+ %v = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i1> splat (i1 true), i32 %vl)
ret <vscale x 32 x i32> %v
}
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr.ll
new file mode 100644
index 000000000000..897aab70852d
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr.ll
@@ -0,0 +1,54 @@
+; Modified from: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/test/extensions/INTEL/SPV_INTEL_variable_length_array/basic.ll
+
+; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-extensions=SPV_INTEL_variable_length_array %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown --spirv-extensions=SPV_INTEL_variable_length_array %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-ERROR: LLVM ERROR: array allocation: this instruction requires the following SPIR-V extension: SPV_INTEL_variable_length_array
+
+; CHECK-SPIRV: Capability VariableLengthArrayINTEL
+; CHECK-SPIRV: Extension "SPV_INTEL_variable_length_array"
+
+; CHECK-SPIRV-DAG: OpName %[[Len:.*]] "a"
+; CHECK-SPIRV-DAG: %[[Long:.*]] = OpTypeInt 64 0
+; CHECK-SPIRV-DAG: %[[Int:.*]] = OpTypeInt 32 0
+; CHECK-SPIRV-DAG: %[[Char:.*]] = OpTypeInt 8 0
+; CHECK-SPIRV-DAG: %[[CharPtr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[Char]]
+; CHECK-SPIRV-DAG: %[[IntPtr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[Int]]
+; CHECK-SPIRV: %[[Len]] = OpFunctionParameter %[[Long:.*]]
+; CHECK-SPIRV: %[[SavedMem1:.*]] = OpSaveMemoryINTEL %[[CharPtr]]
+; CHECK-SPIRV: OpVariableLengthArrayINTEL %[[IntPtr]] %[[Len]]
+; CHECK-SPIRV: OpRestoreMemoryINTEL %[[SavedMem1]]
+; CHECK-SPIRV: %[[SavedMem2:.*]] = OpSaveMemoryINTEL %[[CharPtr]]
+; CHECK-SPIRV: OpVariableLengthArrayINTEL %[[IntPtr]] %[[Len]]
+; CHECK-SPIRV: OpRestoreMemoryINTEL %[[SavedMem2]]
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir"
+
+define dso_local spir_func i32 @foo(i64 %a, i64 %b) {
+entry:
+ %vector1 = alloca [42 x i32], align 16
+ call void @llvm.lifetime.start.p0(i64 168, ptr nonnull %vector1)
+ %stack1 = call ptr @llvm.stacksave.p0()
+ %vla = alloca i32, i64 %a, align 16
+ %arrayidx = getelementptr inbounds i32, ptr %vla, i64 %b
+ %elem1 = load i32, ptr %arrayidx, align 4
+ call void @llvm.stackrestore.p0(ptr %stack1)
+ %stack2 = call ptr @llvm.stacksave.p0()
+ %vla2 = alloca i32, i64 %a, align 16
+ %arrayidx3 = getelementptr inbounds [42 x i32], ptr %vector1, i64 0, i64 %b
+ %elemt = load i32, ptr %arrayidx3, align 4
+ %add = add nsw i32 %elemt, %elem1
+ %arrayidx4 = getelementptr inbounds i32, ptr %vla2, i64 %b
+ %elem2 = load i32, ptr %arrayidx4, align 4
+ %add5 = add nsw i32 %add, %elem2
+ call void @llvm.stackrestore.p0(ptr %stack2)
+ call void @llvm.lifetime.end.p0(i64 168, ptr nonnull %vector1)
+ ret i32 %add5
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
+declare ptr @llvm.stacksave.p0()
+declare void @llvm.stackrestore.p0(ptr)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr_spec_const.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr_spec_const.ll
new file mode 100644
index 000000000000..fbac43e51f39
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr_spec_const.ll
@@ -0,0 +1,110 @@
+; Modified from: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/test/extensions/INTEL/SPV_INTEL_variable_length_array/vla_spec_const.ll
+
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-extensions=SPV_INTEL_variable_length_array %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown --spirv-extensions=SPV_INTEL_variable_length_array %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-SPIRV: Capability VariableLengthArrayINTEL
+; CHECK-SPIRV: Extension "SPV_INTEL_variable_length_array"
+; CHECK-SPIRV: OpDecorate %[[SpecConst:.*]] SpecId 0
+; CHECK-SPIRV-DAG: %[[Long:.*]] = OpTypeInt 64 0
+; CHECK-SPIRV-DAG: %[[Int:.*]] = OpTypeInt 32 0
+; CHECK-SPIRV-DAG: %[[IntPtr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[Int]]
+; CHECK-SPIRV: %[[SpecConst]] = OpSpecConstant %[[Long]]
+; CHECK-SPIRV-LABEL: FunctionEnd
+; CHECK-SPIRV: %[[SpecConstVal:.*]] = OpFunctionCall %[[Long]]
+; CHECK-SPIRV: OpSaveMemoryINTEL
+; CHECK-SPIRV: OpVariableLengthArrayINTEL %[[IntPtr]] %[[SpecConstVal]]
+; CHECK-SPIRV: OpRestoreMemoryINTEL
+
+; CHECK-SPIRV: OpFunction %[[Long]]
+; CHECK-SPIRV: ReturnValue %[[SpecConst]]
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
+target triple = "spir64-unknown-linux"
+
+%"class._ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEEUlvE_.anon" = type { %"class._ZTSN2cl4sycl12experimental13spec_constantIm13MyUInt64ConstEE.cl::sycl::experimental::spec_constant" }
+%"class._ZTSN2cl4sycl12experimental13spec_constantIm13MyUInt64ConstEE.cl::sycl::experimental::spec_constant" = type { i8 }
+
+$_ZTS17SpecializedKernel = comdat any
+
+$_ZNK2cl4sycl12experimental13spec_constantIm13MyUInt64ConstE3getEv = comdat any
+
+; Function Attrs: norecurse
+define weak_odr dso_local spir_kernel void @_ZTS17SpecializedKernel() #0 comdat !kernel_arg_addr_space !4 !kernel_arg_access_qual !4 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !4 {
+entry:
+ %p = alloca %"class._ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEEUlvE_.anon", align 1
+ call void @llvm.lifetime.start.p0(i64 1, ptr %p) #4
+ %p4 = addrspacecast ptr %p to ptr addrspace(4)
+ call spir_func void @"_ZZZ4mainENK3$_0clERN2cl4sycl7handlerEENKUlvE_clEv"(ptr addrspace(4) %p4)
+ call void @llvm.lifetime.end.p0(i64 1, ptr %p) #4
+ ret void
+}
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+
+; Function Attrs: inlinehint norecurse
+define internal spir_func void @"_ZZZ4mainENK3$_0clERN2cl4sycl7handlerEENKUlvE_clEv"(ptr addrspace(4) %this) #2 align 2 {
+entry:
+ %this.addr = alloca ptr addrspace(4), align 8
+ %saved_stack = alloca ptr, align 8
+ %__vla_expr0 = alloca i64, align 8
+ store ptr addrspace(4) %this, ptr %this.addr, align 8, !tbaa !5
+ %this1 = load ptr addrspace(4), ptr %this.addr, align 8
+ %call = call spir_func i64 @_ZNK2cl4sycl12experimental13spec_constantIm13MyUInt64ConstE3getEv(ptr addrspace(4) %this1)
+ %p = call ptr @llvm.stacksave.p0()
+ store ptr %p, ptr %saved_stack, align 8
+ %vla = alloca i32, i64 %call, align 4
+ store i64 %call, ptr %__vla_expr0, align 8
+ store i32 42, ptr %vla, align 4, !tbaa !9
+ %torestore = load ptr, ptr %saved_stack, align 8
+ call void @llvm.stackrestore.p0(ptr %torestore)
+ ret void
+}
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+
+; Function Attrs: norecurse
+define linkonce_odr dso_local spir_func i64 @_ZNK2cl4sycl12experimental13spec_constantIm13MyUInt64ConstE3getEv(ptr addrspace(4) %this) #3 comdat align 2 {
+entry:
+ %this.addr = alloca ptr addrspace(4), align 8
+ %TName = alloca ptr addrspace(4), align 8
+ store ptr addrspace(4) %this, ptr %this.addr, align 8, !tbaa !5
+ call void @llvm.lifetime.start.p0(i64 8, ptr %TName) #4
+ %p = call i64 @_Z20__spirv_SpecConstantix(i32 0, i64 0), !SYCL_SPEC_CONST_SYM_ID !11
+ call void @llvm.lifetime.end.p0(i64 8, ptr %TName) #4
+ ret i64 %p
+}
+
+; Function Attrs: nounwind
+declare ptr @llvm.stacksave.p0() #4
+
+; Function Attrs: nounwind
+declare void @llvm.stackrestore.p0(ptr) #4
+
+declare i64 @_Z20__spirv_SpecConstantix(i32, i64)
+
+attributes #0 = { norecurse "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "sycl-module-id"="/work/intel/vla_spec_const.cpp" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind willreturn }
+attributes #2 = { inlinehint norecurse "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { norecurse "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!opencl.spir.version = !{!1}
+!spirv.Source = !{!2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{i32 4, i32 100000}
+!3 = !{!"clang version 12.0.0"}
+!4 = !{}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"any pointer", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C++ TBAA"}
+!9 = !{!10, !10, i64 0}
+!10 = !{!"int", !7, i64 0}
+!11 = !{!"_ZTS13MyUInt64Const", i32 0}
diff --git a/llvm/test/CodeGen/SPIRV/freeze.ll b/llvm/test/CodeGen/SPIRV/freeze.ll
new file mode 100644
index 000000000000..fe4335722fe8
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/freeze.ll
@@ -0,0 +1,37 @@
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: OpName %[[Arg1:.*]] "arg1"
+; CHECK: OpName %[[Arg2:.*]] "arg2"
+; CHECK: OpName %[[NotAStaticPoison:.*]] "poison1"
+; CHECK: OpName %[[NotAStaticPoison]] "nil0"
+; CHECK: OpName %[[StaticPoisonIntFreeze:.*]] "nil1"
+; CHECK: OpName %[[StaticPoisonFloatFreeze:.*]] "nil2"
+; CHECK: OpName %[[Arg1]] "val1"
+; CHECK: OpName %[[Const100:.*]] "val2"
+; CHECK: OpName %[[Const100]] "val3"
+; CHECK: OpDecorate
+; CHECK-DAG: %[[FloatTy:.*]] = OpTypeFloat 32
+; CHECK-DAG: %[[ShortTy:.*]] = OpTypeInt 16 0
+; CHECK-DAG: %[[IntTy:.*]] = OpTypeInt 32 0
+; CHECK-DAG: %[[Undef:.*]] = OpUndef %[[ShortTy]]
+; CHECK-DAG: %[[Const100]] = OpConstant %[[IntTy]] 100
+; CHECK-DAG: %[[StaticPoisonIntFreeze]] = OpConstantNull %[[IntTy]]
+; CHECK-DAG: %[[StaticPoisonFloatFreeze]] = OpConstantNull %[[FloatTy]]
+; CHECK: %[[Arg1]] = OpFunctionParameter %[[FloatTy]]
+; CHECK: %[[NotAStaticPoison]] = OpIAdd %[[ShortTy]] %[[Arg2]] %[[Undef]]
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
+target triple = "spir64-unknown-unknown"
+
+define spir_func void @foo(float %arg1, i16 %arg2) {
+entry:
+ %poison1 = add i16 %arg2, undef
+ %nil0 = freeze i16 %poison1
+ %nil1 = freeze i32 undef
+ %nil2 = freeze float poison
+ %val1 = freeze float %arg1
+ %val2 = freeze i32 100
+ %val3 = freeze i32 %val2
+ ret void
+}
diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll
index daf46c6eef02..4f33439db770 100644
--- a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll
+++ b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll
@@ -1,5 +1,5 @@
-; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -mattr=+multivalue -wasm-emit-multivalue 2>&1 | FileCheck %s --check-prefix=EH
-; RUN: not --crash llc < %s -enable-emscripten-sjlj -mattr=+multivalue 2>&1 -wasm-emit-multivalue | FileCheck %s --check-prefix=SJLJ
+; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -mattr=+multivalue 2>&1 | FileCheck %s --check-prefix=EH
+; RUN: not --crash llc < %s -enable-emscripten-sjlj -mattr=+multivalue 2>&1 | FileCheck %s --check-prefix=SJLJ
; Currently multivalue returning functions are not supported in Emscripten EH /
; SjLj. Make sure they error out.
diff --git a/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir b/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir
index 4fadbd5f07e6..4b4661b14466 100644
--- a/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir
+++ b/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=wasm32-unknown-unknown -mattr=+multivalue -wasm-emit-multivalue -run-pass=wasm-reg-stackify -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=wasm32-unknown-unknown -mattr=+multivalue -run-pass=wasm-reg-stackify -verify-machineinstrs %s -o - | FileCheck %s
--- |
target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20"
diff --git a/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll b/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll
index f4f93ac2f30c..52a8c686824d 100644
--- a/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll
+++ b/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; NOTE: Test functions have been generated by multivalue-stackify.py.
-; RUN: llc < %s -verify-machineinstrs -mattr=+multivalue -wasm-emit-multivalue | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mattr=+multivalue | FileCheck %s
; Test that the multivalue stackification works
diff --git a/llvm/test/CodeGen/WebAssembly/multivalue.ll b/llvm/test/CodeGen/WebAssembly/multivalue.ll
index 846691e5ff0c..675009c8f3e5 100644
--- a/llvm/test/CodeGen/WebAssembly/multivalue.ll
+++ b/llvm/test/CodeGen/WebAssembly/multivalue.ll
@@ -1,8 +1,7 @@
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+reference-types,+multivalue,+tail-call -wasm-emit-multivalue | FileCheck --check-prefix REF %s
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | FileCheck %s --check-prefix REGS
-; RUN: llc < %s --filetype=obj -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | obj2yaml | FileCheck %s --check-prefix OBJ
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s --check-prefix NO-MULTIVALUE
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+reference-types,+multivalue,+tail-call | FileCheck --check-prefix REF %s
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s --check-prefix REGS
+; RUN: llc < %s --filetype=obj -mcpu=mvp -mattr=+multivalue,+tail-call | obj2yaml | FileCheck %s --check-prefix OBJ
; Test that the multivalue calls, returns, function types, and block
; types work as expected.
@@ -20,7 +19,6 @@ declare void @use_i64(i64)
; CHECK-NEXT: i32.const 42{{$}}
; CHECK-NEXT: i64.const 42{{$}}
; CHECK-NEXT: end_function{{$}}
-; NO-MULTIVALUE-NOT: .functype pair_const () -> (i32, i64)
define %pair @pair_const() {
ret %pair { i32 42, i64 42 }
}
diff --git a/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll b/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll
index 7bf37b59353a..47c5ae7b457d 100644
--- a/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll
+++ b/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -verify-machineinstrs -mcpu=mvp -mattr=+multivalue -wasm-emit-multivalue | FileCheck %s --check-prefix=MULTIVALUE
+; RUN: llc < %s -verify-machineinstrs -mcpu=mvp -mattr=+multivalue | FileCheck %s --check-prefix=MULTIVALUE
; RUN: llc < %s -verify-machineinstrs -mcpu=mvp | FileCheck %s --check-prefix=NO_MULTIVALUE
; Test libcall signatures when multivalue is enabled and disabled
diff --git a/llvm/test/CodeGen/WebAssembly/ref-type-mem2local.ll b/llvm/test/CodeGen/WebAssembly/ref-type-mem2local.ll
new file mode 100644
index 000000000000..a38243ca218c
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/ref-type-mem2local.ll
@@ -0,0 +1,57 @@
+; RUN: opt < %s -wasm-ref-type-mem2local -S | FileCheck %s
+
+target triple = "wasm32-unknown-unknown"
+
+%externref = type ptr addrspace(10)
+%funcref = type ptr addrspace(20)
+
+declare %externref @get_externref()
+declare %funcref @get_funcref()
+declare i32 @get_i32()
+declare void @take_externref(%externref)
+declare void @take_funcref(%funcref)
+declare void @take_i32(i32)
+
+; Reference type allocas should be moved to addrspace(1)
+; CHECK-LABEL: @test_ref_type_mem2local
+define void @test_ref_type_mem2local() {
+entry:
+ %alloc.externref = alloca %externref, align 1
+ %eref = call %externref @get_externref()
+ store %externref %eref, ptr %alloc.externref, align 1
+ %eref.loaded = load %externref, ptr %alloc.externref, align 1
+ call void @take_externref(%externref %eref.loaded)
+ ; CHECK: %alloc.externref.var = alloca ptr addrspace(10), align 1, addrspace(1)
+ ; CHECK-NEXT: %eref = call ptr addrspace(10) @get_externref()
+ ; CHECK-NEXT: store ptr addrspace(10) %eref, ptr addrspace(1) %alloc.externref.var, align 1
+ ; CHECK-NEXT: %eref.loaded = load ptr addrspace(10), ptr addrspace(1) %alloc.externref.var, align 1
+ ; CHECK-NEXT: call void @take_externref(ptr addrspace(10) %eref.loaded)
+
+ %alloc.funcref = alloca %funcref, align 1
+ %fref = call %funcref @get_funcref()
+ store %funcref %fref, ptr %alloc.funcref, align 1
+ %fref.loaded = load %funcref, ptr %alloc.funcref, align 1
+ call void @take_funcref(%funcref %fref.loaded)
+ ; CHECK-NEXT: %alloc.funcref.var = alloca ptr addrspace(20), align 1, addrspace(1)
+ ; CHECK-NEXT: %fref = call ptr addrspace(20) @get_funcref()
+ ; CHECK-NEXT: store ptr addrspace(20) %fref, ptr addrspace(1) %alloc.funcref.var, align 1
+ ; CHECK-NEXT: %fref.loaded = load ptr addrspace(20), ptr addrspace(1) %alloc.funcref.var, align 1
+ ; CHECK-NEXT: call void @take_funcref(ptr addrspace(20) %fref.loaded)
+
+ ret void
+}
+
+; POD type allocas should stay the same
+; CHECK-LABEL: @test_pod_type
+define void @test_pod_type() {
+entry:
+ %alloc.i32 = alloca i32
+ %i32 = call i32 @get_i32()
+ store i32 %i32, ptr %alloc.i32
+ %i32.loaded = load i32, ptr %alloc.i32
+ call void @take_i32(i32 %i32.loaded)
+ ; CHECK: %alloc.i32 = alloca i32, align 4{{$}}
+ ; CHECK-NOT: addrspace(1)
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll b/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll
index 7039e33c0093..cbb5bd09c239 100644
--- a/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll
@@ -160,6 +160,53 @@ define <16 x i8> @demandedelts_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>
ret <16 x i8> %5
}
+define <4 x float> @demandedbits_sitofp_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) {
+; SSE-LABEL: demandedbits_sitofp_blendvps:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps %xmm0, %xmm3
+; SSE-NEXT: cvtdq2ps %xmm2, %xmm0
+; SSE-NEXT: blendvps %xmm0, %xmm1, %xmm3
+; SSE-NEXT: movaps %xmm3, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: demandedbits_sitofp_blendvps:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvtdq2ps %xmm2, %xmm2
+; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cvt = sitofp <4 x i32> %a2 to <4 x float>
+ %sel = tail call noundef <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %cvt)
+ ret <4 x float> %sel
+}
+
+define <4 x float> @demandedbits_uitofp_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) {
+; SSE-LABEL: demandedbits_uitofp_blendvps:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps %xmm0, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm0 = [1258291200,1258291200,1258291200,1258291200]
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
+; SSE-NEXT: psrld $16, %xmm2
+; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2],mem[3],xmm2[4],mem[5],xmm2[6],mem[7]
+; SSE-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE-NEXT: addps %xmm2, %xmm0
+; SSE-NEXT: blendvps %xmm0, %xmm1, %xmm3
+; SSE-NEXT: movaps %xmm3, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: demandedbits_uitofp_blendvps:
+; AVX: # %bb.0:
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],mem[1],xmm2[2],mem[3],xmm2[4],mem[5],xmm2[6],mem[7]
+; AVX-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2],mem[3],xmm2[4],mem[5],xmm2[6],mem[7]
+; AVX-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vaddps %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cvt = uitofp <4 x i32> %a2 to <4 x float>
+ %sel = tail call noundef <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %cvt)
+ ret <4 x float> %sel
+}
+
define <2 x i64> @demandedbits_blendvpd(i64 %a0, i64 %a2, <2 x double> %a3) {
; SSE-LABEL: demandedbits_blendvpd:
; SSE: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll
index 93dbe99882fe..e7415dcf229f 100644
--- a/llvm/test/CodeGen/X86/extract-concat.ll
+++ b/llvm/test/CodeGen/X86/extract-concat.ll
@@ -9,22 +9,17 @@ define void @foo(<4 x float> %in, ptr %out) {
; SSE2-LABEL: foo:
; SSE2: # %bb.0:
; SSE2-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT: shll $8, %ecx
-; SSE2-NEXT: orl %eax, %ecx
-; SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: orl %ecx, %eax
-; SSE2-NEXT: orl $-16777216, %eax # imm = 0xFF000000
-; SSE2-NEXT: movl %eax, (%rdi)
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movd %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: foo:
; SSE42: # %bb.0:
; SSE42-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movl $255, %eax
; SSE42-NEXT: pinsrb $3, %eax, %xmm0
; SSE42-NEXT: movd %xmm0, (%rdi)
@@ -33,7 +28,7 @@ define void @foo(<4 x float> %in, ptr %out) {
; AVX-LABEL: foo:
; AVX: # %bb.0:
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: movl $255, %eax
; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, (%rdi)
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 2001fddfaac4..5f326b6d6998 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -1368,49 +1368,21 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind {
define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
; CHECK-SSE-LABEL: fdiv_pow_shl_cnt:
; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: movq %rdi, %rcx
-; CHECK-SSE-NEXT: andb $31, %cl
-; CHECK-SSE-NEXT: movl $8, %eax
-; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-SSE-NEXT: shlq %cl, %rax
-; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
-; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [-5.0E-1,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-SSE-NEXT: divss %xmm1, %xmm0
+; CHECK-SSE-NEXT: andl $31, %edi
+; CHECK-SSE-NEXT: shll $23, %edi
+; CHECK-SSE-NEXT: movl $-1115684864, %eax # imm = 0xBD800000
+; CHECK-SSE-NEXT: subl %edi, %eax
+; CHECK-SSE-NEXT: movd %eax, %xmm0
; CHECK-SSE-NEXT: retq
;
-; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: movq %rdi, %rcx
-; CHECK-AVX2-NEXT: andb $31, %cl
-; CHECK-AVX2-NEXT: movl $8, %eax
-; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-AVX2-NEXT: shlq %cl, %rax
-; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [-5.0E-1,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx
-; CHECK-NO-FASTFMA-NEXT: andb $31, %cl
-; CHECK-NO-FASTFMA-NEXT: movl $8, %eax
-; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [-5.0E-1,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fdiv_pow_shl_cnt:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: andb $31, %dil
-; CHECK-FMA-NEXT: movl $8, %eax
-; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax
-; CHECK-FMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [-5.0E-1,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-AVX-LABEL: fdiv_pow_shl_cnt:
+; CHECK-AVX: # %bb.0:
+; CHECK-AVX-NEXT: andl $31, %edi
+; CHECK-AVX-NEXT: shll $23, %edi
+; CHECK-AVX-NEXT: movl $-1115684864, %eax # imm = 0xBD800000
+; CHECK-AVX-NEXT: subl %edi, %eax
+; CHECK-AVX-NEXT: vmovd %eax, %xmm0
+; CHECK-AVX-NEXT: retq
%cnt = and i64 %cnt_in, 31
%shl = shl i64 8, %cnt
%conv = sitofp i64 %shl to float
diff --git a/llvm/test/CodeGen/X86/kshift.ll b/llvm/test/CodeGen/X86/kshift.ll
index 0acf82f5a144..f4efacc1946c 100644
--- a/llvm/test/CodeGen/X86/kshift.ll
+++ b/llvm/test/CodeGen/X86/kshift.ll
@@ -270,11 +270,10 @@ define i64 @kshiftl_v64i1_63(<64 x i8> %x, <64 x i8> %y) {
; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $15, %k0, %k1
-; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; KNL-NEXT: vpcmpeqb %ymm0, %ymm1, %ymm0
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
@@ -564,14 +563,13 @@ define i64 @kshiftr_v64i1_63(<64 x i8> %x, <64 x i8> %y) {
; KNL-LABEL: kshiftr_v64i1_63:
; KNL: # %bb.0:
; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k1
-; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; KNL-NEXT: vpcmpeqb %xmm0, %xmm1, %xmm0
+; KNL-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
diff --git a/llvm/test/CodeGen/X86/pr46455.ll b/llvm/test/CodeGen/X86/pr46455.ll
index 092e417c812e..3d1b04f74357 100644
--- a/llvm/test/CodeGen/X86/pr46455.ll
+++ b/llvm/test/CodeGen/X86/pr46455.ll
@@ -4,10 +4,10 @@
define void @EntryModule(ptr %buffer_table) {
; CHECK-LABEL: EntryModule:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: movq (%rdi), %rax
; CHECK-NEXT: movq 24(%rdi), %rcx
-; CHECK-NEXT: vcmpneqps (%rax), %ymm0, %ymm0
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcmpneqps (%rax), %xmm0, %xmm0
; CHECK-NEXT: vpsrld $31, %xmm0, %xmm1
; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
@@ -16,7 +16,6 @@ define void @EntryModule(ptr %buffer_table) {
; CHECK-NEXT: vpsubd %xmm0, %xmm2, %xmm0
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, (%rcx)
-; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
entry:
%i1 = load ptr, ptr %buffer_table, align 8
diff --git a/llvm/test/CodeGen/X86/setcc-lowering.ll b/llvm/test/CodeGen/X86/setcc-lowering.ll
index 705e48ca4c9c..90e5c279d2e1 100644
--- a/llvm/test/CodeGen/X86/setcc-lowering.ll
+++ b/llvm/test/CodeGen/X86/setcc-lowering.ll
@@ -1,22 +1,35 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
-; RUN: llc -mtriple=i386-unknown-linux-gnu -mcpu=knl < %s | FileCheck %s --check-prefix=KNL-32
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx2 < %s | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mcpu=knl < %s | FileCheck %s --check-prefixes=AVX,KNL-32
; Verify that we don't crash during codegen due to a wrong lowering
; of a setcc node with illegal operand types and return type.
define <8 x i16> @pr25080(<8 x i32> %a) {
-; AVX-LABEL: pr25080:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: pr25080:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: pr25080:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8388607,8388607,8388607,8388607]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; KNL-32-LABEL: pr25080:
; KNL-32: # %bb.0: # %entry
@@ -38,23 +51,40 @@ entry:
}
define void @pr26232(i64 %a, <16 x i1> %b) {
-; AVX-LABEL: pr26232:
-; AVX: # %bb.0: # %allocas
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: .p2align 4, 0x90
-; AVX-NEXT: .LBB1_1: # %for_loop599
-; AVX-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX-NEXT: cmpq $65536, %rdi # imm = 0x10000
-; AVX-NEXT: setl %al
-; AVX-NEXT: vmovd %eax, %xmm2
-; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX-NEXT: vpand %xmm0, %xmm2, %xmm2
-; AVX-NEXT: vpsllw $7, %xmm2, %xmm2
-; AVX-NEXT: vpmovmskb %xmm2, %eax
-; AVX-NEXT: testl %eax, %eax
-; AVX-NEXT: jne .LBB1_1
-; AVX-NEXT: # %bb.2: # %for_exit600
-; AVX-NEXT: retq
+; AVX1-LABEL: pr26232:
+; AVX1: # %bb.0: # %allocas
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: .p2align 4, 0x90
+; AVX1-NEXT: .LBB1_1: # %for_loop599
+; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX1-NEXT: cmpq $65536, %rdi # imm = 0x10000
+; AVX1-NEXT: setl %al
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2
+; AVX1-NEXT: vpmovmskb %xmm2, %eax
+; AVX1-NEXT: testl %eax, %eax
+; AVX1-NEXT: jne .LBB1_1
+; AVX1-NEXT: # %bb.2: # %for_exit600
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: pr26232:
+; AVX2: # %bb.0: # %allocas
+; AVX2-NEXT: .p2align 4, 0x90
+; AVX2-NEXT: .LBB1_1: # %for_loop599
+; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX2-NEXT: cmpq $65536, %rdi # imm = 0x10000
+; AVX2-NEXT: setl %al
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1
+; AVX2-NEXT: vpmovmskb %xmm1, %eax
+; AVX2-NEXT: testl %eax, %eax
+; AVX2-NEXT: jne .LBB1_1
+; AVX2-NEXT: # %bb.2: # %for_exit600
+; AVX2-NEXT: retq
;
; KNL-32-LABEL: pr26232:
; KNL-32: # %bb.0: # %allocas
@@ -108,14 +138,7 @@ define <4 x i32> @pcmpgt(<4 x i8> %x) {
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; KNL-32-LABEL: pcmpgt:
-; KNL-32: # %bb.0:
-; KNL-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; KNL-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-32-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; KNL-32-NEXT: retl
+; AVX-NEXT: ret{{[l|q]}}
%zext = zext <4 x i8> %x to <4 x i32>
%icmp = icmp ne <4 x i32> %zext, zeroinitializer
%sext = sext <4 x i1> %icmp to <4 x i32>
diff --git a/llvm/test/CodeGen/X86/vec_anyext.ll b/llvm/test/CodeGen/X86/vec_anyext.ll
index 09e4a4b3a773..e229165be967 100644
--- a/llvm/test/CodeGen/X86/vec_anyext.ll
+++ b/llvm/test/CodeGen/X86/vec_anyext.ll
@@ -112,27 +112,10 @@ define <4 x i8> @func_8_16(ptr %a, ptr %b) nounwind {
;
; X64-LABEL: func_8_16:
; X64: # %bb.0:
-; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: vmovd %eax, %xmm0
-; X64-NEXT: movl %eax, %ecx
-; X64-NEXT: shrl $16, %ecx
-; X64-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: shrq $32, %rcx
-; X64-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; X64-NEXT: shrq $48, %rax
-; X64-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; X64-NEXT: movq (%rsi), %rax
-; X64-NEXT: vmovd %eax, %xmm1
-; X64-NEXT: movl %eax, %ecx
-; X64-NEXT: shrl $16, %ecx
-; X64-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: shrq $32, %rcx
-; X64-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
-; X64-NEXT: shrq $48, %rax
-; X64-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; X64-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; X64-NEXT: retq
%F = load <4 x i16>, ptr %a
%G = trunc <4 x i16> %F to <4 x i8>
diff --git a/llvm/test/CodeGen/X86/vec_cast.ll b/llvm/test/CodeGen/X86/vec_cast.ll
index e0089354cc95..0a6bc2f59b68 100644
--- a/llvm/test/CodeGen/X86/vec_cast.ll
+++ b/llvm/test/CodeGen/X86/vec_cast.ll
@@ -156,7 +156,7 @@ define <3 x i16> @h(<3 x i32> %a) nounwind {
; CHECK-WIN-LABEL: h:
; CHECK-WIN: # %bb.0:
; CHECK-WIN-NEXT: movdqa (%rcx), %xmm0
-; CHECK-WIN-NEXT: movl (%rcx), %eax
+; CHECK-WIN-NEXT: movd %xmm0, %eax
; CHECK-WIN-NEXT: pextrw $2, %xmm0, %edx
; CHECK-WIN-NEXT: pextrw $4, %xmm0, %ecx
; CHECK-WIN-NEXT: # kill: def $ax killed $ax killed $eax
diff --git a/llvm/test/CodeGen/X86/x86-32-intrcc.ll b/llvm/test/CodeGen/X86/x86-32-intrcc.ll
index 2e482753e268..3c3944c2082b 100644
--- a/llvm/test/CodeGen/X86/x86-32-intrcc.ll
+++ b/llvm/test/CodeGen/X86/x86-32-intrcc.ll
@@ -149,7 +149,6 @@ define x86_intrcc void @test_isr_x87(ptr byval(%struct.interrupt_frame) %frame)
; CHECK-NEXT: pushl %ebp
; CHECK-NEXT: movl %esp, %ebp
; CHECK-NEXT: andl $-16, %esp
-; CHECK-NEXT: cld
; CHECK-NEXT: fldt f80
; CHECK-NEXT: fld1
; CHECK-NEXT: faddp %st, %st(1)
@@ -163,7 +162,6 @@ define x86_intrcc void @test_isr_x87(ptr byval(%struct.interrupt_frame) %frame)
; CHECK0-NEXT: pushl %ebp
; CHECK0-NEXT: movl %esp, %ebp
; CHECK0-NEXT: andl $-16, %esp
-; CHECK0-NEXT: cld
; CHECK0-NEXT: fldt f80
; CHECK0-NEXT: fld1
; CHECK0-NEXT: faddp %st, %st(1)
@@ -188,7 +186,6 @@ define dso_local x86_intrcc void @test_fp_1(ptr byval(%struct.interrupt_frame) %
; CHECK-NEXT: pushl %ecx
; CHECK-NEXT: pushl %eax
; CHECK-NEXT: andl $-16, %esp
-; CHECK-NEXT: cld
; CHECK-NEXT: leal 20(%ebp), %eax
; CHECK-NEXT: leal 4(%ebp), %ecx
; CHECK-NEXT: movl %ecx, sink_address
@@ -206,7 +203,6 @@ define dso_local x86_intrcc void @test_fp_1(ptr byval(%struct.interrupt_frame) %
; CHECK0-NEXT: pushl %ecx
; CHECK0-NEXT: pushl %eax
; CHECK0-NEXT: andl $-16, %esp
-; CHECK0-NEXT: cld
; CHECK0-NEXT: leal 4(%ebp), %ecx
; CHECK0-NEXT: movl %ecx, %eax
; CHECK0-NEXT: addl $16, %eax
@@ -234,7 +230,6 @@ define dso_local x86_intrcc void @test_fp_2(ptr byval(%struct.interrupt_frame) %
; CHECK-NEXT: pushl %ecx
; CHECK-NEXT: pushl %eax
; CHECK-NEXT: andl $-16, %esp
-; CHECK-NEXT: cld
; CHECK-NEXT: movl 4(%ebp), %eax
; CHECK-NEXT: leal 24(%ebp), %ecx
; CHECK-NEXT: leal 8(%ebp), %edx
@@ -257,7 +252,6 @@ define dso_local x86_intrcc void @test_fp_2(ptr byval(%struct.interrupt_frame) %
; CHECK0-NEXT: pushl %ecx
; CHECK0-NEXT: pushl %eax
; CHECK0-NEXT: andl $-16, %esp
-; CHECK0-NEXT: cld
; CHECK0-NEXT: movl 4(%ebp), %eax
; CHECK0-NEXT: leal 8(%ebp), %edx
; CHECK0-NEXT: movl %edx, %ecx
@@ -288,7 +282,6 @@ define x86_intrcc void @test_copy_elide(ptr byval(%struct.interrupt_frame) %fram
; CHECK-NEXT: movl %esp, %ebp
; CHECK-NEXT: pushl %eax
; CHECK-NEXT: andl $-16, %esp
-; CHECK-NEXT: cld
; CHECK-NEXT: leal 4(%ebp), %eax
; CHECK-NEXT: movl %eax, sink_address
; CHECK-NEXT: leal -4(%ebp), %esp
@@ -303,7 +296,6 @@ define x86_intrcc void @test_copy_elide(ptr byval(%struct.interrupt_frame) %fram
; CHECK0-NEXT: movl %esp, %ebp
; CHECK0-NEXT: pushl %eax
; CHECK0-NEXT: andl $-16, %esp
-; CHECK0-NEXT: cld
; CHECK0-NEXT: movl 4(%ebp), %eax
; CHECK0-NEXT: leal 4(%ebp), %eax
; CHECK0-NEXT: movl %eax, sink_address
@@ -358,7 +350,6 @@ define x86_intrcc void @test_isr_realign(ptr byval(%struct.interrupt_frame) %fra
; CHECK-NEXT: pushl %eax
; CHECK-NEXT: andl $-32, %esp
; CHECK-NEXT: subl $32, %esp
-; CHECK-NEXT: cld
; CHECK-NEXT: movl 4(%ebp), %eax
; CHECK-NEXT: movl %eax, (%esp)
; CHECK-NEXT: leal -4(%ebp), %esp
@@ -374,7 +365,6 @@ define x86_intrcc void @test_isr_realign(ptr byval(%struct.interrupt_frame) %fra
; CHECK0-NEXT: pushl %eax
; CHECK0-NEXT: andl $-32, %esp
; CHECK0-NEXT: subl $32, %esp
-; CHECK0-NEXT: cld
; CHECK0-NEXT: movl 4(%ebp), %eax
; CHECK0-NEXT: movl %eax, (%esp)
; CHECK0-NEXT: leal -4(%ebp), %esp
diff --git a/llvm/test/CodeGen/X86/x86-64-intrcc-uintr.ll b/llvm/test/CodeGen/X86/x86-64-intrcc-uintr.ll
index a46b9d9ba5a1..1fe395b84d46 100644
--- a/llvm/test/CodeGen/X86/x86-64-intrcc-uintr.ll
+++ b/llvm/test/CodeGen/X86/x86-64-intrcc-uintr.ll
@@ -21,28 +21,24 @@ define dso_local x86_intrcc void @test_uintr_isr_cc_empty(ptr nocapture byval(%s
; CHECK-USER-LABEL: test_uintr_isr_cc_empty:
; CHECK-USER: # %bb.0: # %entry
; CHECK-USER-NEXT: pushq %rax
-; CHECK-USER-NEXT: cld
; CHECK-USER-NEXT: addq $16, %rsp
; CHECK-USER-NEXT: uiret
;
; CHECK0-USER-LABEL: test_uintr_isr_cc_empty:
; CHECK0-USER: # %bb.0: # %entry
; CHECK0-USER-NEXT: pushq %rax
-; CHECK0-USER-NEXT: cld
; CHECK0-USER-NEXT: addq $16, %rsp
; CHECK0-USER-NEXT: uiret
;
; CHECK-KERNEL-LABEL: test_uintr_isr_cc_empty:
; CHECK-KERNEL: # %bb.0: # %entry
; CHECK-KERNEL-NEXT: pushq %rax
-; CHECK-KERNEL-NEXT: cld
; CHECK-KERNEL-NEXT: addq $16, %rsp
; CHECK-KERNEL-NEXT: iretq
;
; CHECK0-KERNEL-LABEL: test_uintr_isr_cc_empty:
; CHECK0-KERNEL: # %bb.0: # %entry
; CHECK0-KERNEL-NEXT: pushq %rax
-; CHECK0-KERNEL-NEXT: cld
; CHECK0-KERNEL-NEXT: addq $16, %rsp
; CHECK0-KERNEL-NEXT: iretq
entry:
@@ -75,7 +71,6 @@ define dso_local x86_intrcc void @test_uintr_isr_cc_args(ptr nocapture readonly
; CHECK-USER-NEXT: pushq %rax
; CHECK-USER-NEXT: pushq %rdx
; CHECK-USER-NEXT: pushq %rcx
-; CHECK-USER-NEXT: cld
; CHECK-USER-NEXT: movq 32(%rsp), %rax
; CHECK-USER-NEXT: movq 40(%rsp), %rcx
; CHECK-USER-NEXT: movq 48(%rsp), %rdx
@@ -96,7 +91,6 @@ define dso_local x86_intrcc void @test_uintr_isr_cc_args(ptr nocapture readonly
; CHECK0-USER-NEXT: pushq %rax
; CHECK0-USER-NEXT: pushq %rdx
; CHECK0-USER-NEXT: pushq %rcx
-; CHECK0-USER-NEXT: cld
; CHECK0-USER-NEXT: movq 32(%rsp), %rax
; CHECK0-USER-NEXT: leaq 40(%rsp), %rcx
; CHECK0-USER-NEXT: movq (%rcx), %rdx
@@ -118,7 +112,6 @@ define dso_local x86_intrcc void @test_uintr_isr_cc_args(ptr nocapture readonly
; CHECK-KERNEL-NEXT: pushq %rax
; CHECK-KERNEL-NEXT: pushq %rdx
; CHECK-KERNEL-NEXT: pushq %rcx
-; CHECK-KERNEL-NEXT: cld
; CHECK-KERNEL-NEXT: movq 32(%rsp), %rax
; CHECK-KERNEL-NEXT: movq 40(%rsp), %rcx
; CHECK-KERNEL-NEXT: movq 48(%rsp), %rdx
@@ -139,7 +132,6 @@ define dso_local x86_intrcc void @test_uintr_isr_cc_args(ptr nocapture readonly
; CHECK0-KERNEL-NEXT: pushq %rax
; CHECK0-KERNEL-NEXT: pushq %rdx
; CHECK0-KERNEL-NEXT: pushq %rcx
-; CHECK0-KERNEL-NEXT: cld
; CHECK0-KERNEL-NEXT: movq 32(%rsp), %rax
; CHECK0-KERNEL-NEXT: leaq 40(%rsp), %rcx
; CHECK0-KERNEL-NEXT: movq (%rcx), %rdx
diff --git a/llvm/test/CodeGen/X86/x86-64-intrcc.ll b/llvm/test/CodeGen/X86/x86-64-intrcc.ll
index 443d4c2fa464..5fc606eb566e 100644
--- a/llvm/test/CodeGen/X86/x86-64-intrcc.ll
+++ b/llvm/test/CodeGen/X86/x86-64-intrcc.ll
@@ -114,7 +114,6 @@ define dso_local x86_intrcc void @test_fp_1(ptr byval(%struct.interrupt_frame) %
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: movq %rsp, %rbp
- ; CHECK: cld
; CHECK-DAG: leaq 8(%rbp), %[[R1:[^ ]*]]
; CHECK-DAG: leaq 40(%rbp), %[[R2:[^ ]*]]
; CHECK: movq %[[R1]], sink_address
@@ -136,7 +135,6 @@ define dso_local x86_intrcc void @test_fp_2(ptr byval(%struct.interrupt_frame) %
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: movq %rsp, %rbp
- ; CHECK: cld
; CHECK-DAG: movq 16(%rbp), %[[R3:[^ ]*]]
; CHECK-DAG: leaq 24(%rbp), %[[R1:[^ ]*]]
; CHECK-DAG: leaq 56(%rbp), %[[R2:[^ ]*]]
@@ -164,7 +162,6 @@ define x86_intrcc void @test_copy_elide(ptr byval(%struct.interrupt_frame) %fram
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: movq %rsp, %rbp
- ; CHECK: cld
; CHECK: leaq 16(%rbp), %[[R1:[^ ]*]]
; CHECK: movq %[[R1]], sink_address(%rip)
entry:
diff --git a/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll b/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll
index 9ff3d80af80d..9f401ceb5b6f 100644
--- a/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll
+++ b/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll
@@ -15,8 +15,6 @@
;; doesn't transfer the dbg.value to the entry block. This needs Special
;; Handling once we get rid of debug-intrinsics.
-; CHECK: declare void @llvm.dbg.value(metadata,
-
; CHECK: define i32 @bar()
; CHECK-NEXT: %1 = alloca [65 x i32], align 16
; CHECK-NEXT: call void @ext()
@@ -24,9 +22,10 @@
; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 0, metadata !10, metadata !DIExpression()), !dbg !12
; CHECK-NEXT: call void @init(ptr %1)
+; CHECK: declare void @llvm.dbg.value(metadata,
+
declare void @ext()
declare void @init(ptr)
-declare void @llvm.dbg.value(metadata, metadata, metadata)
define internal i32 @foo() !dbg !4 {
%1 = alloca [65 x i32], align 16
@@ -42,6 +41,8 @@ define i32 @bar() !dbg !16 {
ret i32 %1
}
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8, !9}
!llvm.ident = !{!10}
diff --git a/llvm/test/DebugInfo/Generic/inline-dbg-values.ll b/llvm/test/DebugInfo/Generic/inline-dbg-values.ll
index 7d580d6fba37..b0390b9f78f4 100644
--- a/llvm/test/DebugInfo/Generic/inline-dbg-values.ll
+++ b/llvm/test/DebugInfo/Generic/inline-dbg-values.ll
@@ -7,8 +7,6 @@
;;
;; test should be inlined into test2
-; CHECK: declare void @llvm.dbg.value(metadata,
-
; CHECK: define i32 @test2
; CHECK-NEXT: entry:
; CHECK: %k.addr.i = alloca i32, align 4
@@ -47,6 +45,8 @@
; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 1, metadata ![[KVAR]], metadata !DIExpression()), !dbg ![[KLINE]]
; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 1, metadata ![[K2VAR]], metadata !DIExpression()), !dbg ![[GLINE]]
;
+; CHECK: declare void @llvm.dbg.value(metadata,
+;
;; Test that the metadata maps onto the correct things, and that the DILocations
;; attached to the intrinsics have been inlined.
;
@@ -100,8 +100,6 @@ return: ; preds = %if.end, %if.then
ret i32 %3, !dbg !20
}
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
declare i32 @_Z8test_exti(i32)
define i32 @test2(i32 %foo, i32 %bar) !dbg !10 {
@@ -118,6 +116,8 @@ try.cont: ; preds = %catch, %invoke.cont
ret i32 0, !dbg !30
}
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!31}
diff --git a/llvm/test/DebugInfo/dpvalue-print-nocrash.ll b/llvm/test/DebugInfo/dpvalue-print-nocrash.ll
index 9f120af13ac9..0a618c6780d1 100755
--- a/llvm/test/DebugInfo/dpvalue-print-nocrash.ll
+++ b/llvm/test/DebugInfo/dpvalue-print-nocrash.ll
@@ -2,8 +2,7 @@
; RUN: opt -passes="instcombine" -debug %s -o /dev/null 2>&1 | FileCheck %s
; REQUIRES: asserts
-; CHECK: CLONE: DPValue value {
-; CHECK-SAME: marker @0x0
+; CHECK: CLONE: #dbg_value(
define ptr @func_10(i32 %p_11) {
entry:
diff --git a/llvm/test/DebugInfo/print-non-instruction-debug-info.ll b/llvm/test/DebugInfo/print-non-instruction-debug-info.ll
new file mode 100644
index 000000000000..f8271df146fe
--- /dev/null
+++ b/llvm/test/DebugInfo/print-non-instruction-debug-info.ll
@@ -0,0 +1,86 @@
+;; Test that we can write in the new debug info format.
+; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=false --write-experimental-debuginfo=false < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,OLDDBG --implicit-check-not=llvm.dbg
+; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=false --write-experimental-debuginfo=true < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,NEWDBG --implicit-check-not=llvm.dbg
+
+;; Test also that the new flag is independent of the flag that enables use of
+;; these non-instruction debug info during LLVM passes.
+; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=true --write-experimental-debuginfo=false < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,OLDDBG --implicit-check-not=llvm.dbg
+; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=true --write-experimental-debuginfo=true < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,NEWDBG --implicit-check-not=llvm.dbg
+
+; CHECK: @f(i32 %[[VAL_A:[0-9a-zA-Z]+]])
+; CHECK-NEXT: entry:
+; OLDDBG-NEXT: call void @llvm.dbg.value(metadata i32 %[[VAL_A]], metadata ![[VAR_A:[0-9]+]], metadata !DIExpression()), !dbg ![[LOC_1:[0-9]+]]
+; NEWDBG-NEXT: {{^}} #dbg_value(i32 %[[VAL_A]], ![[VAR_A:[0-9]+]], !DIExpression(), ![[LOC_1:[0-9]+]])
+; CHECK-NEXT: {{^}} %[[VAL_B:[0-9a-zA-Z]+]] = alloca
+; OLDDBG-NEXT: call void @llvm.dbg.declare(metadata ptr %[[VAL_B]], metadata ![[VAR_B:[0-9]+]], metadata !DIExpression()), !dbg ![[LOC_2:[0-9]+]]
+; NEWDBG-NEXT: {{^}} #dbg_declare(ptr %[[VAL_B]], ![[VAR_B:[0-9]+]], !DIExpression(), ![[LOC_2:[0-9]+]])
+; CHECK-NEXT: {{^}} %[[VAL_ADD:[0-9a-zA-Z]+]] = add i32 %[[VAL_A]], 5
+; OLDDBG-NEXT: call void @llvm.dbg.value(metadata !DIArgList(i32 %[[VAL_A]], i32 %[[VAL_ADD]]), metadata ![[VAR_A]], metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus)), !dbg ![[LOC_3:[0-9]+]]
+; NEWDBG-NEXT: {{^}} #dbg_value(!DIArgList(i32 %[[VAL_A]], i32 %[[VAL_ADD]]), ![[VAR_A]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), ![[LOC_3:[0-9]+]])
+; OLDDBG-NEXT: call void @llvm.dbg.label(metadata ![[LABEL_ID:[0-9]+]])
+; NEWDBG-NEXT: {{^}} #dbg_label(![[LABEL_ID:[0-9]+]])
+; CHECK-NEXT: {{^}} store i32 %[[VAL_ADD]]{{.+}}, !DIAssignID ![[ASSIGNID:[0-9]+]]
+; OLDDBG-NEXT: call void @llvm.dbg.assign(metadata i32 %[[VAL_ADD]], metadata ![[VAR_B]], metadata !DIExpression(), metadata ![[ASSIGNID]], metadata ptr %[[VAL_B]], metadata !DIExpression()), !dbg ![[LOC_4:[0-9]+]]
+; NEWDBG-NEXT: {{^}} #dbg_assign(i32 %[[VAL_ADD]], ![[VAR_B]], !DIExpression(), ![[ASSIGNID]], ptr %[[VAL_B]], !DIExpression(), ![[LOC_4:[0-9]+]])
+; CHECK-NEXT: {{^}} ret i32
+
+; OLDDBG-DAG: declare void @llvm.dbg.value
+; OLDDBG-DAG: declare void @llvm.dbg.declare
+; OLDDBG-DAG: declare void @llvm.dbg.assign
+
+; CHECK-DAG: llvm.dbg.cu
+; CHECK-DAG: ![[VAR_A]] = !DILocalVariable(name: "a"
+; CHECK-DAG: ![[VAR_B]] = !DILocalVariable(name: "b"
+; CHECK-DAG: ![[LOC_1]] = !DILocation(line: 3, column: 15
+; CHECK-DAG: ![[LOC_2]] = !DILocation(line: 3, column: 20
+; CHECK-DAG: ![[LOC_3]] = !DILocation(line: 3, column: 25
+; CHECK-DAG: ![[LOC_4]] = !DILocation(line: 3, column: 30
+; CHECK-DAG: ![[LABEL_ID]] = !DILabel(
+
+define dso_local i32 @f(i32 %a) !dbg !7 {
+entry:
+ call void @llvm.dbg.value(metadata i32 %a, metadata !20, metadata !DIExpression()), !dbg !30
+ %b = alloca i32, !dbg !30, !DIAssignID !40
+ call void @llvm.dbg.declare(metadata ptr %b, metadata !21, metadata !DIExpression()), !dbg !31
+ %add = add i32 %a, 5, !dbg !31
+ call void @llvm.dbg.value(metadata !DIArgList(i32 %a, i32 %add), metadata !20, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus)), !dbg !32
+ call void @llvm.dbg.label(metadata !50), !dbg !32
+ store i32 %add, ptr %b, !dbg !32, !DIAssignID !40
+ call void @llvm.dbg.assign(metadata i32 %add, metadata !21, metadata !DIExpression(), metadata !40, metadata ptr %b, metadata !DIExpression()), !dbg !33
+ ret i32 %add, !dbg !33
+
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata)
+declare void @llvm.dbg.label(metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 18.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "print.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 5}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 18.0.0"}
+!7 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 3, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !13)
+!8 = !DISubroutineType(types: !9)
+!9 = !{!12, !12}
+!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!13 = !{!20, !21}
+!20 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 3, type: !12)
+!21 = !DILocalVariable(name: "b", scope: !7, file: !1, line: 3, type: !12)
+!30 = !DILocation(line: 3, column: 15, scope: !7)
+!31 = !DILocation(line: 3, column: 20, scope: !7)
+!32 = !DILocation(line: 3, column: 25, scope: !7)
+!33 = !DILocation(line: 3, column: 30, scope: !7)
+!40 = distinct !DIAssignID()
+!50 = !DILabel(scope: !7, name: "label", file: !1, line: 3)
diff --git a/llvm/test/Instrumentation/AddressSanitizer/do-not-instrument-globals-windows.ll b/llvm/test/Instrumentation/AddressSanitizer/do-not-instrument-globals-windows.ll
new file mode 100644
index 000000000000..c143f69f126a
--- /dev/null
+++ b/llvm/test/Instrumentation/AddressSanitizer/do-not-instrument-globals-windows.ll
@@ -0,0 +1,10 @@
+; This test checks that we are not instrumenting unnecessary globals
+; RUN: opt < %s -passes=asan -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+@v_available_externally = available_externally global i32 zeroinitializer
+; CHECK-NOT: {{asan_gen.*v_available_externally}}
+
+; CHECK: @asan.module_ctor
diff --git a/llvm/test/Instrumentation/InstrProfiling/coverage.ll b/llvm/test/Instrumentation/InstrProfiling/coverage.ll
index bbf895ea4b34..08cbcaa962b7 100644
--- a/llvm/test/Instrumentation/InstrProfiling/coverage.ll
+++ b/llvm/test/Instrumentation/InstrProfiling/coverage.ll
@@ -5,12 +5,12 @@ target triple = "aarch64-unknown-linux-gnu"
@__profn_foo = private constant [3 x i8] c"foo"
; CHECK: @__profc_foo = private global [1 x i8] c"\FF", section "__llvm_prf_cnts", comdat, align 1
-; CHECK: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_foo to i64)
-; BINARY: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_foo to i64),
+; CHECK: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_foo to i64)
+; BINARY: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_foo to i64),
@__profn_bar = private constant [3 x i8] c"bar"
; CHECK: @__profc_bar = private global [1 x i8] c"\FF", section "__llvm_prf_cnts", comdat, align 1
-; CHECK: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_bar to i64)
-; BINARY: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_bar to i64),
+; CHECK: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_bar to i64)
+; BINARY: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_bar to i64),
; CHECK: @__llvm_prf_nm = {{.*}} section "__llvm_prf_names"
; BINARY: @__llvm_prf_nm ={{.*}} section "__llvm_covnames"
diff --git a/llvm/test/MC/AMDGPU/gfx1011_err.s b/llvm/test/MC/AMDGPU/gfx1011_err.s
index 4b37aaf221e3..a86e48a29c78 100644
--- a/llvm/test/MC/AMDGPU/gfx1011_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1011_err.s
@@ -17,7 +17,7 @@ v_dot8c_i32_i4 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
s_getreg_b32 s2, hwreg(HW_REG_SHADER_CYCLES)
-// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
v_fma_legacy_f32 v0, v1, v2, v3
// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1030_err.s b/llvm/test/MC/AMDGPU/gfx1030_err.s
index ba8784a39c36..f4ab5fe5b14a 100644
--- a/llvm/test/MC/AMDGPU/gfx1030_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1030_err.s
@@ -25,7 +25,7 @@ s_get_waveid_in_workgroup s0
// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
s_getreg_b32 s2, hwreg(HW_REG_XNACK_MASK)
-// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
v_mac_f32 v0, v1, v2
// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx10_err_pos.s b/llvm/test/MC/AMDGPU/gfx10_err_pos.s
index 1d34f00ee0f9..c2679db3b2ac 100644
--- a/llvm/test/MC/AMDGPU/gfx10_err_pos.s
+++ b/llvm/test/MC/AMDGPU/gfx10_err_pos.s
@@ -448,7 +448,7 @@ ds_swizzle_b32 v8, v2 offset:SWZ(QUAD_PERM, 0, 1, 2, 3)
// expected a hwreg macro or an absolute expression
s_setreg_b32 undef, s2
-// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected a hwreg macro or an absolute expression
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected a hwreg macro, structured immediate or an absolute expression
// CHECK-NEXT:{{^}}s_setreg_b32 undef, s2
// CHECK-NEXT:{{^}} ^
@@ -621,10 +621,10 @@ s_setreg_b32 hwreg(3,0,33), s2
// CHECK-NEXT:{{^}} ^
//==============================================================================
-// invalid code of hardware register: only 6-bit values are legal
+// invalid hardware register: only 6-bit values are legal
s_setreg_b32 hwreg(0x40), s2
-// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: invalid code of hardware register: only 6-bit values are legal
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: only 6-bit values are legal
// CHECK-NEXT:{{^}}s_setreg_b32 hwreg(0x40), s2
// CHECK-NEXT:{{^}} ^
@@ -1158,10 +1158,10 @@ v_movrels_b32_sdwa v0, shared_base
// CHECK-NEXT:{{^}} ^
//==============================================================================
-// specified hardware register is not supported on this GPU
+// invalid hardware register: not supported on this GPU
s_getreg_b32 s2, hwreg(HW_REG_SHADER_CYCLES)
-// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
// CHECK-NEXT:{{^}}s_getreg_b32 s2, hwreg(HW_REG_SHADER_CYCLES)
// CHECK-NEXT:{{^}} ^
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_alias.s b/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_alias.s
index 187d18e9045a..eba48c602172 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_alias.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_alias.s
@@ -1,403 +1,25 @@
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck --check-prefix=GFX11 %s
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX11-ERR --implicit-check-not=error: %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck --check-prefix=GFX11 %s
tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_8_UNORM] offset:4095
-// GFX11: encoding: [0xff,0x0f,0x0c,0xe8,0x00,0x04,0x02,0x03]
-
-tbuffer_load_format_d16_x v255, off, s[8:11], s3, format:1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x0c,0xe8,0x00,0xff,0x02,0x03]
-
-tbuffer_load_format_d16_x v4, off, s[12:15], s3, format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_UNORM] offset:4095
-// GFX11: encoding: [0xff,0x0f,0x0c,0xe8,0x00,0x04,0x03,0x03]
-
-tbuffer_load_format_d16_x v4, off, s[12:15], s101, format:[BUF_FMT_8_SNORM] offset:4095
-// GFX11: encoding: [0xff,0x0f,0x14,0xe8,0x00,0x04,0x03,0x65]
-
-tbuffer_load_format_d16_x v4, off, s[12:15], m0, format:2 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x14,0xe8,0x00,0x04,0x03,0x7d]
-
-tbuffer_load_format_d16_x v4, off, s[8:11], 0, format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_SNORM] offset:4095
-// GFX11: encoding: [0xff,0x0f,0x14,0xe8,0x00,0x04,0x02,0x80]
-
-tbuffer_load_format_d16_x v4, off, s[8:11], 61, format:[BUF_FMT_8_USCALED] offset:4095
-// GFX11: encoding: [0xff,0x0f,0x1c,0xe8,0x00,0x04,0x02,0xbd]
-
-tbuffer_load_format_d16_x v4, off, ttmp[4:7], 61, format:3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x1c,0xe8,0x00,0x04,0x1c,0xbd]
-
-tbuffer_load_format_d16_x v4, v1, s[8:11], s3, format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_USCALED] offen offset:52
-// GFX11: encoding: [0x34,0x00,0x1c,0xe8,0x01,0x04,0x42,0x03]
-
-tbuffer_load_format_d16_x v4, v1, s[8:11], s3, format:[BUF_FMT_8_SSCALED] idxen offset:52
-// GFX11: encoding: [0x34,0x00,0x24,0xe8,0x01,0x04,0x82,0x03]
-
-tbuffer_load_format_d16_x v4, v[1:2], s[8:11], s0, format:4 idxen offen offset:52
-// GFX11: encoding: [0x34,0x00,0x24,0xe8,0x01,0x04,0xc2,0x00]
-
-tbuffer_load_format_d16_x v4, off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_SSCALED] offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x24,0xe8,0x00,0x04,0x1c,0x03]
-
-tbuffer_load_format_d16_x v4, off, ttmp[4:7], s3, format:[BUF_FMT_8_UINT] offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x2c,0xe8,0x00,0x04,0x1c,0x03]
-
-tbuffer_load_format_d16_x v4, off, ttmp[4:7], s3, format:5 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x2c,0xe8,0x00,0x04,0x1c,0x03]
-
-tbuffer_load_format_d16_x v4, off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_UINT] offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x2c,0xe8,0x00,0x04,0x1c,0x03]
+// GFX11: tbuffer_load_d16_format_x v4, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe8,0x00,0x04,0x02,0x03]
tbuffer_load_format_d16_xy v4, off, s[8:11], s3, format:[BUF_FMT_8_SINT] offset:4095
-// GFX11: encoding: [0xff,0x8f,0x34,0xe8,0x00,0x04,0x02,0x03]
-
-tbuffer_load_format_d16_xy v255, off, s[8:11], s3, format:6 offset:4095
-// GFX11: encoding: [0xff,0x8f,0x34,0xe8,0x00,0xff,0x02,0x03]
-
-tbuffer_load_format_d16_xy v4, off, s[12:15], s3, format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_SINT] offset:4095
-// GFX11: encoding: [0xff,0x8f,0x34,0xe8,0x00,0x04,0x03,0x03]
-
-tbuffer_load_format_d16_xy v4, off, s[12:15], s101, format:[BUF_FMT_16_UNORM] offset:4095
-// GFX11: encoding: [0xff,0x8f,0x3c,0xe8,0x00,0x04,0x03,0x65]
-
-tbuffer_load_format_d16_xy v4, off, s[12:15], m0, format:7 offset:4095
-// GFX11: encoding: [0xff,0x8f,0x3c,0xe8,0x00,0x04,0x03,0x7d]
-
-tbuffer_load_format_d16_xy v4, off, s[8:11], 0, format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_UNORM] offset:4095
-// GFX11: encoding: [0xff,0x8f,0x3c,0xe8,0x00,0x04,0x02,0x80]
-
-tbuffer_load_format_d16_xy v4, off, s[8:11], 61, format:[BUF_FMT_16_SNORM] offset:4095
-// GFX11: encoding: [0xff,0x8f,0x44,0xe8,0x00,0x04,0x02,0xbd]
-
-tbuffer_load_format_d16_xy v4, off, ttmp[4:7], 61, format:8 offset:4095
-// GFX11: encoding: [0xff,0x8f,0x44,0xe8,0x00,0x04,0x1c,0xbd]
-
-tbuffer_load_format_d16_xy v4, v1, s[8:11], s3, format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_SNORM] offen offset:52
-// GFX11: encoding: [0x34,0x80,0x44,0xe8,0x01,0x04,0x42,0x03]
-
-tbuffer_load_format_d16_xy v4, v1, s[8:11], s3, format:[BUF_FMT_16_USCALED] idxen offset:52
-// GFX11: encoding: [0x34,0x80,0x4c,0xe8,0x01,0x04,0x82,0x03]
-
-tbuffer_load_format_d16_xy v4, v[1:2], s[8:11], s0, format:9 idxen offen offset:52
-// GFX11: encoding: [0x34,0x80,0x4c,0xe8,0x01,0x04,0xc2,0x00]
-
-tbuffer_load_format_d16_xy v4, off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_USCALED] offset:4095 glc
-// GFX11: encoding: [0xff,0xcf,0x4c,0xe8,0x00,0x04,0x1c,0x03]
-
-tbuffer_load_format_d16_xy v4, off, ttmp[4:7], s3, format:[BUF_FMT_16_SSCALED] offset:4095 slc
-// GFX11: encoding: [0xff,0x9f,0x54,0xe8,0x00,0x04,0x1c,0x03]
-
-tbuffer_load_format_d16_xy v4, off, ttmp[4:7], s3, format:10 offset:4095 dlc
-// GFX11: encoding: [0xff,0xaf,0x54,0xe8,0x00,0x04,0x1c,0x03]
-
-tbuffer_load_format_d16_xy v4, off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_SSCALED] offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0xff,0x54,0xe8,0x00,0x04,0x1c,0x03]
+// GFX11: tbuffer_load_d16_format_xy v4, off, s[8:11], s3 format:[BUF_FMT_8_SINT] offset:4095 ; encoding: [0xff,0x8f,0x34,0xe8,0x00,0x04,0x02,0x03]
tbuffer_load_format_d16_xyz v[4:5], off, s[8:11], s3, format:[BUF_FMT_16_UINT] offset:4095
-// GFX11: encoding: [0xff,0x0f,0x5d,0xe8,0x00,0x04,0x02,0x03]
-
-tbuffer_load_format_d16_xyz v[254:255], off, s[8:11], s3, format:11 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x5d,0xe8,0x00,0xfe,0x02,0x03]
-
-tbuffer_load_format_d16_xyz v[4:5], off, s[12:15], s3, format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_UINT] offset:4095
-// GFX11: encoding: [0xff,0x0f,0x5d,0xe8,0x00,0x04,0x03,0x03]
-
-tbuffer_load_format_d16_xyz v[4:5], off, s[12:15], s101, format:[BUF_FMT_16_SINT] offset:4095
-// GFX11: encoding: [0xff,0x0f,0x65,0xe8,0x00,0x04,0x03,0x65]
-
-tbuffer_load_format_d16_xyz v[4:5], off, s[12:15], m0, format:12 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x65,0xe8,0x00,0x04,0x03,0x7d]
-
-tbuffer_load_format_d16_xyz v[4:5], off, s[8:11], 0, format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_SINT] offset:4095
-// GFX11: encoding: [0xff,0x0f,0x65,0xe8,0x00,0x04,0x02,0x80]
-
-tbuffer_load_format_d16_xyz v[4:5], off, s[8:11], 61, format:[BUF_FMT_16_FLOAT] offset:4095
-// GFX11: encoding: [0xff,0x0f,0x6d,0xe8,0x00,0x04,0x02,0xbd]
-
-tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], 61, format:13 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x6d,0xe8,0x00,0x04,0x1c,0xbd]
-
-tbuffer_load_format_d16_xyz v[4:5], v1, s[8:11], s3, format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_FLOAT] offen offset:52
-// GFX11: encoding: [0x34,0x00,0x6d,0xe8,0x01,0x04,0x42,0x03]
-
-tbuffer_load_format_d16_xyz v[4:5], v1, s[8:11], s3, format:[BUF_FMT_8_8_UNORM] idxen offset:52
-// GFX11: encoding: [0x34,0x00,0x75,0xe8,0x01,0x04,0x82,0x03]
-
-tbuffer_load_format_d16_xyz v[4:5], v[1:2], s[8:11], s0, format:14 idxen offen offset:52
-// GFX11: encoding: [0x34,0x00,0x75,0xe8,0x01,0x04,0xc2,0x00]
-
-tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_UNORM] offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x75,0xe8,0x00,0x04,0x1c,0x03]
-
-tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], s3, format:[BUF_FMT_8_8_SNORM] offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x7d,0xe8,0x00,0x04,0x1c,0x03]
-
-tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], s3, format:15 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x7d,0xe8,0x00,0x04,0x1c,0x03]
-
-tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_SNORM] offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x7d,0xe8,0x00,0x04,0x1c,0x03]
+// GFX11: tbuffer_load_d16_format_xyz v[4:5], off, s[8:11], s3 format:[BUF_FMT_16_UINT] offset:4095 ; encoding: [0xff,0x0f,0x5d,0xe8,0x00,0x04,0x02,0x03]
tbuffer_load_format_d16_xyzw v[4:5], off, s[8:11], s3, format:[BUF_FMT_8_8_USCALED] offset:4095
-// GFX11: encoding: [0xff,0x8f,0x85,0xe8,0x00,0x04,0x02,0x03]
-
-tbuffer_load_format_d16_xyzw v[254:255], off, s[8:11], s3, format:16 offset:4095
-// GFX11: encoding: [0xff,0x8f,0x85,0xe8,0x00,0xfe,0x02,0x03]
-
-tbuffer_load_format_d16_xyzw v[4:5], off, s[12:15], s3, format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_USCALED] offset:4095
-// GFX11: encoding: [0xff,0x8f,0x85,0xe8,0x00,0x04,0x03,0x03]
-
-tbuffer_load_format_d16_xyzw v[4:5], off, s[12:15], s101, format:[BUF_FMT_8_8_SSCALED] offset:4095
-// GFX11: encoding: [0xff,0x8f,0x8d,0xe8,0x00,0x04,0x03,0x65]
-
-tbuffer_load_format_d16_xyzw v[4:5], off, s[12:15], m0, format:17 offset:4095
-// GFX11: encoding: [0xff,0x8f,0x8d,0xe8,0x00,0x04,0x03,0x7d]
-
-tbuffer_load_format_d16_xyzw v[4:5], off, s[8:11], 0, format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_SSCALED] offset:4095
-// GFX11: encoding: [0xff,0x8f,0x8d,0xe8,0x00,0x04,0x02,0x80]
-
-tbuffer_load_format_d16_xyzw v[4:5], off, s[8:11], 61, format:[BUF_FMT_8_8_UINT] offset:4095
-// GFX11: encoding: [0xff,0x8f,0x95,0xe8,0x00,0x04,0x02,0xbd]
-
-tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], 61, format:18 offset:4095
-// GFX11: encoding: [0xff,0x8f,0x95,0xe8,0x00,0x04,0x1c,0xbd]
-
-tbuffer_load_format_d16_xyzw v[4:5], v1, s[8:11], s3, format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_UINT] offen offset:52
-// GFX11: encoding: [0x34,0x80,0x95,0xe8,0x01,0x04,0x42,0x03]
-
-tbuffer_load_format_d16_xyzw v[4:5], v1, s[8:11], s3, format:[BUF_FMT_8_8_SINT] idxen offset:52
-// GFX11: encoding: [0x34,0x80,0x9d,0xe8,0x01,0x04,0x82,0x03]
-
-tbuffer_load_format_d16_xyzw v[4:5], v[1:2], s[8:11], s0, format:19 idxen offen offset:52
-// GFX11: encoding: [0x34,0x80,0x9d,0xe8,0x01,0x04,0xc2,0x00]
-
-tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_SINT] offset:4095 glc
-// GFX11: encoding: [0xff,0xcf,0x9d,0xe8,0x00,0x04,0x1c,0x03]
-
-tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], s3, format:[BUF_FMT_32_UINT] offset:4095 slc
-// GFX11: encoding: [0xff,0x9f,0xa5,0xe8,0x00,0x04,0x1c,0x03]
-
-tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], s3, format:20 offset:4095 dlc
-// GFX11: encoding: [0xff,0xaf,0xa5,0xe8,0x00,0x04,0x1c,0x03]
-
-tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_32, BUF_NUM_FORMAT_UINT] offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0xff,0xa5,0xe8,0x00,0x04,0x1c,0x03]
+// GFX11: tbuffer_load_d16_format_xyzw v[4:5], off, s[8:11], s3 format:[BUF_FMT_8_8_USCALED] offset:4095 ; encoding: [0xff,0x8f,0x85,0xe8,0x00,0x04,0x02,0x03]
tbuffer_store_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_2_10_10_10_SINT] offset:4095
-// GFX11: encoding: [0xff,0x0f,0x4e,0xe9,0x00,0x04,0x02,0x03]
-
-tbuffer_store_format_d16_x v255, off, s[8:11], s3, format:41 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x4e,0xe9,0x00,0xff,0x02,0x03]
-
-tbuffer_store_format_d16_x v4, off, s[12:15], s3, format:[BUF_DATA_FORMAT_2_10_10_10, BUF_NUM_FORMAT_SINT] offset:4095
-// GFX11: encoding: [0xff,0x0f,0x4e,0xe9,0x00,0x04,0x03,0x03]
-
-tbuffer_store_format_d16_x v4, off, s[12:15], s101, format:[BUF_FMT_8_8_8_8_UNORM] offset:4095
-// GFX11: encoding: [0xff,0x0f,0x56,0xe9,0x00,0x04,0x03,0x65]
-
-tbuffer_store_format_d16_x v4, off, s[12:15], m0, format:42 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x56,0xe9,0x00,0x04,0x03,0x7d]
-
-tbuffer_store_format_d16_x v4, off, s[8:11], 0, format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_UNORM] offset:4095
-// GFX11: encoding: [0xff,0x0f,0x56,0xe9,0x00,0x04,0x02,0x80]
-
-tbuffer_store_format_d16_x v4, off, s[8:11], 61, format:[BUF_FMT_8_8_8_8_SNORM] offset:4095
-// GFX11: encoding: [0xff,0x0f,0x5e,0xe9,0x00,0x04,0x02,0xbd]
-
-tbuffer_store_format_d16_x v4, off, ttmp[4:7], 61, format:43 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x5e,0xe9,0x00,0x04,0x1c,0xbd]
-
-tbuffer_store_format_d16_x v4, v1, s[8:11], s3, format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_SNORM] offen offset:52
-// GFX11: encoding: [0x34,0x00,0x5e,0xe9,0x01,0x04,0x42,0x03]
-
-tbuffer_store_format_d16_x v4, v1, s[8:11], s3, format:[BUF_FMT_8_8_8_8_USCALED] idxen offset:52
-// GFX11: encoding: [0x34,0x00,0x66,0xe9,0x01,0x04,0x82,0x03]
-
-tbuffer_store_format_d16_x v4, v[1:2], s[8:11], s0, format:44 idxen offen offset:52
-// GFX11: encoding: [0x34,0x00,0x66,0xe9,0x01,0x04,0xc2,0x00]
-
-tbuffer_store_format_d16_x v4, off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_USCALED] offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x66,0xe9,0x00,0x04,0x1c,0x03]
-
-tbuffer_store_format_d16_x v4, off, ttmp[4:7], s3, format:[BUF_FMT_8_8_8_8_SSCALED] offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x6e,0xe9,0x00,0x04,0x1c,0x03]
-
-tbuffer_store_format_d16_x v4, off, ttmp[4:7], s3, format:45 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x6e,0xe9,0x00,0x04,0x1c,0x03]
-
-tbuffer_store_format_d16_x v4, off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_SSCALED] offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x6e,0xe9,0x00,0x04,0x1c,0x03]
+// GFX11: tbuffer_store_d16_format_x v4, off, s[8:11], s3 format:[BUF_FMT_2_10_10_10_SINT] offset:4095 ; encoding: [0xff,0x0f,0x4e,0xe9,0x00,0x04,0x02,0x03]
tbuffer_store_format_d16_xy v4, off, s[8:11], s3, format:[BUF_FMT_8_8_8_8_UINT] offset:4095
-// GFX11: encoding: [0xff,0x8f,0x76,0xe9,0x00,0x04,0x02,0x03]
-
-tbuffer_store_format_d16_xy v255, off, s[8:11], s3, format:46 offset:4095
-// GFX11: encoding: [0xff,0x8f,0x76,0xe9,0x00,0xff,0x02,0x03]
-
-tbuffer_store_format_d16_xy v4, off, s[12:15], s3, format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_UINT] offset:4095
-// GFX11: encoding: [0xff,0x8f,0x76,0xe9,0x00,0x04,0x03,0x03]
-
-tbuffer_store_format_d16_xy v4, off, s[12:15], s101, format:[BUF_FMT_8_8_8_8_SINT] offset:4095
-// GFX11: encoding: [0xff,0x8f,0x7e,0xe9,0x00,0x04,0x03,0x65]
-
-tbuffer_store_format_d16_xy v4, off, s[12:15], m0, format:47 offset:4095
-// GFX11: encoding: [0xff,0x8f,0x7e,0xe9,0x00,0x04,0x03,0x7d]
-
-tbuffer_store_format_d16_xy v4, off, s[8:11], 0, format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_SINT] offset:4095
-// GFX11: encoding: [0xff,0x8f,0x7e,0xe9,0x00,0x04,0x02,0x80]
-
-tbuffer_store_format_d16_xy v4, off, s[8:11], 61, format:[BUF_FMT_32_32_UINT] offset:4095
-// GFX11: encoding: [0xff,0x8f,0x86,0xe9,0x00,0x04,0x02,0xbd]
-
-tbuffer_store_format_d16_xy v4, off, ttmp[4:7], 61, format:48 offset:4095
-// GFX11: encoding: [0xff,0x8f,0x86,0xe9,0x00,0x04,0x1c,0xbd]
-
-tbuffer_store_format_d16_xy v4, v1, s[8:11], s3, format:[BUF_DATA_FORMAT_32_32, BUF_NUM_FORMAT_UINT] offen offset:52
-// GFX11: encoding: [0x34,0x80,0x86,0xe9,0x01,0x04,0x42,0x03]
-
-tbuffer_store_format_d16_xy v4, v1, s[8:11], s3, format:[BUF_FMT_32_32_SINT] idxen offset:52
-// GFX11: encoding: [0x34,0x80,0x8e,0xe9,0x01,0x04,0x82,0x03]
-
-tbuffer_store_format_d16_xy v4, v[1:2], s[8:11], s0, format:49 idxen offen offset:52
-// GFX11: encoding: [0x34,0x80,0x8e,0xe9,0x01,0x04,0xc2,0x00]
-
-tbuffer_store_format_d16_xy v4, off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_32_32, BUF_NUM_FORMAT_SINT] offset:4095 glc
-// GFX11: encoding: [0xff,0xcf,0x8e,0xe9,0x00,0x04,0x1c,0x03]
-
-tbuffer_store_format_d16_xy v4, off, ttmp[4:7], s3, format:[BUF_FMT_32_32_FLOAT] offset:4095 slc
-// GFX11: encoding: [0xff,0x9f,0x96,0xe9,0x00,0x04,0x1c,0x03]
-
-tbuffer_store_format_d16_xy v4, off, ttmp[4:7], s3, format:50 offset:4095 dlc
-// GFX11: encoding: [0xff,0xaf,0x96,0xe9,0x00,0x04,0x1c,0x03]
-
-tbuffer_store_format_d16_xy v4, off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_32_32, BUF_NUM_FORMAT_FLOAT] offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0xff,0x96,0xe9,0x00,0x04,0x1c,0x03]
+// GFX11: tbuffer_store_d16_format_xy v4, off, s[8:11], s3 format:[BUF_FMT_8_8_8_8_UINT] offset:4095 ; encoding: [0xff,0x8f,0x76,0xe9,0x00,0x04,0x02,0x03]
tbuffer_store_format_d16_xyz v[4:5], off, s[8:11], s3, format:[BUF_FMT_16_16_16_16_UNORM] offset:4095
-// GFX11: encoding: [0xff,0x0f,0x9f,0xe9,0x00,0x04,0x02,0x03]
-
-tbuffer_store_format_d16_xyz v[254:255], off, s[8:11], s3, format:51 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x9f,0xe9,0x00,0xfe,0x02,0x03]
-
-tbuffer_store_format_d16_xyz v[4:5], off, s[12:15], s3, format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_UNORM] offset:4095
-// GFX11: encoding: [0xff,0x0f,0x9f,0xe9,0x00,0x04,0x03,0x03]
-
-tbuffer_store_format_d16_xyz v[4:5], off, s[12:15], s101, format:[BUF_FMT_16_16_16_16_SNORM] offset:4095
-// GFX11: encoding: [0xff,0x0f,0xa7,0xe9,0x00,0x04,0x03,0x65]
-
-tbuffer_store_format_d16_xyz v[4:5], off, s[12:15], m0, format:52 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xa7,0xe9,0x00,0x04,0x03,0x7d]
-
-tbuffer_store_format_d16_xyz v[4:5], off, s[8:11], 0, format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_SNORM] offset:4095
-// GFX11: encoding: [0xff,0x0f,0xa7,0xe9,0x00,0x04,0x02,0x80]
-
-tbuffer_store_format_d16_xyz v[4:5], off, s[8:11], 61, format:[BUF_FMT_16_16_16_16_USCALED] offset:4095
-// GFX11: encoding: [0xff,0x0f,0xaf,0xe9,0x00,0x04,0x02,0xbd]
-
-tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], 61, format:53 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xaf,0xe9,0x00,0x04,0x1c,0xbd]
-
-tbuffer_store_format_d16_xyz v[4:5], v1, s[8:11], s3, format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_USCALED] offen offset:52
-// GFX11: encoding: [0x34,0x00,0xaf,0xe9,0x01,0x04,0x42,0x03]
-
-tbuffer_store_format_d16_xyz v[4:5], v1, s[8:11], s3, format:[BUF_FMT_16_16_16_16_SSCALED] idxen offset:52
-// GFX11: encoding: [0x34,0x00,0xb7,0xe9,0x01,0x04,0x82,0x03]
-
-tbuffer_store_format_d16_xyz v[4:5], v[1:2], s[8:11], s0, format:54 idxen offen offset:52
-// GFX11: encoding: [0x34,0x00,0xb7,0xe9,0x01,0x04,0xc2,0x00]
-
-tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_SSCALED] offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xb7,0xe9,0x00,0x04,0x1c,0x03]
-
-tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], s3, format:[BUF_FMT_16_16_16_16_UINT] offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0xbf,0xe9,0x00,0x04,0x1c,0x03]
-
-tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], s3, format:55 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0xbf,0xe9,0x00,0x04,0x1c,0x03]
-
-tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_UINT] offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0xbf,0xe9,0x00,0x04,0x1c,0x03]
+// GFX11: tbuffer_store_d16_format_xyz v[4:5], off, s[8:11], s3 format:[BUF_FMT_16_16_16_16_UNORM] offset:4095 ; encoding: [0xff,0x0f,0x9f,0xe9,0x00,0x04,0x02,0x03]
tbuffer_store_format_d16_xyzw v[4:5], off, s[8:11], s3, format:[BUF_FMT_16_16_16_16_SINT] offset:4095
-// GFX11: encoding: [0xff,0x8f,0xc7,0xe9,0x00,0x04,0x02,0x03]
-
-tbuffer_store_format_d16_xyzw v[254:255], off, s[8:11], s3, format:56 offset:4095
-// GFX11: encoding: [0xff,0x8f,0xc7,0xe9,0x00,0xfe,0x02,0x03]
-
-tbuffer_store_format_d16_xyzw v[4:5], off, s[12:15], s3, format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_SINT] offset:4095
-// GFX11: encoding: [0xff,0x8f,0xc7,0xe9,0x00,0x04,0x03,0x03]
-
-tbuffer_store_format_d16_xyzw v[4:5], off, s[12:15], s101, format:[BUF_FMT_16_16_16_16_FLOAT] offset:4095
-// GFX11: encoding: [0xff,0x8f,0xcf,0xe9,0x00,0x04,0x03,0x65]
-
-tbuffer_store_format_d16_xyzw v[4:5], off, s[12:15], m0, format:57 offset:4095
-// GFX11: encoding: [0xff,0x8f,0xcf,0xe9,0x00,0x04,0x03,0x7d]
-
-tbuffer_store_format_d16_xyzw v[4:5], off, s[8:11], 0, format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_FLOAT] offset:4095
-// GFX11: encoding: [0xff,0x8f,0xcf,0xe9,0x00,0x04,0x02,0x80]
-
-tbuffer_store_format_d16_xyzw v[4:5], off, s[8:11], 61, format:[BUF_FMT_32_32_32_UINT] offset:4095
-// GFX11: encoding: [0xff,0x8f,0xd7,0xe9,0x00,0x04,0x02,0xbd]
-
-tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], 61, format:58 offset:4095
-// GFX11: encoding: [0xff,0x8f,0xd7,0xe9,0x00,0x04,0x1c,0xbd]
-
-tbuffer_store_format_d16_xyzw v[4:5], v1, s[8:11], s3, format:[BUF_DATA_FORMAT_32_32_32, BUF_NUM_FORMAT_UINT] offen offset:52
-// GFX11: encoding: [0x34,0x80,0xd7,0xe9,0x01,0x04,0x42,0x03]
-
-tbuffer_store_format_d16_xyzw v[4:5], v1, s[8:11], s3, format:[BUF_FMT_32_32_32_SINT] idxen offset:52
-// GFX11: encoding: [0x34,0x80,0xdf,0xe9,0x01,0x04,0x82,0x03]
-
-tbuffer_store_format_d16_xyzw v[4:5], v[1:2], s[8:11], s0, format:59 idxen offen offset:52
-// GFX11: encoding: [0x34,0x80,0xdf,0xe9,0x01,0x04,0xc2,0x00]
-
-tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_32_32_32, BUF_NUM_FORMAT_SINT] offset:4095 glc
-// GFX11: encoding: [0xff,0xcf,0xdf,0xe9,0x00,0x04,0x1c,0x03]
-
-tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], s3, format:[BUF_FMT_32_32_32_FLOAT] offset:4095 slc
-// GFX11: encoding: [0xff,0x9f,0xe7,0xe9,0x00,0x04,0x1c,0x03]
-
-tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], s3, format:60 offset:4095 dlc
-// GFX11: encoding: [0xff,0xaf,0xe7,0xe9,0x00,0x04,0x1c,0x03]
-
-tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_32_32_32, BUF_NUM_FORMAT_FLOAT] offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0xff,0xe7,0xe9,0x00,0x04,0x1c,0x03]
-
-//Removed formats (compared to gfx10)
-
-tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_10_11_11_UNORM] offset:4095
-// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format
-
-tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_10_11_11_SNORM] offset:4095
-// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format
-
-tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_10_11_11_USCALED] offset:4095
-// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format
-
-tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_10_11_11_SSCALED] offset:4095
-// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format
-
-tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_10_11_11_UINT] offset:4095
-// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format
-
-tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_10_11_11_SINT] offset:4095
-// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format
-
-tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_11_11_10_UNORM] offset:4095
-// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format
-
-tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_11_11_10_SNORM] offset:4095
-// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format
-
-tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_11_11_10_USCALED] offset:4095
-// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format
-
-tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_11_11_10_SSCALED] offset:4095
-// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format
-
-tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_11_11_10_UINT] offset:4095
-// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format
-
-tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_10_10_10_2_USCALED] offset:4095
-// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format
-
-tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_10_10_10_2_SSCALED] offset:4095
-// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format
+// GFX11: tbuffer_store_d16_format_xyzw v[4:5], off, s[8:11], s3 format:[BUF_FMT_16_16_16_16_SINT] offset:4095 ; encoding: [0xff,0x8f,0xc7,0xe9,0x00,0x04,0x02,0x03]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_alias.s b/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_alias.s
index 9cf7503e5f37..c65dc4dbe032 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_alias.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_alias.s
@@ -1,3453 +1,187 @@
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck --check-prefix=GFX11 %s
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX11-ERR --implicit-check-not=error: %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck --check-prefix=GFX11 %s
buffer_load_dword v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dword v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_load_dword v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_load_dword v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_load_dword v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_load_dword v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_load_dword v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_load_dword v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_load_dword v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_load_dword v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_load_dword v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_load_dword v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_load_dword v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x50,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dword v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x50,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dword v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x50,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dword v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x50,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dword v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x50,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dword v5, off, s[8:11], s3 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x50,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dword v5, off, s[8:11], s3 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x50,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dword v5, off, s[8:11], s3 offset:4095 lds
-// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX11: buffer_load_b32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0x03]
buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dwordx2 v[254:255], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0xfe,0x02,0x03]
-
-buffer_load_dwordx2 v[5:6], off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_load_dwordx2 v[5:6], off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_load_dwordx2 v[5:6], off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_load_dwordx2 v[5:6], off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_load_dwordx2 v[5:6], off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_load_dwordx2 v[5:6], off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_load_dwordx2 v[5:6], off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_load_dwordx2 v[5:6], off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_load_dwordx2 v[5:6], v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_load_dwordx2 v[5:6], v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_load_dwordx2 v[5:6], off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x54,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x54,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x54,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x54,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x54,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x54,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dwordx2 v[5:7], off, s[8:11], s3 offset:4095 dlc tfe
-// GFX11: encoding: [0xff,0x2f,0x54,0xe0,0x00,0x05,0x22,0x03]
-
-buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x54,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_load_b64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0x03]
buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dwordx3 v[253:255], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0xfd,0x02,0x03]
-
-buffer_load_dwordx3 v[5:7], off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_load_dwordx3 v[5:7], off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_load_dwordx3 v[5:7], off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_load_dwordx3 v[5:7], off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_load_dwordx3 v[5:7], off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_load_dwordx3 v[5:7], off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_load_dwordx3 v[5:7], off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_load_dwordx3 v[5:7], off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_load_dwordx3 v[5:7], v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_load_dwordx3 v[5:7], v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_load_dwordx3 v[5:7], off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x58,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x58,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x58,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x58,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x58,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x58,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x58,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_load_b96 v[5:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0x03]
buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dwordx4 v[252:255], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0xfc,0x02,0x03]
-
-buffer_load_dwordx4 v[5:8], off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_load_dwordx4 v[5:8], off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_load_dwordx4 v[5:8], off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_load_dwordx4 v[5:8], off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_load_dwordx4 v[5:8], off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_load_dwordx4 v[5:8], off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_load_dwordx4 v[5:8], off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_load_dwordx4 v[5:8], off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_load_dwordx4 v[5:8], v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_load_dwordx4 v[5:8], v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_load_dwordx4 v[5:8], off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x5c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x5c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x5c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x5c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x5c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x5c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x5c,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_load_b128 v[5:8], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0x03]
buffer_load_short_d16 v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_short_d16 v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_load_short_d16 v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_load_short_d16 v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_load_short_d16 v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_load_short_d16 v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_load_short_d16 v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_load_short_d16 v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_load_short_d16 v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_load_short_d16 v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_load_short_d16 v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_load_short_d16 v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_load_short_d16 v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x80,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_short_d16 v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x80,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_short_d16 v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x80,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_short_d16 v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x80,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_short_d16 v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x80,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_short_d16 v5, off, s[8:11], s3 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x80,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_short_d16 v5, off, s[8:11], s3 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x80,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_load_d16_b16 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x02,0x03]
buffer_load_format_d16_x v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_x v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_load_format_d16_x v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_load_format_d16_x v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_load_format_d16_x v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_load_format_d16_x v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_load_format_d16_x v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_load_format_d16_x v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_load_format_d16_x v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_load_format_d16_x v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_load_format_d16_x v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_load_format_d16_x v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_load_format_d16_x v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x20,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_x v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x20,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_x v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x20,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_x v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x20,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_x v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x20,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_x v5, off, s[8:11], s3 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x20,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_x v5, off, s[8:11], s3 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x20,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_load_d16_format_x v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0x03]
buffer_load_format_d16_xy v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_xy v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_load_format_d16_xy v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_load_format_d16_xy v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_load_format_d16_xy v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_load_format_d16_xy v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_load_format_d16_xy v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_load_format_d16_xy v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_load_format_d16_xy v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_load_format_d16_xy v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_load_format_d16_xy v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_load_format_d16_xy v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_load_format_d16_xy v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x24,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_xy v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x24,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_xy v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x24,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_xy v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x24,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_xy v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x24,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_xy v5, off, s[8:11], s3 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x24,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_xy v5, off, s[8:11], s3 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x24,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_load_d16_format_xy v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0x03]
buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_xyz v[254:255], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0xfe,0x02,0x03]
-
-buffer_load_format_d16_xyz v[5:6], off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_load_format_d16_xyz v[5:6], off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_load_format_d16_xyz v[5:6], off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_load_format_d16_xyz v[5:6], off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_load_format_d16_xyz v[5:6], off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_load_format_d16_xyz v[5:6], off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_load_format_d16_xyz v[5:6], off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_load_format_d16_xyz v[5:6], off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_load_format_d16_xyz v[5:6], v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_load_format_d16_xyz v[5:6], v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x28,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x28,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x28,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x28,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x28,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x28,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x28,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_load_d16_format_xyz v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0x03]
buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_xyzw v[254:255], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0xfe,0x02,0x03]
-
-buffer_load_format_d16_xyzw v[5:6], off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_load_format_d16_xyzw v[5:6], off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_load_format_d16_xyzw v[5:6], off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_load_format_d16_xyzw v[5:6], off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_load_format_d16_xyzw v[5:6], off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_load_format_d16_xyzw v[5:6], off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_load_format_d16_xyzw v[5:6], off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_load_format_d16_xyzw v[5:6], v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_load_format_d16_xyzw v[5:6], v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x2c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x2c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x2c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x2c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x2c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x2c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x2c,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_load_d16_format_xyzw v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0x03]
buffer_load_short_d16_hi v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_short_d16_hi v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_load_short_d16_hi v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_load_short_d16_hi v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_load_short_d16_hi v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_load_short_d16_hi v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_load_short_d16_hi v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_load_short_d16_hi v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_load_short_d16_hi v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_load_short_d16_hi v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_load_short_d16_hi v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_load_short_d16_hi v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_load_short_d16_hi v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x8c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_short_d16_hi v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x8c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_short_d16_hi v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x8c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_short_d16_hi v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x8c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_short_d16_hi v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x8c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_short_d16_hi v5, off, s[8:11], s3 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x8c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_short_d16_hi v5, off, s[8:11], s3 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x8c,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_load_d16_hi_b16 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x02,0x03]
buffer_load_format_d16_hi_x v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_hi_x v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_load_format_d16_hi_x v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_load_format_d16_hi_x v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_load_format_d16_hi_x v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_load_format_d16_hi_x v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_load_format_d16_hi_x v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_load_format_d16_hi_x v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_load_format_d16_hi_x v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_load_format_d16_hi_x v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_load_format_d16_hi_x v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_load_format_d16_hi_x v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_load_format_d16_hi_x v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x98,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_hi_x v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x98,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_hi_x v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x98,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_hi_x v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x98,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_hi_x v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x98,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_hi_x v5, off, s[8:11], s3 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x98,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_format_d16_hi_x v5, off, s[8:11], s3 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x98,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_load_d16_hi_format_x v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x02,0x03]
buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sbyte_d16_hi v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_load_sbyte_d16_hi v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_load_sbyte_d16_hi v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_load_sbyte_d16_hi v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_load_sbyte_d16_hi v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_load_sbyte_d16_hi v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_load_sbyte_d16_hi v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_load_sbyte_d16_hi v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_load_sbyte_d16_hi v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_load_sbyte_d16_hi v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_load_sbyte_d16_hi v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_load_sbyte_d16_hi v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x88,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x88,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x88,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x88,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x88,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x88,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x88,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_load_d16_hi_i8 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x02,0x03]
buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ubyte_d16_hi v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_load_ubyte_d16_hi v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_load_ubyte_d16_hi v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_load_ubyte_d16_hi v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_load_ubyte_d16_hi v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_load_ubyte_d16_hi v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_load_ubyte_d16_hi v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_load_ubyte_d16_hi v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_load_ubyte_d16_hi v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_load_ubyte_d16_hi v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_load_ubyte_d16_hi v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_load_ubyte_d16_hi v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x84,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x84,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x84,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x84,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x84,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x84,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x84,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_load_d16_hi_u8 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x02,0x03]
buffer_load_sbyte_d16 v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sbyte_d16 v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_load_sbyte_d16 v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_load_sbyte_d16 v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_load_sbyte_d16 v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_load_sbyte_d16 v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_load_sbyte_d16 v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_load_sbyte_d16 v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_load_sbyte_d16 v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_load_sbyte_d16 v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_load_sbyte_d16 v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_load_sbyte_d16 v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_load_sbyte_d16 v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x7c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sbyte_d16 v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x7c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sbyte_d16 v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x7c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sbyte_d16 v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x7c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sbyte_d16 v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x7c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sbyte_d16 v5, off, s[8:11], s3 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x7c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sbyte_d16 v5, off, s[8:11], s3 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x7c,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_load_d16_i8 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x02,0x03]
buffer_load_ubyte_d16 v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ubyte_d16 v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_load_ubyte_d16 v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_load_ubyte_d16 v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_load_ubyte_d16 v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_load_ubyte_d16 v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_load_ubyte_d16 v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_load_ubyte_d16 v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_load_ubyte_d16 v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_load_ubyte_d16 v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_load_ubyte_d16 v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_load_ubyte_d16 v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_load_ubyte_d16 v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x78,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ubyte_d16 v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x78,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ubyte_d16 v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x78,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ubyte_d16 v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x78,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ubyte_d16 v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x78,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ubyte_d16 v5, off, s[8:11], s3 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x78,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ubyte_d16 v5, off, s[8:11], s3 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x78,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_load_d16_u8 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x02,0x03]
buffer_load_sbyte v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sbyte v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_load_sbyte v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_load_sbyte v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_load_sbyte v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_load_sbyte v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_load_sbyte v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_load_sbyte v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_load_sbyte v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_load_sbyte v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_load_sbyte v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_load_sbyte v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_load_sbyte v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x44,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sbyte v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x44,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sbyte v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x44,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sbyte v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x44,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sbyte v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x44,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sbyte v5, off, s[8:11], s3 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x44,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sbyte v5, off, s[8:11], s3 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x44,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_load_i8 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0x03]
buffer_load_sshort v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sshort v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_load_sshort v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_load_sshort v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_load_sshort v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_load_sshort v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_load_sshort v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_load_sshort v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_load_sshort v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_load_sshort v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_load_sshort v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_load_sshort v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_load_sshort v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x4c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sshort v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x4c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sshort v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x4c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sshort v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x4c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sshort v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x4c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sshort v5, off, s[8:11], s3 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x4c,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_sshort v5, off, s[8:11], s3 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x4c,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_load_i16 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0x03]
buffer_load_ubyte v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ubyte v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_load_ubyte v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_load_ubyte v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_load_ubyte v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_load_ubyte v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_load_ubyte v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_load_ubyte v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_load_ubyte v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_load_ubyte v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_load_ubyte v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_load_ubyte v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_load_ubyte v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x40,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ubyte v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x40,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ubyte v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x40,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ubyte v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x40,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ubyte v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x40,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ubyte v5, off, s[8:11], s3 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x40,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ubyte v5, off, s[8:11], s3 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x40,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_load_u8 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0x03]
buffer_load_ushort v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ushort v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_load_ushort v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_load_ushort v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_load_ushort v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_load_ushort v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_load_ushort v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_load_ushort v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_load_ushort v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_load_ushort v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_load_ushort v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_load_ushort v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_load_ushort v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x48,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ushort v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x48,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ushort v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x48,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ushort v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x48,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ushort v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x48,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ushort v5, off, s[8:11], s3 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x48,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_load_ushort v5, off, s[8:11], s3 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x48,0xe0,0x00,0x05,0x02,0x03]
-
+// GFX11: buffer_load_u16 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0x03]
buffer_store_byte v1, off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_byte v255, off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0xff,0x03,0x04]
-
-buffer_store_byte v1, off, s[16:19], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x04,0x04]
-
-buffer_store_byte v1, off, s[96:99], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x18,0x04]
-
-buffer_store_byte v1, off, s[12:15], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0x65]
-
-buffer_store_byte v1, off, s[12:15], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0x7d]
-
-buffer_store_byte v1, off, s[12:15], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0x80]
-
-buffer_store_byte v1, off, s[12:15], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0xc1]
-
-buffer_store_byte v1, off, s[12:15], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0xf0]
-
-buffer_store_byte v1, off, s[12:15], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0xf7]
-
-buffer_store_byte v1, v0, s[12:15], s4 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x04]
-
-buffer_store_byte v1, v0, s[12:15], s4 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x43,0x04]
-
-buffer_store_byte v1, off, s[12:15], s4
-// GFX11: encoding: [0x00,0x00,0x60,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_byte v1, off, s[12:15], s4 offset:0
-// GFX11: encoding: [0x00,0x00,0x60,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_byte v1, off, s[12:15], s4 offset:7
-// GFX11: encoding: [0x07,0x00,0x60,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_byte v1, off, s[12:15], s4 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x60,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_byte v1, off, s[12:15], s4 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x60,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_byte v1, off, s[12:15], s4 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x60,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_byte v1, off, s[12:15], s4 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x60,0xe0,0x00,0x01,0x03,0x04]
+// GFX11: buffer_store_b8 v1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0x04]
buffer_store_short v1, off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_short v255, off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0xff,0x03,0x04]
-
-buffer_store_short v1, off, s[16:19], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x04,0x04]
-
-buffer_store_short v1, off, s[96:99], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x18,0x04]
-
-buffer_store_short v1, off, s[12:15], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x03,0x65]
-
-buffer_store_short v1, off, s[12:15], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x03,0x7d]
-
-buffer_store_short v1, off, s[12:15], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x03,0x80]
-
-buffer_store_short v1, off, s[12:15], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x03,0xc1]
-
-buffer_store_short v1, off, s[12:15], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x03,0xf0]
-
-buffer_store_short v1, off, s[12:15], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x03,0xf7]
-
-buffer_store_short v1, v0, s[12:15], s4 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x04]
-
-buffer_store_short v1, v0, s[12:15], s4 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x43,0x04]
-
-buffer_store_short v1, off, s[12:15], s4
-// GFX11: encoding: [0x00,0x00,0x64,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_short v1, off, s[12:15], s4 offset:0
-// GFX11: encoding: [0x00,0x00,0x64,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_short v1, off, s[12:15], s4 offset:7
-// GFX11: encoding: [0x07,0x00,0x64,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_short v1, off, s[12:15], s4 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x64,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_short v1, off, s[12:15], s4 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x64,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_short v1, off, s[12:15], s4 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x64,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_short v1, off, s[12:15], s4 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x64,0xe0,0x00,0x01,0x03,0x04]
+// GFX11: buffer_store_b16 v1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x03,0x04]
buffer_store_dword v1, off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dword v255, off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0xff,0x03,0x04]
-
-buffer_store_dword v1, off, s[16:19], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x04,0x04]
-
-buffer_store_dword v1, off, s[96:99], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x18,0x04]
-
-buffer_store_dword v1, off, s[12:15], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0x65]
-
-buffer_store_dword v1, off, s[12:15], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0x7d]
-
-buffer_store_dword v1, off, s[12:15], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0x80]
-
-buffer_store_dword v1, off, s[12:15], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0xc1]
-
-buffer_store_dword v1, off, s[12:15], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0xf0]
-
-buffer_store_dword v1, off, s[12:15], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0xf7]
-
-buffer_store_dword v1, v0, s[12:15], s4 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x04]
-
-buffer_store_dword v1, v0, s[12:15], s4 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x43,0x04]
-
-buffer_store_dword v1, off, s[12:15], s4
-// GFX11: encoding: [0x00,0x00,0x68,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dword v1, off, s[12:15], s4 offset:0
-// GFX11: encoding: [0x00,0x00,0x68,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dword v1, off, s[12:15], s4 offset:7
-// GFX11: encoding: [0x07,0x00,0x68,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dword v1, off, s[12:15], s4 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x68,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dword v1, off, s[12:15], s4 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x68,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dword v1, off, s[12:15], s4 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x68,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dword v1, off, s[12:15], s4 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x68,0xe0,0x00,0x01,0x03,0x04]
+// GFX11: buffer_store_b32 v1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0x04]
buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dwordx2 v[254:255], off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0xfe,0x03,0x04]
-
-buffer_store_dwordx2 v[1:2], off, s[16:19], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x04,0x04]
-
-buffer_store_dwordx2 v[1:2], off, s[96:99], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x18,0x04]
-
-buffer_store_dwordx2 v[1:2], off, s[12:15], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x03,0x65]
-
-buffer_store_dwordx2 v[1:2], off, s[12:15], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x03,0x7d]
-
-buffer_store_dwordx2 v[1:2], off, s[12:15], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x03,0x80]
-
-buffer_store_dwordx2 v[1:2], off, s[12:15], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x03,0xc1]
-
-buffer_store_dwordx2 v[1:2], off, s[12:15], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x03,0xf0]
-
-buffer_store_dwordx2 v[1:2], off, s[12:15], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x03,0xf7]
-
-buffer_store_dwordx2 v[1:2], v0, s[12:15], s4 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x04]
-
-buffer_store_dwordx2 v[1:2], v0, s[12:15], s4 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x43,0x04]
-
-buffer_store_dwordx2 v[1:2], off, s[12:15], s4
-// GFX11: encoding: [0x00,0x00,0x6c,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:0
-// GFX11: encoding: [0x00,0x00,0x6c,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:7
-// GFX11: encoding: [0x07,0x00,0x6c,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x6c,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x6c,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x6c,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x6c,0xe0,0x00,0x01,0x03,0x04]
+// GFX11: buffer_store_b64 v[1:2], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x03,0x04]
buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dwordx3 v[253:255], off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0xfd,0x03,0x04]
-
-buffer_store_dwordx3 v[1:3], off, s[16:19], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x04,0x04]
-
-buffer_store_dwordx3 v[1:3], off, s[96:99], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x18,0x04]
-
-buffer_store_dwordx3 v[1:3], off, s[12:15], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0x65]
-
-buffer_store_dwordx3 v[1:3], off, s[12:15], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0x7d]
-
-buffer_store_dwordx3 v[1:3], off, s[12:15], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0x80]
-
-buffer_store_dwordx3 v[1:3], off, s[12:15], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0xc1]
-
-buffer_store_dwordx3 v[1:3], off, s[12:15], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0xf0]
-
-buffer_store_dwordx3 v[1:3], off, s[12:15], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0xf7]
-
-buffer_store_dwordx3 v[1:3], v0, s[12:15], s4 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x04]
-
-buffer_store_dwordx3 v[1:3], v0, s[12:15], s4 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x43,0x04]
-
-buffer_store_dwordx3 v[1:3], off, s[12:15], s4
-// GFX11: encoding: [0x00,0x00,0x70,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:0
-// GFX11: encoding: [0x00,0x00,0x70,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:7
-// GFX11: encoding: [0x07,0x00,0x70,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x70,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x70,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x70,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x70,0xe0,0x00,0x01,0x03,0x04]
+// GFX11: buffer_store_b96 v[1:3], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0x04]
buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dwordx4 v[252:255], off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0xfc,0x03,0x04]
-
-buffer_store_dwordx4 v[1:4], off, s[16:19], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x04,0x04]
-
-buffer_store_dwordx4 v[1:4], off, s[96:99], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x18,0x04]
-
-buffer_store_dwordx4 v[1:4], off, s[12:15], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0x65]
-
-buffer_store_dwordx4 v[1:4], off, s[12:15], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0x7d]
-
-buffer_store_dwordx4 v[1:4], off, s[12:15], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0x80]
-
-buffer_store_dwordx4 v[1:4], off, s[12:15], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0xc1]
-
-buffer_store_dwordx4 v[1:4], off, s[12:15], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0xf0]
-
-buffer_store_dwordx4 v[1:4], off, s[12:15], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0xf7]
-
-buffer_store_dwordx4 v[1:4], v0, s[12:15], s4 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x83,0x04]
-
-buffer_store_dwordx4 v[1:4], v0, s[12:15], s4 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x43,0x04]
-
-buffer_store_dwordx4 v[1:4], off, s[12:15], s4
-// GFX11: encoding: [0x00,0x00,0x74,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:0
-// GFX11: encoding: [0x00,0x00,0x74,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:7
-// GFX11: encoding: [0x07,0x00,0x74,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x74,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x74,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x74,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x74,0xe0,0x00,0x01,0x03,0x04]
+// GFX11: buffer_store_b128 v[1:4], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0x04]
buffer_store_format_d16_x v1, off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_x v255, off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0xff,0x03,0x04]
-
-buffer_store_format_d16_x v1, off, s[16:19], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x04,0x04]
-
-buffer_store_format_d16_x v1, off, s[96:99], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x18,0x04]
-
-buffer_store_format_d16_x v1, off, s[12:15], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x03,0x65]
-
-buffer_store_format_d16_x v1, off, s[12:15], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x03,0x7d]
-
-buffer_store_format_d16_x v1, off, s[12:15], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x03,0x80]
-
-buffer_store_format_d16_x v1, off, s[12:15], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x03,0xc1]
-
-buffer_store_format_d16_x v1, off, s[12:15], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x03,0xf0]
-
-buffer_store_format_d16_x v1, off, s[12:15], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x03,0xf7]
-
-buffer_store_format_d16_x v1, v0, s[12:15], s4 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x04]
-
-buffer_store_format_d16_x v1, v0, s[12:15], s4 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x43,0x04]
-
-buffer_store_format_d16_x v1, off, s[12:15], s4
-// GFX11: encoding: [0x00,0x00,0x30,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_x v1, off, s[12:15], s4 offset:0
-// GFX11: encoding: [0x00,0x00,0x30,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_x v1, off, s[12:15], s4 offset:7
-// GFX11: encoding: [0x07,0x00,0x30,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_x v1, off, s[12:15], s4 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x30,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_x v1, off, s[12:15], s4 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x30,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_x v1, off, s[12:15], s4 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x30,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_x v1, off, s[12:15], s4 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x30,0xe0,0x00,0x01,0x03,0x04]
+// GFX11: buffer_store_d16_format_x v1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x03,0x04]
buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_xy v255, off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0xff,0x03,0x04]
-
-buffer_store_format_d16_xy v1, off, s[16:19], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x04,0x04]
-
-buffer_store_format_d16_xy v1, off, s[96:99], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x18,0x04]
-
-buffer_store_format_d16_xy v1, off, s[12:15], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x03,0x65]
-
-buffer_store_format_d16_xy v1, off, s[12:15], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x03,0x7d]
-
-buffer_store_format_d16_xy v1, off, s[12:15], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x03,0x80]
-
-buffer_store_format_d16_xy v1, off, s[12:15], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x03,0xc1]
-
-buffer_store_format_d16_xy v1, off, s[12:15], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x03,0xf0]
-
-buffer_store_format_d16_xy v1, off, s[12:15], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x03,0xf7]
-
-buffer_store_format_d16_xy v1, v0, s[12:15], s4 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x04]
-
-buffer_store_format_d16_xy v1, v0, s[12:15], s4 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x43,0x04]
-
-buffer_store_format_d16_xy v1, off, s[12:15], s4
-// GFX11: encoding: [0x00,0x00,0x34,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:0
-// GFX11: encoding: [0x00,0x00,0x34,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:7
-// GFX11: encoding: [0x07,0x00,0x34,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x34,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x34,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x34,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x34,0xe0,0x00,0x01,0x03,0x04]
+// GFX11: buffer_store_d16_format_xy v1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x03,0x04]
buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_xyz v[254:255], off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0xfe,0x03,0x04]
-
-buffer_store_format_d16_xyz v[1:2], off, s[16:19], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x04,0x04]
-
-buffer_store_format_d16_xyz v[1:2], off, s[96:99], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x18,0x04]
-
-buffer_store_format_d16_xyz v[1:2], off, s[12:15], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x03,0x65]
-
-buffer_store_format_d16_xyz v[1:2], off, s[12:15], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x03,0x7d]
-
-buffer_store_format_d16_xyz v[1:2], off, s[12:15], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x03,0x80]
-
-buffer_store_format_d16_xyz v[1:2], off, s[12:15], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x03,0xc1]
-
-buffer_store_format_d16_xyz v[1:2], off, s[12:15], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x03,0xf0]
-
-buffer_store_format_d16_xyz v[1:2], off, s[12:15], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x03,0xf7]
-
-buffer_store_format_d16_xyz v[1:2], v0, s[12:15], s4 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x83,0x04]
-
-buffer_store_format_d16_xyz v[1:2], v0, s[12:15], s4 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x43,0x04]
-
-buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4
-// GFX11: encoding: [0x00,0x00,0x38,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:0
-// GFX11: encoding: [0x00,0x00,0x38,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:7
-// GFX11: encoding: [0x07,0x00,0x38,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x38,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x38,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x38,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x38,0xe0,0x00,0x01,0x03,0x04]
+// GFX11: buffer_store_d16_format_xyz v[1:2], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x03,0x04]
buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_xyzw v[254:255], off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0xfe,0x03,0x04]
-
-buffer_store_format_d16_xyzw v[1:2], off, s[16:19], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x04,0x04]
-
-buffer_store_format_d16_xyzw v[1:2], off, s[96:99], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x18,0x04]
-
-buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x03,0x65]
-
-buffer_store_format_d16_xyzw v[1:2], off, s[12:15], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x03,0x7d]
-
-buffer_store_format_d16_xyzw v[1:2], off, s[12:15], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x03,0x80]
-
-buffer_store_format_d16_xyzw v[1:2], off, s[12:15], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x03,0xc1]
-
-buffer_store_format_d16_xyzw v[1:2], off, s[12:15], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x03,0xf0]
-
-buffer_store_format_d16_xyzw v[1:2], off, s[12:15], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x03,0xf7]
-
-buffer_store_format_d16_xyzw v[1:2], v0, s[12:15], s4 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x83,0x04]
-
-buffer_store_format_d16_xyzw v[1:2], v0, s[12:15], s4 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x43,0x04]
-
-buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4
-// GFX11: encoding: [0x00,0x00,0x3c,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:0
-// GFX11: encoding: [0x00,0x00,0x3c,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:7
-// GFX11: encoding: [0x07,0x00,0x3c,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x3c,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x3c,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x3c,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x3c,0xe0,0x00,0x01,0x03,0x04]
+// GFX11: buffer_store_d16_format_xyzw v[1:2], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x03,0x04]
buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_byte_d16_hi v255, off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0xff,0x03,0x04]
-
-buffer_store_byte_d16_hi v1, off, s[16:19], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x04,0x04]
-
-buffer_store_byte_d16_hi v1, off, s[96:99], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x18,0x04]
-
-buffer_store_byte_d16_hi v1, off, s[12:15], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x03,0x65]
-
-buffer_store_byte_d16_hi v1, off, s[12:15], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x03,0x7d]
-
-buffer_store_byte_d16_hi v1, off, s[12:15], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x03,0x80]
-
-buffer_store_byte_d16_hi v1, off, s[12:15], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x03,0xc1]
-
-buffer_store_byte_d16_hi v1, off, s[12:15], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x03,0xf0]
-
-buffer_store_byte_d16_hi v1, off, s[12:15], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x03,0xf7]
-
-buffer_store_byte_d16_hi v1, v0, s[12:15], s4 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x83,0x04]
-
-buffer_store_byte_d16_hi v1, v0, s[12:15], s4 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x43,0x04]
-
-buffer_store_byte_d16_hi v1, off, s[12:15], s4
-// GFX11: encoding: [0x00,0x00,0x90,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:0
-// GFX11: encoding: [0x00,0x00,0x90,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:7
-// GFX11: encoding: [0x07,0x00,0x90,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x90,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x90,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x90,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x90,0xe0,0x00,0x01,0x03,0x04]
+// GFX11: buffer_store_d16_hi_b8 v1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x03,0x04]
buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_short_d16_hi v255, off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0xff,0x03,0x04]
-
-buffer_store_short_d16_hi v1, off, s[16:19], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x04,0x04]
-
-buffer_store_short_d16_hi v1, off, s[96:99], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x18,0x04]
-
-buffer_store_short_d16_hi v1, off, s[12:15], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x03,0x65]
-
-buffer_store_short_d16_hi v1, off, s[12:15], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x03,0x7d]
-
-buffer_store_short_d16_hi v1, off, s[12:15], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x03,0x80]
-
-buffer_store_short_d16_hi v1, off, s[12:15], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x03,0xc1]
-
-buffer_store_short_d16_hi v1, off, s[12:15], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x03,0xf0]
-
-buffer_store_short_d16_hi v1, off, s[12:15], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x03,0xf7]
-
-buffer_store_short_d16_hi v1, v0, s[12:15], s4 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x83,0x04]
-
-buffer_store_short_d16_hi v1, v0, s[12:15], s4 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x43,0x04]
-
-buffer_store_short_d16_hi v1, off, s[12:15], s4
-// GFX11: encoding: [0x00,0x00,0x94,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:0
-// GFX11: encoding: [0x00,0x00,0x94,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:7
-// GFX11: encoding: [0x07,0x00,0x94,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x94,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x94,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x94,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x94,0xe0,0x00,0x01,0x03,0x04]
+// GFX11: buffer_store_d16_hi_b16 v1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x03,0x04]
buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_hi_x v255, off, s[12:15], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0xff,0x03,0x04]
-
-buffer_store_format_d16_hi_x v1, off, s[16:19], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x04,0x04]
-
-buffer_store_format_d16_hi_x v1, off, s[96:99], s4 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x18,0x04]
-
-buffer_store_format_d16_hi_x v1, off, s[12:15], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x03,0x65]
-
-buffer_store_format_d16_hi_x v1, off, s[12:15], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x03,0x7d]
-
-buffer_store_format_d16_hi_x v1, off, s[12:15], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x03,0x80]
-
-buffer_store_format_d16_hi_x v1, off, s[12:15], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x03,0xc1]
-
-buffer_store_format_d16_hi_x v1, off, s[12:15], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x03,0xf0]
-
-buffer_store_format_d16_hi_x v1, off, s[12:15], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x03,0xf7]
-
-buffer_store_format_d16_hi_x v1, v0, s[12:15], s4 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x83,0x04]
-
-buffer_store_format_d16_hi_x v1, v0, s[12:15], s4 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x43,0x04]
-
-buffer_store_format_d16_hi_x v1, off, s[12:15], s4
-// GFX11: encoding: [0x00,0x00,0x9c,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:0
-// GFX11: encoding: [0x00,0x00,0x9c,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:7
-// GFX11: encoding: [0x07,0x00,0x9c,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x9c,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x9c,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:4095 dlc
-// GFX11: encoding: [0xff,0x2f,0x9c,0xe0,0x00,0x01,0x03,0x04]
-
-buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:4095 glc slc dlc
-// GFX11: encoding: [0xff,0x7f,0x9c,0xe0,0x00,0x01,0x03,0x04]
+// GFX11: buffer_store_d16_hi_format_x v1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x03,0x04]
buffer_atomic_add v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_add v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_atomic_add v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_atomic_add v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_atomic_add v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_atomic_add v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_add v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_atomic_add v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_add v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_add v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_add v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_atomic_add v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_atomic_add v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0xd4,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_add v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0xd4,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_add v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0xd4,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_add v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xd4,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_add v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0xd4,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_add v5, off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0xd4,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_add_u32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x02,0x03]
buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_add_x2 v[254:255], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0xfe,0x02,0x03]
-
-buffer_atomic_add_x2 v[5:6], off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x03,0x03]
-
-buffer_atomic_add_x2 v[5:6], off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x18,0x03]
-
-buffer_atomic_add_x2 v[5:6], off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x02,0x65]
-
-buffer_atomic_add_x2 v[5:6], off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_add_x2 v[5:6], off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x02,0x80]
-
-buffer_atomic_add_x2 v[5:6], off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_add_x2 v[5:6], off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_add_x2 v[5:6], off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_add_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x03]
-
-buffer_atomic_add_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x42,0x03]
-
-buffer_atomic_add_x2 v[5:6], off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x0c,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x0c,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x0c,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x0c,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x0c,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0x0c,0xe1,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_add_u64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x02,0x03]
buffer_atomic_and v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_and v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_atomic_and v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_atomic_and v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_atomic_and v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_atomic_and v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_and v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_atomic_and v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_and v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_and v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_and v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_atomic_and v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_atomic_and v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0xf0,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_and v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0xf0,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_and v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0xf0,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_and v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xf0,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_and v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0xf0,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_and v5, off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0xf0,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_and_b32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x02,0x03]
buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_and_x2 v[254:255], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0xfe,0x02,0x03]
-
-buffer_atomic_and_x2 v[5:6], off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x03,0x03]
-
-buffer_atomic_and_x2 v[5:6], off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x18,0x03]
-
-buffer_atomic_and_x2 v[5:6], off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x02,0x65]
-
-buffer_atomic_and_x2 v[5:6], off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_and_x2 v[5:6], off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x02,0x80]
-
-buffer_atomic_and_x2 v[5:6], off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_and_x2 v[5:6], off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_and_x2 v[5:6], off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_and_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x03]
-
-buffer_atomic_and_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x42,0x03]
-
-buffer_atomic_and_x2 v[5:6], off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x24,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x24,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x24,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x24,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x24,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0x24,0xe1,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_and_b64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x02,0x03]
buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_cmpswap v[254:255], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0xfe,0x02,0x03]
-
-buffer_atomic_cmpswap v[5:6], off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_atomic_cmpswap v[5:6], off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_atomic_cmpswap v[5:6], off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_atomic_cmpswap v[5:6], off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_cmpswap v[5:6], off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_atomic_cmpswap v[5:6], off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_cmpswap v[5:6], off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_cmpswap v[5:6], off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_cmpswap v[5:6], v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_atomic_cmpswap v[5:6], v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_atomic_cmpswap v[5:6], off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0xd0,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0xd0,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0xd0,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xd0,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0xd0,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0xd0,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_cmpswap_b32 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x02,0x03]
buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_cmpswap_x2 v[252:255], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0xfc,0x02,0x03]
-
-buffer_atomic_cmpswap_x2 v[5:8], off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x03,0x03]
-
-buffer_atomic_cmpswap_x2 v[5:8], off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x18,0x03]
-
-buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x02,0x65]
-
-buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x02,0x80]
-
-buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_cmpswap_x2 v[5:8], v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x03]
-
-buffer_atomic_cmpswap_x2 v[5:8], v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x42,0x03]
-
-buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x08,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x08,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x08,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x08,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x08,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0x08,0xe1,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_cmpswap_b64 v[5:8], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x02,0x03]
buffer_atomic_fcmpswap v[5:6], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_fcmpswap v[254:255], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0xfe,0x02,0x03]
-
-buffer_atomic_fcmpswap v[5:6], off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x03,0x03]
-
-buffer_atomic_fcmpswap v[5:6], off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x18,0x03]
-
-buffer_atomic_fcmpswap v[5:6], off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x02,0x65]
-
-buffer_atomic_fcmpswap v[5:6], off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_fcmpswap v[5:6], off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x02,0x80]
-
-buffer_atomic_fcmpswap v[5:6], off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_fcmpswap v[5:6], off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_fcmpswap v[5:6], off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_fcmpswap v[5:6], v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x82,0x03]
-
-buffer_atomic_fcmpswap v[5:6], v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x42,0x03]
-
-buffer_atomic_fcmpswap v[5:6], off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x40,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_fcmpswap v[5:6], off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x40,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_fcmpswap v[5:6], off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x40,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_fcmpswap v[5:6], off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x40,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_fcmpswap v[5:6], off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x40,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_fcmpswap v[5:6], off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0x40,0xe1,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_cmpswap_f32 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x02,0x03]
buffer_atomic_csub v255, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_atomic_csub v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xdc,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_atomic_csub v5, off, s[12:15], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_atomic_csub v5, off, s[96:99], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_atomic_csub v5, off, s[8:11], s101 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_atomic_csub v5, off, s[8:11], m0 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_csub v5, off, s[8:11], 0 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_atomic_csub v5, off, s[8:11], -1 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_csub v5, off, s[8:11], 0.5 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_csub v5, off, s[8:11], -4.0 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_csub v5, v0, s[8:11], s3 idxen offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_atomic_csub v5, v0, s[8:11], s3 offen offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_atomic_csub v5, off, s[8:11], s3 glc
-// GFX11: encoding: [0x00,0x40,0xdc,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_csub v5, off, s[8:11], s3 offset:0 glc
-// GFX11: encoding: [0x00,0x40,0xdc,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_csub v5, off, s[8:11], s3 offset:7 glc
-// GFX11: encoding: [0x07,0x40,0xdc,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_csub v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_csub v5, off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0xdc,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_csub v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0xdc,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_csub_u32 v255, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0xdc,0xe0,0x00,0xff,0x02,0x03]
buffer_atomic_dec v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_dec v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0xff,0x02,0x03]
-
-buffer_atomic_dec v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x03,0x03]
-
-buffer_atomic_dec v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x18,0x03]
-
-buffer_atomic_dec v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x02,0x65]
-
-buffer_atomic_dec v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_dec v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x02,0x80]
-
-buffer_atomic_dec v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_dec v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_dec v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_dec v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x03]
-
-buffer_atomic_dec v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x42,0x03]
-
-buffer_atomic_dec v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x00,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_dec v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x00,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_dec v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x00,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_dec v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x00,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_dec v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x00,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_dec v5, off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0x00,0xe1,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_dec_u32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x02,0x03]
buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_dec_x2 v[254:255], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0xfe,0x02,0x03]
-
-buffer_atomic_dec_x2 v[5:6], off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x03,0x03]
-
-buffer_atomic_dec_x2 v[5:6], off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x18,0x03]
-
-buffer_atomic_dec_x2 v[5:6], off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x65]
-
-buffer_atomic_dec_x2 v[5:6], off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_dec_x2 v[5:6], off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x80]
-
-buffer_atomic_dec_x2 v[5:6], off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_dec_x2 v[5:6], off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_dec_x2 v[5:6], off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_dec_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x82,0x03]
-
-buffer_atomic_dec_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x42,0x03]
-
-buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x34,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x34,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x34,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x34,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x34,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0x34,0xe1,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_dec_u64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x03]
buffer_atomic_inc v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_inc v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_atomic_inc v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_atomic_inc v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_atomic_inc v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_atomic_inc v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_inc v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_atomic_inc v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_inc v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_inc v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_inc v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_atomic_inc v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_atomic_inc v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0xfc,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_inc v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0xfc,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_inc v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0xfc,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_inc v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xfc,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_inc v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0xfc,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_inc v5, off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0xfc,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_inc_u32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x02,0x03]
buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_inc_x2 v[254:255], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0xfe,0x02,0x03]
-
-buffer_atomic_inc_x2 v[5:6], off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x03,0x03]
-
-buffer_atomic_inc_x2 v[5:6], off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x18,0x03]
-
-buffer_atomic_inc_x2 v[5:6], off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x02,0x65]
-
-buffer_atomic_inc_x2 v[5:6], off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_inc_x2 v[5:6], off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x02,0x80]
-
-buffer_atomic_inc_x2 v[5:6], off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_inc_x2 v[5:6], off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_inc_x2 v[5:6], off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_inc_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x03]
-
-buffer_atomic_inc_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x42,0x03]
-
-buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x30,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x30,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x30,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x30,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x30,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0x30,0xe1,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_inc_u64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x02,0x03]
buffer_atomic_fmax v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_fmax v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0xff,0x02,0x03]
-
-buffer_atomic_fmax v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x03,0x03]
-
-buffer_atomic_fmax v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x18,0x03]
-
-buffer_atomic_fmax v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x65]
-
-buffer_atomic_fmax v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_fmax v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x80]
-
-buffer_atomic_fmax v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_fmax v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_fmax v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_fmax v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x82,0x03]
-
-buffer_atomic_fmax v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x42,0x03]
-
-buffer_atomic_fmax v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x48,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_fmax v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x48,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_fmax v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x48,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_fmax v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x48,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_fmax v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x48,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_fmax v5, off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0x48,0xe1,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_max_f32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x03]
buffer_atomic_smax v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smax v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_atomic_smax v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_atomic_smax v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_atomic_smax v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_atomic_smax v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_smax v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_atomic_smax v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_smax v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_smax v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_smax v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_atomic_smax v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_atomic_smax v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0xe8,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smax v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0xe8,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smax v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0xe8,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smax v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xe8,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smax v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0xe8,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smax v5, off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0xe8,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_max_i32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x02,0x03]
buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smax_x2 v[254:255], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0xfe,0x02,0x03]
-
-buffer_atomic_smax_x2 v[5:6], off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x03,0x03]
-
-buffer_atomic_smax_x2 v[5:6], off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x18,0x03]
-
-buffer_atomic_smax_x2 v[5:6], off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x02,0x65]
-
-buffer_atomic_smax_x2 v[5:6], off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_smax_x2 v[5:6], off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x02,0x80]
-
-buffer_atomic_smax_x2 v[5:6], off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_smax_x2 v[5:6], off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_smax_x2 v[5:6], off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_smax_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x03]
-
-buffer_atomic_smax_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x42,0x03]
-
-buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x1c,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x1c,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x1c,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x1c,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x1c,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0x1c,0xe1,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_max_i64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x02,0x03]
buffer_atomic_umax v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umax v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_atomic_umax v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_atomic_umax v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_atomic_umax v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_atomic_umax v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_umax v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_atomic_umax v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_umax v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_umax v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_umax v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_atomic_umax v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_atomic_umax v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0xec,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umax v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0xec,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umax v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0xec,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umax v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xec,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umax v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0xec,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umax v5, off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0xec,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_max_u32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x02,0x03]
buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umax_x2 v[254:255], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0xfe,0x02,0x03]
-
-buffer_atomic_umax_x2 v[5:6], off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x03,0x03]
-
-buffer_atomic_umax_x2 v[5:6], off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x18,0x03]
-
-buffer_atomic_umax_x2 v[5:6], off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x02,0x65]
-
-buffer_atomic_umax_x2 v[5:6], off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_umax_x2 v[5:6], off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x02,0x80]
-
-buffer_atomic_umax_x2 v[5:6], off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_umax_x2 v[5:6], off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_umax_x2 v[5:6], off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_umax_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x03]
-
-buffer_atomic_umax_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x42,0x03]
-
-buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x20,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x20,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x20,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x20,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x20,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0x20,0xe1,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_max_u64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x02,0x03]
buffer_atomic_fmin v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_fmin v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0xff,0x02,0x03]
-
-buffer_atomic_fmin v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x03,0x03]
-
-buffer_atomic_fmin v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x18,0x03]
-
-buffer_atomic_fmin v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x02,0x65]
-
-buffer_atomic_fmin v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_fmin v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x02,0x80]
-
-buffer_atomic_fmin v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_fmin v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_fmin v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_fmin v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x82,0x03]
-
-buffer_atomic_fmin v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x42,0x03]
-
-buffer_atomic_fmin v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x44,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_fmin v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x44,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_fmin v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x44,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_fmin v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x44,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_fmin v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x44,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_fmin v5, off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0x44,0xe1,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x02,0x03]
buffer_atomic_smin v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smin v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_atomic_smin v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_atomic_smin v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_atomic_smin v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_atomic_smin v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_smin v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_atomic_smin v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_smin v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_smin v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_smin v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_atomic_smin v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_atomic_smin v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0xe0,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smin v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0xe0,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smin v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0xe0,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smin v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xe0,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smin v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0xe0,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smin v5, off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0xe0,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_min_i32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x02,0x03]
buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smin_x2 v[254:255], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0xfe,0x02,0x03]
-
-buffer_atomic_smin_x2 v[5:6], off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x03,0x03]
-
-buffer_atomic_smin_x2 v[5:6], off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x18,0x03]
-
-buffer_atomic_smin_x2 v[5:6], off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x02,0x65]
-
-buffer_atomic_smin_x2 v[5:6], off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_smin_x2 v[5:6], off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x02,0x80]
-
-buffer_atomic_smin_x2 v[5:6], off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_smin_x2 v[5:6], off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_smin_x2 v[5:6], off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_smin_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x03]
-
-buffer_atomic_smin_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x42,0x03]
-
-buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x14,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x14,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x14,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x14,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x14,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0x14,0xe1,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_min_i64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x02,0x03]
buffer_atomic_umin v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umin v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_atomic_umin v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_atomic_umin v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_atomic_umin v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_atomic_umin v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_umin v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_atomic_umin v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_umin v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_umin v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_umin v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_atomic_umin v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_atomic_umin v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0xe4,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umin v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0xe4,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umin v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0xe4,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umin v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xe4,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umin v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0xe4,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umin v5, off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0xe4,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_min_u32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x02,0x03]
buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umin_x2 v[254:255], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0xfe,0x02,0x03]
-
-buffer_atomic_umin_x2 v[5:6], off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x03,0x03]
-
-buffer_atomic_umin_x2 v[5:6], off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x18,0x03]
-
-buffer_atomic_umin_x2 v[5:6], off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x02,0x65]
-
-buffer_atomic_umin_x2 v[5:6], off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_umin_x2 v[5:6], off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x02,0x80]
-
-buffer_atomic_umin_x2 v[5:6], off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_umin_x2 v[5:6], off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_umin_x2 v[5:6], off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_umin_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x03]
-
-buffer_atomic_umin_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x42,0x03]
-
-buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x18,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x18,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x18,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x18,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x18,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0x18,0xe1,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_min_u64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x02,0x03]
buffer_atomic_or v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_or v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_atomic_or v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_atomic_or v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_atomic_or v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_atomic_or v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_or v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_atomic_or v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_or v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_or v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_or v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_atomic_or v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_atomic_or v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0xf4,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_or v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0xf4,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_or v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0xf4,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_or v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xf4,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_or v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0xf4,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_or v5, off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0xf4,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_or_b32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x02,0x03]
buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_or_x2 v[254:255], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0xfe,0x02,0x03]
-
-buffer_atomic_or_x2 v[5:6], off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x03,0x03]
-
-buffer_atomic_or_x2 v[5:6], off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x18,0x03]
-
-buffer_atomic_or_x2 v[5:6], off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x02,0x65]
-
-buffer_atomic_or_x2 v[5:6], off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_or_x2 v[5:6], off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x02,0x80]
-
-buffer_atomic_or_x2 v[5:6], off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_or_x2 v[5:6], off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_or_x2 v[5:6], off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_or_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x03]
-
-buffer_atomic_or_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x42,0x03]
-
-buffer_atomic_or_x2 v[5:6], off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x28,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x28,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x28,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x28,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x28,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0x28,0xe1,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_or_b64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x02,0x03]
buffer_atomic_sub v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_sub v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_atomic_sub v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_atomic_sub v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_atomic_sub v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_atomic_sub v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_sub v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_atomic_sub v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_sub v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_sub v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_sub v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_atomic_sub v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_atomic_sub v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0xd8,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_sub v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0xd8,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_sub v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0xd8,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_sub v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xd8,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_sub v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0xd8,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_sub v5, off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0xd8,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_sub_u32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x02,0x03]
buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_sub_x2 v[254:255], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0xfe,0x02,0x03]
-
-buffer_atomic_sub_x2 v[5:6], off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x03,0x03]
-
-buffer_atomic_sub_x2 v[5:6], off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x18,0x03]
-
-buffer_atomic_sub_x2 v[5:6], off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x02,0x65]
-
-buffer_atomic_sub_x2 v[5:6], off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_sub_x2 v[5:6], off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x02,0x80]
-
-buffer_atomic_sub_x2 v[5:6], off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_sub_x2 v[5:6], off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_sub_x2 v[5:6], off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_sub_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x03]
-
-buffer_atomic_sub_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x42,0x03]
-
-buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x10,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x10,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x10,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x10,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x10,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0x10,0xe1,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_sub_u64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x02,0x03]
buffer_atomic_swap v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_swap v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_atomic_swap v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_atomic_swap v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_atomic_swap v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_atomic_swap v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_swap v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_atomic_swap v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_swap v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_swap v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_swap v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_atomic_swap v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_atomic_swap v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0xcc,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_swap v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0xcc,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_swap v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0xcc,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_swap v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xcc,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_swap v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0xcc,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_swap v5, off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0xcc,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_swap_b32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x02,0x03]
buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_swap_x2 v[254:255], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0xfe,0x02,0x03]
-
-buffer_atomic_swap_x2 v[5:6], off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x03,0x03]
-
-buffer_atomic_swap_x2 v[5:6], off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x18,0x03]
-
-buffer_atomic_swap_x2 v[5:6], off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x02,0x65]
-
-buffer_atomic_swap_x2 v[5:6], off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_swap_x2 v[5:6], off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x02,0x80]
-
-buffer_atomic_swap_x2 v[5:6], off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_swap_x2 v[5:6], off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_swap_x2 v[5:6], off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_swap_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x82,0x03]
-
-buffer_atomic_swap_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x42,0x03]
-
-buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x04,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x04,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x04,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x04,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x04,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0x04,0xe1,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_swap_b64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x02,0x03]
buffer_atomic_xor v5, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_xor v255, off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0xff,0x02,0x03]
-
-buffer_atomic_xor v5, off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x03,0x03]
-
-buffer_atomic_xor v5, off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x18,0x03]
-
-buffer_atomic_xor v5, off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x02,0x65]
-
-buffer_atomic_xor v5, off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_xor v5, off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x02,0x80]
-
-buffer_atomic_xor v5, off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_xor v5, off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_xor v5, off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_xor v5, v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x82,0x03]
-
-buffer_atomic_xor v5, v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x42,0x03]
-
-buffer_atomic_xor v5, off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0xf8,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_xor v5, off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0xf8,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_xor v5, off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0xf8,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_xor v5, off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0xf8,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_xor v5, off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0xf8,0xe0,0x00,0x05,0x02,0x03]
-
-buffer_atomic_xor v5, off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0xf8,0xe0,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_xor_b32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x02,0x03]
buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_xor_x2 v[254:255], off, s[8:11], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0xfe,0x02,0x03]
-
-buffer_atomic_xor_x2 v[5:6], off, s[12:15], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x03,0x03]
-
-buffer_atomic_xor_x2 v[5:6], off, s[96:99], s3 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x18,0x03]
-
-buffer_atomic_xor_x2 v[5:6], off, s[8:11], s101 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x02,0x65]
-
-buffer_atomic_xor_x2 v[5:6], off, s[8:11], m0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x02,0x7d]
-
-buffer_atomic_xor_x2 v[5:6], off, s[8:11], 0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x02,0x80]
-
-buffer_atomic_xor_x2 v[5:6], off, s[8:11], -1 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x02,0xc1]
-
-buffer_atomic_xor_x2 v[5:6], off, s[8:11], 0.5 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x02,0xf0]
-
-buffer_atomic_xor_x2 v[5:6], off, s[8:11], -4.0 offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x02,0xf7]
-
-buffer_atomic_xor_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x03]
-
-buffer_atomic_xor_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
-// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x42,0x03]
-
-buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3
-// GFX11: encoding: [0x00,0x00,0x2c,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX11: encoding: [0x00,0x00,0x2c,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX11: encoding: [0x07,0x00,0x2c,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
-// GFX11: encoding: [0xff,0x4f,0x2c,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
-// GFX11: encoding: [0xff,0x1f,0x2c,0xe1,0x00,0x05,0x02,0x03]
-
-buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc
-// GFX11: encoding: [0xff,0x5f,0x2c,0xe1,0x00,0x05,0x02,0x03]
+// GFX11: buffer_atomic_xor_b64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x02,0x03]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_smem_alias.s b/llvm/test/MC/AMDGPU/gfx11_asm_smem_alias.s
index 933193194090..1d0facd3d3ff 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_smem_alias.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_smem_alias.s
@@ -1,452 +1,31 @@
// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s
-//===----------------------------------------------------------------------===//
-// ENC_SMEM.
-//===----------------------------------------------------------------------===//
-
s_load_dword s5, s[2:3], s0
// GFX11: s_load_b32 s5, s[2:3], s0 ; encoding: [0x41,0x01,0x00,0xf4,0x00,0x00,0x00,0x00]
-s_load_dword s101, s[2:3], s0
-// GFX11: s_load_b32 s101, s[2:3], s0 ; encoding: [0x41,0x19,0x00,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dword vcc_lo, s[2:3], s0
-// GFX11: s_load_b32 vcc_lo, s[2:3], s0 ; encoding: [0x81,0x1a,0x00,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dword vcc_hi, s[2:3], s0
-// GFX11: s_load_b32 vcc_hi, s[2:3], s0 ; encoding: [0xc1,0x1a,0x00,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dword s5, s[4:5], s0
-// GFX11: s_load_b32 s5, s[4:5], s0 ; encoding: [0x42,0x01,0x00,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dword s5, s[100:101], s0
-// GFX11: s_load_b32 s5, s[100:101], s0 ; encoding: [0x72,0x01,0x00,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dword s5, vcc, s0
-// GFX11: s_load_b32 s5, vcc, s0 ; encoding: [0x75,0x01,0x00,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dword s5, s[2:3], s101
-// GFX11: s_load_b32 s5, s[2:3], s101 ; encoding: [0x41,0x01,0x00,0xf4,0x00,0x00,0x00,0xca]
-
-s_load_dword s5, s[2:3], vcc_lo
-// GFX11: s_load_b32 s5, s[2:3], vcc_lo ; encoding: [0x41,0x01,0x00,0xf4,0x00,0x00,0x00,0xd4]
-
-s_load_dword s5, s[2:3], vcc_hi
-// GFX11: s_load_b32 s5, s[2:3], vcc_hi ; encoding: [0x41,0x01,0x00,0xf4,0x00,0x00,0x00,0xd6]
-
-s_load_dword s5, s[2:3], m0
-// GFX11: s_load_b32 s5, s[2:3], m0 ; encoding: [0x41,0x01,0x00,0xf4,0x00,0x00,0x00,0xfa]
-
-s_load_dword s5, s[2:3], 0x0
-// GFX11: s_load_b32 s5, s[2:3], 0x0 ; encoding: [0x41,0x01,0x00,0xf4,0x00,0x00,0x00,0xf8]
-
-s_load_dword s5, s[2:3], s0 glc
-// GFX11: s_load_b32 s5, s[2:3], s0 glc ; encoding: [0x41,0x41,0x00,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dword s5, s[2:3], s0 dlc
-// GFX11: s_load_b32 s5, s[2:3], s0 dlc ; encoding: [0x41,0x21,0x00,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dword s5, s[2:3], s0 glc dlc
-// GFX11: s_load_b32 s5, s[2:3], s0 glc dlc ; encoding: [0x41,0x61,0x00,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dword s5, s[2:3], 0x1234 glc dlc
-// GFX11: s_load_b32 s5, s[2:3], 0x1234 glc dlc ; encoding: [0x41,0x61,0x00,0xf4,0x34,0x12,0x00,0xf8]
-
s_load_dwordx2 s[10:11], s[2:3], s0
// GFX11: s_load_b64 s[10:11], s[2:3], s0 ; encoding: [0x81,0x02,0x04,0xf4,0x00,0x00,0x00,0x00]
-s_load_dwordx2 s[12:13], s[2:3], s0
-// GFX11: s_load_b64 s[12:13], s[2:3], s0 ; encoding: [0x01,0x03,0x04,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx2 s[100:101], s[2:3], s0
-// GFX11: s_load_b64 s[100:101], s[2:3], s0 ; encoding: [0x01,0x19,0x04,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx2 vcc, s[2:3], s0
-// GFX11: s_load_b64 vcc, s[2:3], s0 ; encoding: [0x81,0x1a,0x04,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx2 s[10:11], s[4:5], s0
-// GFX11: s_load_b64 s[10:11], s[4:5], s0 ; encoding: [0x82,0x02,0x04,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx2 s[10:11], s[100:101], s0
-// GFX11: s_load_b64 s[10:11], s[100:101], s0 ; encoding: [0xb2,0x02,0x04,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx2 s[10:11], vcc, s0
-// GFX11: s_load_b64 s[10:11], vcc, s0 ; encoding: [0xb5,0x02,0x04,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx2 s[10:11], s[2:3], s101
-// GFX11: s_load_b64 s[10:11], s[2:3], s101 ; encoding: [0x81,0x02,0x04,0xf4,0x00,0x00,0x00,0xca]
-
-s_load_dwordx2 s[10:11], s[2:3], vcc_lo
-// GFX11: s_load_b64 s[10:11], s[2:3], vcc_lo ; encoding: [0x81,0x02,0x04,0xf4,0x00,0x00,0x00,0xd4]
-
-s_load_dwordx2 s[10:11], s[2:3], vcc_hi
-// GFX11: s_load_b64 s[10:11], s[2:3], vcc_hi ; encoding: [0x81,0x02,0x04,0xf4,0x00,0x00,0x00,0xd6]
-
-s_load_dwordx2 s[10:11], s[2:3], m0
-// GFX11: s_load_b64 s[10:11], s[2:3], m0 ; encoding: [0x81,0x02,0x04,0xf4,0x00,0x00,0x00,0xfa]
-
-s_load_dwordx2 s[10:11], s[2:3], 0x0
-// GFX11: s_load_b64 s[10:11], s[2:3], 0x0 ; encoding: [0x81,0x02,0x04,0xf4,0x00,0x00,0x00,0xf8]
-
-s_load_dwordx2 s[10:11], s[2:3], s0 glc
-// GFX11: s_load_b64 s[10:11], s[2:3], s0 glc ; encoding: [0x81,0x42,0x04,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx2 s[10:11], s[2:3], s0 dlc
-// GFX11: s_load_b64 s[10:11], s[2:3], s0 dlc ; encoding: [0x81,0x22,0x04,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx2 s[10:11], s[2:3], s0 glc dlc
-// GFX11: s_load_b64 s[10:11], s[2:3], s0 glc dlc ; encoding: [0x81,0x62,0x04,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx2 s[10:11], s[2:3], 0x1234 glc dlc
-// GFX11: s_load_b64 s[10:11], s[2:3], 0x1234 glc dlc ; encoding: [0x81,0x62,0x04,0xf4,0x34,0x12,0x00,0xf8]
-
s_load_dwordx4 s[20:23], s[2:3], s0
// GFX11: s_load_b128 s[20:23], s[2:3], s0 ; encoding: [0x01,0x05,0x08,0xf4,0x00,0x00,0x00,0x00]
-s_load_dwordx4 s[24:27], s[2:3], s0
-// GFX11: s_load_b128 s[24:27], s[2:3], s0 ; encoding: [0x01,0x06,0x08,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx4 s[96:99], s[2:3], s0
-// GFX11: s_load_b128 s[96:99], s[2:3], s0 ; encoding: [0x01,0x18,0x08,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx4 s[20:23], s[4:5], s0
-// GFX11: s_load_b128 s[20:23], s[4:5], s0 ; encoding: [0x02,0x05,0x08,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx4 s[20:23], s[100:101], s0
-// GFX11: s_load_b128 s[20:23], s[100:101], s0 ; encoding: [0x32,0x05,0x08,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx4 s[20:23], vcc, s0
-// GFX11: s_load_b128 s[20:23], vcc, s0 ; encoding: [0x35,0x05,0x08,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx4 s[20:23], s[2:3], s101
-// GFX11: s_load_b128 s[20:23], s[2:3], s101 ; encoding: [0x01,0x05,0x08,0xf4,0x00,0x00,0x00,0xca]
-
-s_load_dwordx4 s[20:23], s[2:3], vcc_lo
-// GFX11: s_load_b128 s[20:23], s[2:3], vcc_lo ; encoding: [0x01,0x05,0x08,0xf4,0x00,0x00,0x00,0xd4]
-
-s_load_dwordx4 s[20:23], s[2:3], vcc_hi
-// GFX11: s_load_b128 s[20:23], s[2:3], vcc_hi ; encoding: [0x01,0x05,0x08,0xf4,0x00,0x00,0x00,0xd6]
-
-s_load_dwordx4 s[20:23], s[2:3], m0
-// GFX11: s_load_b128 s[20:23], s[2:3], m0 ; encoding: [0x01,0x05,0x08,0xf4,0x00,0x00,0x00,0xfa]
-
-s_load_dwordx4 s[20:23], s[2:3], 0x0
-// GFX11: s_load_b128 s[20:23], s[2:3], 0x0 ; encoding: [0x01,0x05,0x08,0xf4,0x00,0x00,0x00,0xf8]
-
-s_load_dwordx4 s[20:23], s[2:3], s0 glc
-// GFX11: s_load_b128 s[20:23], s[2:3], s0 glc ; encoding: [0x01,0x45,0x08,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx4 s[20:23], s[2:3], s0 dlc
-// GFX11: s_load_b128 s[20:23], s[2:3], s0 dlc ; encoding: [0x01,0x25,0x08,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx4 s[20:23], s[2:3], s0 glc dlc
-// GFX11: s_load_b128 s[20:23], s[2:3], s0 glc dlc ; encoding: [0x01,0x65,0x08,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx4 s[20:23], s[2:3], 0x1234 glc dlc
-// GFX11: s_load_b128 s[20:23], s[2:3], 0x1234 glc dlc ; encoding: [0x01,0x65,0x08,0xf4,0x34,0x12,0x00,0xf8]
-
s_load_dwordx8 s[20:27], s[2:3], s0
// GFX11: s_load_b256 s[20:27], s[2:3], s0 ; encoding: [0x01,0x05,0x0c,0xf4,0x00,0x00,0x00,0x00]
-s_load_dwordx8 s[24:31], s[2:3], s0
-// GFX11: s_load_b256 s[24:31], s[2:3], s0 ; encoding: [0x01,0x06,0x0c,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx8 s[92:99], s[2:3], s0
-// GFX11: s_load_b256 s[92:99], s[2:3], s0 ; encoding: [0x01,0x17,0x0c,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx8 s[20:27], s[4:5], s0
-// GFX11: s_load_b256 s[20:27], s[4:5], s0 ; encoding: [0x02,0x05,0x0c,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx8 s[20:27], s[100:101], s0
-// GFX11: s_load_b256 s[20:27], s[100:101], s0 ; encoding: [0x32,0x05,0x0c,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx8 s[20:27], vcc, s0
-// GFX11: s_load_b256 s[20:27], vcc, s0 ; encoding: [0x35,0x05,0x0c,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx8 s[20:27], s[2:3], s101
-// GFX11: s_load_b256 s[20:27], s[2:3], s101 ; encoding: [0x01,0x05,0x0c,0xf4,0x00,0x00,0x00,0xca]
-
-s_load_dwordx8 s[20:27], s[2:3], vcc_lo
-// GFX11: s_load_b256 s[20:27], s[2:3], vcc_lo ; encoding: [0x01,0x05,0x0c,0xf4,0x00,0x00,0x00,0xd4]
-
-s_load_dwordx8 s[20:27], s[2:3], vcc_hi
-// GFX11: s_load_b256 s[20:27], s[2:3], vcc_hi ; encoding: [0x01,0x05,0x0c,0xf4,0x00,0x00,0x00,0xd6]
-
-s_load_dwordx8 s[20:27], s[2:3], m0
-// GFX11: s_load_b256 s[20:27], s[2:3], m0 ; encoding: [0x01,0x05,0x0c,0xf4,0x00,0x00,0x00,0xfa]
-
-s_load_dwordx8 s[20:27], s[2:3], 0x0
-// GFX11: s_load_b256 s[20:27], s[2:3], 0x0 ; encoding: [0x01,0x05,0x0c,0xf4,0x00,0x00,0x00,0xf8]
-
-s_load_dwordx8 s[20:27], s[2:3], s0 glc
-// GFX11: s_load_b256 s[20:27], s[2:3], s0 glc ; encoding: [0x01,0x45,0x0c,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx8 s[20:27], s[2:3], s0 dlc
-// GFX11: s_load_b256 s[20:27], s[2:3], s0 dlc ; encoding: [0x01,0x25,0x0c,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx8 s[20:27], s[2:3], s0 glc dlc
-// GFX11: s_load_b256 s[20:27], s[2:3], s0 glc dlc ; encoding: [0x01,0x65,0x0c,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx8 s[20:27], s[2:3], 0x1234 glc dlc
-// GFX11: s_load_b256 s[20:27], s[2:3], 0x1234 glc dlc ; encoding: [0x01,0x65,0x0c,0xf4,0x34,0x12,0x00,0xf8]
-
s_load_dwordx16 s[20:35], s[2:3], s0
// GFX11: s_load_b512 s[20:35], s[2:3], s0 ; encoding: [0x01,0x05,0x10,0xf4,0x00,0x00,0x00,0x00]
-s_load_dwordx16 s[24:39], s[2:3], s0
-// GFX11: s_load_b512 s[24:39], s[2:3], s0 ; encoding: [0x01,0x06,0x10,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx16 s[84:99], s[2:3], s0
-// GFX11: s_load_b512 s[84:99], s[2:3], s0 ; encoding: [0x01,0x15,0x10,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx16 s[20:35], s[4:5], s0
-// GFX11: s_load_b512 s[20:35], s[4:5], s0 ; encoding: [0x02,0x05,0x10,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx16 s[20:35], s[100:101], s0
-// GFX11: s_load_b512 s[20:35], s[100:101], s0 ; encoding: [0x32,0x05,0x10,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx16 s[20:35], vcc, s0
-// GFX11: s_load_b512 s[20:35], vcc, s0 ; encoding: [0x35,0x05,0x10,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx16 s[20:35], s[2:3], s101
-// GFX11: s_load_b512 s[20:35], s[2:3], s101 ; encoding: [0x01,0x05,0x10,0xf4,0x00,0x00,0x00,0xca]
-
-s_load_dwordx16 s[20:35], s[2:3], vcc_lo
-// GFX11: s_load_b512 s[20:35], s[2:3], vcc_lo ; encoding: [0x01,0x05,0x10,0xf4,0x00,0x00,0x00,0xd4]
-
-s_load_dwordx16 s[20:35], s[2:3], vcc_hi
-// GFX11: s_load_b512 s[20:35], s[2:3], vcc_hi ; encoding: [0x01,0x05,0x10,0xf4,0x00,0x00,0x00,0xd6]
-
-s_load_dwordx16 s[20:35], s[2:3], m0
-// GFX11: s_load_b512 s[20:35], s[2:3], m0 ; encoding: [0x01,0x05,0x10,0xf4,0x00,0x00,0x00,0xfa]
-
-s_load_dwordx16 s[20:35], s[2:3], 0x0
-// GFX11: s_load_b512 s[20:35], s[2:3], 0x0 ; encoding: [0x01,0x05,0x10,0xf4,0x00,0x00,0x00,0xf8]
-
-s_load_dwordx16 s[20:35], s[2:3], s0 glc
-// GFX11: s_load_b512 s[20:35], s[2:3], s0 glc ; encoding: [0x01,0x45,0x10,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx16 s[20:35], s[2:3], s0 dlc
-// GFX11: s_load_b512 s[20:35], s[2:3], s0 dlc ; encoding: [0x01,0x25,0x10,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx16 s[20:35], s[2:3], s0 glc dlc
-// GFX11: s_load_b512 s[20:35], s[2:3], s0 glc dlc ; encoding: [0x01,0x65,0x10,0xf4,0x00,0x00,0x00,0x00]
-
-s_load_dwordx16 s[20:35], s[2:3], 0x1234 glc dlc
-// GFX11: s_load_b512 s[20:35], s[2:3], 0x1234 glc dlc ; encoding: [0x01,0x65,0x10,0xf4,0x34,0x12,0x00,0xf8]
-
s_buffer_load_dword s5, s[4:7], s0
// GFX11: s_buffer_load_b32 s5, s[4:7], s0 ; encoding: [0x42,0x01,0x20,0xf4,0x00,0x00,0x00,0x00]
-s_buffer_load_dword s101, s[4:7], s0
-// GFX11: s_buffer_load_b32 s101, s[4:7], s0 ; encoding: [0x42,0x19,0x20,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dword vcc_lo, s[4:7], s0
-// GFX11: s_buffer_load_b32 vcc_lo, s[4:7], s0 ; encoding: [0x82,0x1a,0x20,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dword vcc_hi, s[4:7], s0
-// GFX11: s_buffer_load_b32 vcc_hi, s[4:7], s0 ; encoding: [0xc2,0x1a,0x20,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dword s5, s[8:11], s0
-// GFX11: s_buffer_load_b32 s5, s[8:11], s0 ; encoding: [0x44,0x01,0x20,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dword s5, s[96:99], s0
-// GFX11: s_buffer_load_b32 s5, s[96:99], s0 ; encoding: [0x70,0x01,0x20,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dword s5, s[4:7], s101
-// GFX11: s_buffer_load_b32 s5, s[4:7], s101 ; encoding: [0x42,0x01,0x20,0xf4,0x00,0x00,0x00,0xca]
-
-s_buffer_load_dword s5, s[4:7], vcc_lo
-// GFX11: s_buffer_load_b32 s5, s[4:7], vcc_lo ; encoding: [0x42,0x01,0x20,0xf4,0x00,0x00,0x00,0xd4]
-
-s_buffer_load_dword s5, s[4:7], vcc_hi
-// GFX11: s_buffer_load_b32 s5, s[4:7], vcc_hi ; encoding: [0x42,0x01,0x20,0xf4,0x00,0x00,0x00,0xd6]
-
-s_buffer_load_dword s5, s[4:7], m0
-// GFX11: s_buffer_load_b32 s5, s[4:7], m0 ; encoding: [0x42,0x01,0x20,0xf4,0x00,0x00,0x00,0xfa]
-
-s_buffer_load_dword s5, s[4:7], 0x0
-// GFX11: s_buffer_load_b32 s5, s[4:7], 0x0 ; encoding: [0x42,0x01,0x20,0xf4,0x00,0x00,0x00,0xf8]
-
-s_buffer_load_dword s5, s[4:7], s0 glc
-// GFX11: s_buffer_load_b32 s5, s[4:7], s0 glc ; encoding: [0x42,0x41,0x20,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dword s5, s[4:7], s0 dlc
-// GFX11: s_buffer_load_b32 s5, s[4:7], s0 dlc ; encoding: [0x42,0x21,0x20,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dword s5, s[4:7], s0 glc dlc
-// GFX11: s_buffer_load_b32 s5, s[4:7], s0 glc dlc ; encoding: [0x42,0x61,0x20,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dword s5, s[4:7], 0x1234 glc dlc
-// GFX11: s_buffer_load_b32 s5, s[4:7], 0x1234 glc dlc ; encoding: [0x42,0x61,0x20,0xf4,0x34,0x12,0x00,0xf8]
-
s_buffer_load_dwordx2 s[10:11], s[4:7], s0
// GFX11: s_buffer_load_b64 s[10:11], s[4:7], s0 ; encoding: [0x82,0x02,0x24,0xf4,0x00,0x00,0x00,0x00]
-s_buffer_load_dwordx2 s[12:13], s[4:7], s0
-// GFX11: s_buffer_load_b64 s[12:13], s[4:7], s0 ; encoding: [0x02,0x03,0x24,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx2 s[100:101], s[4:7], s0
-// GFX11: s_buffer_load_b64 s[100:101], s[4:7], s0 ; encoding: [0x02,0x19,0x24,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx2 vcc, s[4:7], s0
-// GFX11: s_buffer_load_b64 vcc, s[4:7], s0 ; encoding: [0x82,0x1a,0x24,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx2 s[10:11], s[8:11], s0
-// GFX11: s_buffer_load_b64 s[10:11], s[8:11], s0 ; encoding: [0x84,0x02,0x24,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx2 s[10:11], s[96:99], s0
-// GFX11: s_buffer_load_b64 s[10:11], s[96:99], s0 ; encoding: [0xb0,0x02,0x24,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx2 s[10:11], s[4:7], s101
-// GFX11: s_buffer_load_b64 s[10:11], s[4:7], s101 ; encoding: [0x82,0x02,0x24,0xf4,0x00,0x00,0x00,0xca]
-
-s_buffer_load_dwordx2 s[10:11], s[4:7], vcc_lo
-// GFX11: s_buffer_load_b64 s[10:11], s[4:7], vcc_lo ; encoding: [0x82,0x02,0x24,0xf4,0x00,0x00,0x00,0xd4]
-
-s_buffer_load_dwordx2 s[10:11], s[4:7], vcc_hi
-// GFX11: s_buffer_load_b64 s[10:11], s[4:7], vcc_hi ; encoding: [0x82,0x02,0x24,0xf4,0x00,0x00,0x00,0xd6]
-
-s_buffer_load_dwordx2 s[10:11], s[4:7], m0
-// GFX11: s_buffer_load_b64 s[10:11], s[4:7], m0 ; encoding: [0x82,0x02,0x24,0xf4,0x00,0x00,0x00,0xfa]
-
-s_buffer_load_dwordx2 s[10:11], s[4:7], 0x0
-// GFX11: s_buffer_load_b64 s[10:11], s[4:7], 0x0 ; encoding: [0x82,0x02,0x24,0xf4,0x00,0x00,0x00,0xf8]
-
-s_buffer_load_dwordx2 s[10:11], s[4:7], s0 glc
-// GFX11: s_buffer_load_b64 s[10:11], s[4:7], s0 glc ; encoding: [0x82,0x42,0x24,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx2 s[10:11], s[4:7], s0 dlc
-// GFX11: s_buffer_load_b64 s[10:11], s[4:7], s0 dlc ; encoding: [0x82,0x22,0x24,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx2 s[10:11], s[4:7], s0 glc dlc
-// GFX11: s_buffer_load_b64 s[10:11], s[4:7], s0 glc dlc ; encoding: [0x82,0x62,0x24,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx2 s[10:11], s[4:7], 0x1234 glc dlc
-// GFX11: s_buffer_load_b64 s[10:11], s[4:7], 0x1234 glc dlc ; encoding: [0x82,0x62,0x24,0xf4,0x34,0x12,0x00,0xf8]
-
s_buffer_load_dwordx4 s[20:23], s[4:7], s0
// GFX11: s_buffer_load_b128 s[20:23], s[4:7], s0 ; encoding: [0x02,0x05,0x28,0xf4,0x00,0x00,0x00,0x00]
-s_buffer_load_dwordx4 s[24:27], s[4:7], s0
-// GFX11: s_buffer_load_b128 s[24:27], s[4:7], s0 ; encoding: [0x02,0x06,0x28,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx4 s[96:99], s[4:7], s0
-// GFX11: s_buffer_load_b128 s[96:99], s[4:7], s0 ; encoding: [0x02,0x18,0x28,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx4 s[20:23], s[8:11], s0
-// GFX11: s_buffer_load_b128 s[20:23], s[8:11], s0 ; encoding: [0x04,0x05,0x28,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx4 s[20:23], s[96:99], s0
-// GFX11: s_buffer_load_b128 s[20:23], s[96:99], s0 ; encoding: [0x30,0x05,0x28,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx4 s[20:23], s[4:7], s101
-// GFX11: s_buffer_load_b128 s[20:23], s[4:7], s101 ; encoding: [0x02,0x05,0x28,0xf4,0x00,0x00,0x00,0xca]
-
-s_buffer_load_dwordx4 s[20:23], s[4:7], vcc_lo
-// GFX11: s_buffer_load_b128 s[20:23], s[4:7], vcc_lo ; encoding: [0x02,0x05,0x28,0xf4,0x00,0x00,0x00,0xd4]
-
-s_buffer_load_dwordx4 s[20:23], s[4:7], vcc_hi
-// GFX11: s_buffer_load_b128 s[20:23], s[4:7], vcc_hi ; encoding: [0x02,0x05,0x28,0xf4,0x00,0x00,0x00,0xd6]
-
-s_buffer_load_dwordx4 s[20:23], s[4:7], m0
-// GFX11: s_buffer_load_b128 s[20:23], s[4:7], m0 ; encoding: [0x02,0x05,0x28,0xf4,0x00,0x00,0x00,0xfa]
-
-s_buffer_load_dwordx4 s[20:23], s[4:7], 0x0
-// GFX11: s_buffer_load_b128 s[20:23], s[4:7], 0x0 ; encoding: [0x02,0x05,0x28,0xf4,0x00,0x00,0x00,0xf8]
-
-s_buffer_load_dwordx4 s[20:23], s[4:7], s0 glc
-// GFX11: s_buffer_load_b128 s[20:23], s[4:7], s0 glc ; encoding: [0x02,0x45,0x28,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx4 s[20:23], s[4:7], s0 dlc
-// GFX11: s_buffer_load_b128 s[20:23], s[4:7], s0 dlc ; encoding: [0x02,0x25,0x28,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx4 s[20:23], s[4:7], s0 glc dlc
-// GFX11: s_buffer_load_b128 s[20:23], s[4:7], s0 glc dlc ; encoding: [0x02,0x65,0x28,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx4 s[20:23], s[4:7], 0x1234 glc dlc
-// GFX11: s_buffer_load_b128 s[20:23], s[4:7], 0x1234 glc dlc ; encoding: [0x02,0x65,0x28,0xf4,0x34,0x12,0x00,0xf8]
-
s_buffer_load_dwordx8 s[20:27], s[4:7], s0
// GFX11: s_buffer_load_b256 s[20:27], s[4:7], s0 ; encoding: [0x02,0x05,0x2c,0xf4,0x00,0x00,0x00,0x00]
-s_buffer_load_dwordx8 s[24:31], s[4:7], s0
-// GFX11: s_buffer_load_b256 s[24:31], s[4:7], s0 ; encoding: [0x02,0x06,0x2c,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx8 s[92:99], s[4:7], s0
-// GFX11: s_buffer_load_b256 s[92:99], s[4:7], s0 ; encoding: [0x02,0x17,0x2c,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx8 s[20:27], s[8:11], s0
-// GFX11: s_buffer_load_b256 s[20:27], s[8:11], s0 ; encoding: [0x04,0x05,0x2c,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx8 s[20:27], s[96:99], s0
-// GFX11: s_buffer_load_b256 s[20:27], s[96:99], s0 ; encoding: [0x30,0x05,0x2c,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx8 s[20:27], s[4:7], s101
-// GFX11: s_buffer_load_b256 s[20:27], s[4:7], s101 ; encoding: [0x02,0x05,0x2c,0xf4,0x00,0x00,0x00,0xca]
-
-s_buffer_load_dwordx8 s[20:27], s[4:7], vcc_lo
-// GFX11: s_buffer_load_b256 s[20:27], s[4:7], vcc_lo ; encoding: [0x02,0x05,0x2c,0xf4,0x00,0x00,0x00,0xd4]
-
-s_buffer_load_dwordx8 s[20:27], s[4:7], vcc_hi
-// GFX11: s_buffer_load_b256 s[20:27], s[4:7], vcc_hi ; encoding: [0x02,0x05,0x2c,0xf4,0x00,0x00,0x00,0xd6]
-
-s_buffer_load_dwordx8 s[20:27], s[4:7], m0
-// GFX11: s_buffer_load_b256 s[20:27], s[4:7], m0 ; encoding: [0x02,0x05,0x2c,0xf4,0x00,0x00,0x00,0xfa]
-
-s_buffer_load_dwordx8 s[20:27], s[4:7], 0x0
-// GFX11: s_buffer_load_b256 s[20:27], s[4:7], 0x0 ; encoding: [0x02,0x05,0x2c,0xf4,0x00,0x00,0x00,0xf8]
-
-s_buffer_load_dwordx8 s[20:27], s[4:7], s0 glc
-// GFX11: s_buffer_load_b256 s[20:27], s[4:7], s0 glc ; encoding: [0x02,0x45,0x2c,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx8 s[20:27], s[4:7], s0 dlc
-// GFX11: s_buffer_load_b256 s[20:27], s[4:7], s0 dlc ; encoding: [0x02,0x25,0x2c,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx8 s[20:27], s[4:7], s0 glc dlc
-// GFX11: s_buffer_load_b256 s[20:27], s[4:7], s0 glc dlc ; encoding: [0x02,0x65,0x2c,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx8 s[20:27], s[4:7], 0x1234 glc dlc
-// GFX11: s_buffer_load_b256 s[20:27], s[4:7], 0x1234 glc dlc ; encoding: [0x02,0x65,0x2c,0xf4,0x34,0x12,0x00,0xf8]
-
s_buffer_load_dwordx16 s[20:35], s[4:7], s0
// GFX11: s_buffer_load_b512 s[20:35], s[4:7], s0 ; encoding: [0x02,0x05,0x30,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx16 s[24:39], s[4:7], s0
-// GFX11: s_buffer_load_b512 s[24:39], s[4:7], s0 ; encoding: [0x02,0x06,0x30,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx16 s[84:99], s[4:7], s0
-// GFX11: s_buffer_load_b512 s[84:99], s[4:7], s0 ; encoding: [0x02,0x15,0x30,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx16 s[20:35], s[8:11], s0
-// GFX11: s_buffer_load_b512 s[20:35], s[8:11], s0 ; encoding: [0x04,0x05,0x30,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx16 s[20:35], s[96:99], s0
-// GFX11: s_buffer_load_b512 s[20:35], s[96:99], s0 ; encoding: [0x30,0x05,0x30,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx16 s[20:35], s[4:7], s101
-// GFX11: s_buffer_load_b512 s[20:35], s[4:7], s101 ; encoding: [0x02,0x05,0x30,0xf4,0x00,0x00,0x00,0xca]
-
-s_buffer_load_dwordx16 s[20:35], s[4:7], vcc_lo
-// GFX11: s_buffer_load_b512 s[20:35], s[4:7], vcc_lo ; encoding: [0x02,0x05,0x30,0xf4,0x00,0x00,0x00,0xd4]
-
-s_buffer_load_dwordx16 s[20:35], s[4:7], vcc_hi
-// GFX11: s_buffer_load_b512 s[20:35], s[4:7], vcc_hi ; encoding: [0x02,0x05,0x30,0xf4,0x00,0x00,0x00,0xd6]
-
-s_buffer_load_dwordx16 s[20:35], s[4:7], m0
-// GFX11: s_buffer_load_b512 s[20:35], s[4:7], m0 ; encoding: [0x02,0x05,0x30,0xf4,0x00,0x00,0x00,0xfa]
-
-s_buffer_load_dwordx16 s[20:35], s[4:7], 0x0
-// GFX11: s_buffer_load_b512 s[20:35], s[4:7], 0x0 ; encoding: [0x02,0x05,0x30,0xf4,0x00,0x00,0x00,0xf8]
-
-s_buffer_load_dwordx16 s[20:35], s[4:7], s0 glc
-// GFX11: s_buffer_load_b512 s[20:35], s[4:7], s0 glc ; encoding: [0x02,0x45,0x30,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx16 s[20:35], s[4:7], s0 dlc
-// GFX11: s_buffer_load_b512 s[20:35], s[4:7], s0 dlc ; encoding: [0x02,0x25,0x30,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx16 s[20:35], s[4:7], s0 glc dlc
-// GFX11: s_buffer_load_b512 s[20:35], s[4:7], s0 glc dlc ; encoding: [0x02,0x65,0x30,0xf4,0x00,0x00,0x00,0x00]
-
-s_buffer_load_dwordx16 s[20:35], s[4:7], 0x1234 glc dlc
-// GFX11: s_buffer_load_b512 s[20:35], s[4:7], 0x1234 glc dlc ; encoding: [0x02,0x65,0x30,0xf4,0x34,0x12,0x00,0xf8]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias.s
index 857f2fdfc41b..a8cc90ef6f8b 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias.s
@@ -2,7 +2,7 @@
// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck -check-prefix=GFX11 %s
v_cvt_pknorm_i16_f16 v5, v1, v2
-// GFX11: v_cvt_pk_norm_i16_f16 {{.*}} encoding: [0x05,0x00,0x12,0xd7,0x01,0x05,0x02,0x00]
+// GFX11: v_cvt_pk_norm_i16_f16 v5, v1, v2 ; encoding: [0x05,0x00,0x12,0xd7,0x01,0x05,0x02,0x00]
v_cvt_pknorm_u16_f16 v5, v1, v2
-// GFX11: v_cvt_pk_norm_u16_f16 {{.*}} encoding: [0x05,0x00,0x13,0xd7,0x01,0x05,0x02,0x00]
+// GFX11: v_cvt_pk_norm_u16_f16 v5, v1, v2 ; encoding: [0x05,0x00,0x13,0xd7,0x01,0x05,0x02,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s
index aa4b02816017..aa063c8800aa 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s
@@ -1,25 +1,31 @@
// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s
ds_max_f32 v1, v2
-// GFX12: [0x00,0x00,0x4c,0xd8,0x01,0x02,0x00,0x00]
+// GFX12: ds_max_num_f32 v1, v2 ; encoding: [0x00,0x00,0x4c,0xd8,0x01,0x02,0x00,0x00]
ds_max_f64 v1, v[2:3]
-// GFX12: [0x00,0x00,0x4c,0xd9,0x01,0x02,0x00,0x00]
+// GFX12: ds_max_num_f64 v1, v[2:3] ; encoding: [0x00,0x00,0x4c,0xd9,0x01,0x02,0x00,0x00]
ds_max_rtn_f32 v5, v1, v2
-// GFX12: [0x00,0x00,0xcc,0xd8,0x01,0x02,0x00,0x05]
+// GFX12: ds_max_num_rtn_f32 v5, v1, v2 ; encoding: [0x00,0x00,0xcc,0xd8,0x01,0x02,0x00,0x05]
ds_max_rtn_f64 v[5:6], v1, v[2:3]
-// GFX12: [0x00,0x00,0xcc,0xd9,0x01,0x02,0x00,0x05]
+// GFX12: ds_max_num_rtn_f64 v[5:6], v1, v[2:3] ; encoding: [0x00,0x00,0xcc,0xd9,0x01,0x02,0x00,0x05]
ds_min_f32 v1, v2
-// GFX12: [0x00,0x00,0x48,0xd8,0x01,0x02,0x00,0x00]
+// GFX12: ds_min_num_f32 v1, v2 ; encoding: [0x00,0x00,0x48,0xd8,0x01,0x02,0x00,0x00]
ds_min_f64 v1, v[2:3]
-// GFX12: [0x00,0x00,0x48,0xd9,0x01,0x02,0x00,0x00]
+// GFX12: ds_min_num_f64 v1, v[2:3] ; encoding: [0x00,0x00,0x48,0xd9,0x01,0x02,0x00,0x00]
ds_min_rtn_f32 v5, v1, v2
-// GFX12: [0x00,0x00,0xc8,0xd8,0x01,0x02,0x00,0x05]
+// GFX12: ds_min_num_rtn_f32 v5, v1, v2 ; encoding: [0x00,0x00,0xc8,0xd8,0x01,0x02,0x00,0x05]
ds_min_rtn_f64 v[5:6], v1, v[2:3]
-// GFX12: [0x00,0x00,0xc8,0xd9,0x01,0x02,0x00,0x05]
+// GFX12: ds_min_num_rtn_f64 v[5:6], v1, v[2:3] ; encoding: [0x00,0x00,0xc8,0xd9,0x01,0x02,0x00,0x05]
+
+ds_subrev_u32 v1, v2
+// GFX12: ds_rsub_u32 v1, v2 ; encoding: [0x00,0x00,0x08,0xd8,0x01,0x02,0x00,0x00]
+
+ds_subrev_u64 v1, v[2:3]
+// GFX12: ds_rsub_u64 v1, v[2:3] ; encoding: [0x00,0x00,0x08,0xd9,0x01,0x02,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sop2_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_sop2_alias.s
index 86c3bdbaf830..e3ad0198cae5 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_sop2_alias.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_sop2_alias.s
@@ -1,223 +1,37 @@
// RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1200 %s | FileCheck --check-prefix=GFX12 %s
s_add_i32 s0, s1, s2
-// GFX12: encoding: [0x01,0x02,0x00,0x81]
+// GFX12: s_add_co_i32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x81]
s_add_u32 s0, s1, s2
-// GFX12: encoding: [0x01,0x02,0x00,0x80]
+// GFX12: s_add_co_u32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x80]
s_add_u64 s[0:1], s[2:3], s[4:5]
-// GFX12: encoding: [0x02,0x04,0x80,0xa9]
+// GFX12: s_add_nc_u64 s[0:1], s[2:3], s[4:5] ; encoding: [0x02,0x04,0x80,0xa9]
s_addc_u32 s0, s1, s2
-// GFX12: encoding: [0x01,0x02,0x00,0x82]
+// GFX12: s_add_co_ci_u32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x82]
s_sub_i32 s0, s1, s2
-// GFX12: encoding: [0x01,0x02,0x80,0x81]
+// GFX12: s_sub_co_i32 s0, s1, s2 ; encoding: [0x01,0x02,0x80,0x81]
s_sub_u32 s0, s1, s2
-// GFX12: encoding: [0x01,0x02,0x80,0x80]
+// GFX12: s_sub_co_u32 s0, s1, s2 ; encoding: [0x01,0x02,0x80,0x80]
s_sub_u64 s[0:1], s[2:3], s[4:5]
-// GFX12: encoding: [0x02,0x04,0x00,0xaa]
+// GFX12: s_sub_nc_u64 s[0:1], s[2:3], s[4:5] ; encoding: [0x02,0x04,0x00,0xaa]
s_subb_u32 s0, s1, s2
-// GFX12: encoding: [0x01,0x02,0x80,0x82]
+// GFX12: s_sub_co_ci_u32 s0, s1, s2 ; encoding: [0x01,0x02,0x80,0x82]
s_min_f32 s5, s1, s2
-// GFX12: encoding: [0x01,0x02,0x05,0xa1]
-
-s_min_f32 s105, s1, s2
-// GFX12: encoding: [0x01,0x02,0x69,0xa1]
-
-s_min_f32 s5, s105, s2
-// GFX12: encoding: [0x69,0x02,0x05,0xa1]
-
-s_min_f32 s5, s103, s2
-// GFX12: encoding: [0x67,0x02,0x05,0xa1]
-
-s_min_f32 s5, vcc_lo, s2
-// GFX12: encoding: [0x6a,0x02,0x05,0xa1]
-
-s_min_f32 s5, vcc_hi, s2
-// GFX12: encoding: [0x6b,0x02,0x05,0xa1]
-
-s_min_f32 s5, ttmp11, s2
-// GFX12: encoding: [0x77,0x02,0x05,0xa1]
-
-s_min_f32 s5, m0, s2
-// GFX12: encoding: [0x7d,0x02,0x05,0xa1]
-
-s_min_f32 s5, exec_lo, s2
-// GFX12: encoding: [0x7e,0x02,0x05,0xa1]
-
-s_min_f32 s5, exec_hi, s2
-// GFX12: encoding: [0x7f,0x02,0x05,0xa1]
-
-s_min_f32 s5, 0, s2
-// GFX12: encoding: [0x80,0x02,0x05,0xa1]
-
-s_min_f32 s5, -1, s2
-// GFX12: encoding: [0xc1,0x02,0x05,0xa1]
-
-s_min_f32 s5, 0.5, s2
-// GFX12: encoding: [0xf0,0x02,0x05,0xa1]
-
-s_min_f32 s5, -4.0, s2
-// GFX12: encoding: [0xf7,0x02,0x05,0xa1]
-
-s_min_f32 s5, 0xaf123456, s2
-// GFX12: encoding: [0xff,0x02,0x05,0xa1,0x56,0x34,0x12,0xaf]
-
-s_min_f32 s5, 0x3f717273, s2
-// GFX12: encoding: [0xff,0x02,0x05,0xa1,0x73,0x72,0x71,0x3f]
-
-s_min_f32 s5, s1, s105
-// GFX12: encoding: [0x01,0x69,0x05,0xa1]
+// GFX12: s_min_num_f32 s5, s1, s2 ; encoding: [0x01,0x02,0x05,0xa1]
s_max_f32 s5, s1, s2
-// GFX12: encoding: [0x01,0x02,0x85,0xa1]
-
-s_max_f32 s105, s1, s2
-// GFX12: encoding: [0x01,0x02,0xe9,0xa1]
-
-s_max_f32 s5, s105, s2
-// GFX12: encoding: [0x69,0x02,0x85,0xa1]
-
-s_max_f32 s5, s103, s2
-// GFX12: encoding: [0x67,0x02,0x85,0xa1]
-
-s_max_f32 s5, vcc_lo, s2
-// GFX12: encoding: [0x6a,0x02,0x85,0xa1]
-
-s_max_f32 s5, vcc_hi, s2
-// GFX12: encoding: [0x6b,0x02,0x85,0xa1]
-
-s_max_f32 s5, ttmp11, s2
-// GFX12: encoding: [0x77,0x02,0x85,0xa1]
-
-s_max_f32 s5, m0, s2
-// GFX12: encoding: [0x7d,0x02,0x85,0xa1]
-
-s_max_f32 s5, exec_lo, s2
-// GFX12: encoding: [0x7e,0x02,0x85,0xa1]
-
-s_max_f32 s5, exec_hi, s2
-// GFX12: encoding: [0x7f,0x02,0x85,0xa1]
-
-s_max_f32 s5, 0, s2
-// GFX12: encoding: [0x80,0x02,0x85,0xa1]
-
-s_max_f32 s5, -1, s2
-// GFX12: encoding: [0xc1,0x02,0x85,0xa1]
-
-s_max_f32 s5, 0.5, s2
-// GFX12: encoding: [0xf0,0x02,0x85,0xa1]
-
-s_max_f32 s5, -4.0, s2
-// GFX12: encoding: [0xf7,0x02,0x85,0xa1]
-
-s_max_f32 s5, 0xaf123456, s2
-// GFX12: encoding: [0xff,0x02,0x85,0xa1,0x56,0x34,0x12,0xaf]
-
-s_max_f32 s5, 0x3f717273, s2
-// GFX12: encoding: [0xff,0x02,0x85,0xa1,0x73,0x72,0x71,0x3f]
-
-s_max_f32 s5, s1, s105
-// GFX12: encoding: [0x01,0x69,0x85,0xa1]
+// GFX12: s_max_num_f32 s5, s1, s2 ; encoding: [0x01,0x02,0x85,0xa1]
s_max_f16 s5, s1, s2
-// GFX12: encoding: [0x01,0x02,0x05,0xa6]
-
-s_max_f16 s105, s1, s2
-// GFX12: encoding: [0x01,0x02,0x69,0xa6]
-
-s_max_f16 s5, s105, s2
-// GFX12: encoding: [0x69,0x02,0x05,0xa6]
-
-s_max_f16 s5, s101, s2
-// GFX12: encoding: [0x65,0x02,0x05,0xa6]
-
-s_max_f16 s5, vcc_lo, s2
-// GFX12: encoding: [0x6a,0x02,0x05,0xa6]
-
-s_max_f16 s5, vcc_hi, s2
-// GFX12: encoding: [0x6b,0x02,0x05,0xa6]
-
-s_max_f16 s5, m0, s2
-// GFX12: encoding: [0x7d,0x02,0x05,0xa6]
-
-s_max_f16 s5, exec_lo, s2
-// GFX12: encoding: [0x7e,0x02,0x05,0xa6]
-
-s_max_f16 s5, exec_hi, s2
-// GFX12: encoding: [0x7f,0x02,0x05,0xa6]
-
-s_max_f16 s5, 0, s2
-// GFX12: encoding: [0x80,0x02,0x05,0xa6]
-
-s_max_f16 s5, -1, s2
-// GFX12: encoding: [0xc1,0x02,0x05,0xa6]
-
-s_max_f16 s5, 0.5, s2
-// GFX12: encoding: [0xf0,0x02,0x05,0xa6]
-
-s_max_f16 s5, -4.0, s2
-// GFX12: encoding: [0xf7,0x02,0x05,0xa6]
-
-s_max_f16 s5, 0xfe0b, s2
-// GFX12: encoding: [0xff,0x02,0x05,0xa6,0x0b,0xfe,0x00,0x00]
-
-s_max_f16 s5, 0x3456, s2
-// GFX12: encoding: [0xff,0x02,0x05,0xa6,0x56,0x34,0x00,0x00]
-
-s_max_f16 s5, s1, s105
-// GFX12: encoding: [0x01,0x69,0x05,0xa6]
+// GFX12: s_max_num_f16 s5, s1, s2 ; encoding: [0x01,0x02,0x05,0xa6]
s_min_f16 s5, s1, s2
-// GFX12: encoding: [0x01,0x02,0x85,0xa5]
-
-s_min_f16 s105, s1, s2
-// GFX12: encoding: [0x01,0x02,0xe9,0xa5]
-
-s_min_f16 s5, s105, s2
-// GFX12: encoding: [0x69,0x02,0x85,0xa5]
-
-s_min_f16 s5, s101, s2
-// GFX12: encoding: [0x65,0x02,0x85,0xa5]
-
-s_min_f16 s5, vcc_lo, s2
-// GFX12: encoding: [0x6a,0x02,0x85,0xa5]
-
-s_min_f16 s5, vcc_hi, s2
-// GFX12: encoding: [0x6b,0x02,0x85,0xa5]
-
-s_min_f16 s5, m0, s2
-// GFX12: encoding: [0x7d,0x02,0x85,0xa5]
-
-s_min_f16 s5, exec_lo, s2
-// GFX12: encoding: [0x7e,0x02,0x85,0xa5]
-
-s_min_f16 s5, exec_hi, s2
-// GFX12: encoding: [0x7f,0x02,0x85,0xa5]
-
-s_min_f16 s5, 0, s2
-// GFX12: encoding: [0x80,0x02,0x85,0xa5]
-
-s_min_f16 s5, -1, s2
-// GFX12: encoding: [0xc1,0x02,0x85,0xa5]
-
-s_min_f16 s5, 0.5, s2
-// GFX12: encoding: [0xf0,0x02,0x85,0xa5]
-
-s_min_f16 s5, -4.0, s2
-// GFX12: encoding: [0xf7,0x02,0x85,0xa5]
-
-s_min_f16 s5, 0xfe0b, s2
-// GFX12: encoding: [0xff,0x02,0x85,0xa5,0x0b,0xfe,0x00,0x00]
-
-s_min_f16 s5, 0x3456, s2
-// GFX12: encoding: [0xff,0x02,0x85,0xa5,0x56,0x34,0x00,0x00]
-
-s_min_f16 s5, s1, s105
-// GFX12: encoding: [0x01,0x69,0x85,0xa5]
+// GFX12: s_min_num_f16 s5, s1, s2 ; encoding: [0x01,0x02,0x85,0xa5]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sopk_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_sopk_alias.s
index 283890be82ac..aae08fda8d6e 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_sopk_alias.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_sopk_alias.s
@@ -1,4 +1,4 @@
// RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1200 %s | FileCheck --check-prefix=GFX12 %s
s_addk_i32 s0, 0x1234
-// GFX12: encoding: [0x34,0x12,0x80,0xb7]
+// GFX12: s_addk_co_i32 s0, 0x1234 ; encoding: [0x34,0x12,0x80,0xb7]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_alias.s
index 15d6269b6f59..ef00edcf846e 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_alias.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_alias.s
@@ -1,434 +1,25 @@
-// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s
-// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX12-ERR --implicit-check-not=error: %s
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s
tbuffer_load_format_d16_x v4, off, s[8:11], s3 format:[BUF_FMT_8_UNORM] offset:8388607
-// GFX12: encoding: [0x03,0x00,0x22,0xc4,0x04,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_x v255, off, s[8:11], s3 format:1 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x22,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_x v4, off, s[12:15], s3 format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_UNORM] offset:8388607
-// GFX12: encoding: [0x03,0x00,0x22,0xc4,0x04,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_x v4, off, s[12:15], s101 format:[BUF_FMT_8_SNORM] offset:8388607
-// GFX12: encoding: [0x65,0x00,0x22,0xc4,0x04,0x18,0x00,0x01,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_x v4, off, s[12:15], m0 format:2 offset:8388607
-// GFX12: encoding: [0x7d,0x00,0x22,0xc4,0x04,0x18,0x00,0x01,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_x v4, off, s[8:11], s0 format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_SNORM] offset:8388607
-// GFX12: encoding: [0x00,0x00,0x22,0xc4,0x04,0x10,0x00,0x01,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_x v4, off, s[8:11], s61 format:[BUF_FMT_8_USCALED] offset:8388607
-// GFX12: encoding: [0x3d,0x00,0x22,0xc4,0x04,0x10,0x80,0x01,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_x v4, off, ttmp[4:7], s61 format:3 offset:8388607
-// GFX12: encoding: [0x3d,0x00,0x22,0xc4,0x04,0xe0,0x80,0x01,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_x v4, v1, s[8:11], s3 format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_USCALED] offen offset:52
-// GFX12: encoding: [0x03,0x00,0x22,0xc4,0x04,0x10,0x80,0x41,0x01,0x34,0x00,0x00]
-
-tbuffer_load_format_d16_x v4, v1, s[8:11], s3 format:[BUF_FMT_8_SSCALED] idxen offset:52
-// GFX12: encoding: [0x03,0x00,0x22,0xc4,0x04,0x10,0x00,0x82,0x01,0x34,0x00,0x00]
-
-tbuffer_load_format_d16_x v4, v[1:2], s[8:11], s0 format:4 idxen offen offset:52
-// GFX12: encoding: [0x00,0x00,0x22,0xc4,0x04,0x10,0x00,0xc2,0x01,0x34,0x00,0x00]
-
-tbuffer_load_format_d16_x v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_SSCALED] offset:8388607 th:TH_LOAD_NT_HT scope:SCOPE_DEV
-// GFX12: encoding: [0x03,0x00,0x22,0xc4,0x04,0xe0,0x68,0x02,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_x v4, off, ttmp[4:7], s3 format:[BUF_FMT_8_UINT] offset:8388607 th:TH_LOAD_BYPASS scope:SCOPE_SYS
-// GFX12: encoding: [0x03,0x00,0x22,0xc4,0x04,0xe0,0xbc,0x02,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_x v4, off, ttmp[4:7], s3 format:5 offset:8388607 scope:SCOPE_SE
-// GFX12: encoding: [0x03,0x00,0x22,0xc4,0x04,0xe0,0x84,0x02,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_x v4, off, ttmp[4:7], 0 format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_UINT] offset:8388607
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
-
-tbuffer_load_format_d16_x v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_UINT] offset:8388607 glc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
-tbuffer_load_format_d16_x v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_UINT] offset:8388607 slc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
-tbuffer_load_format_d16_x v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_UINT] offset:8388607 dlc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX12: tbuffer_load_d16_format_x v4, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x22,0xc4,0x04,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
tbuffer_load_format_d16_xy v4, off, s[8:11], s3 format:[BUF_FMT_8_SINT] offset:8388607
-// GFX12: encoding: [0x03,0x40,0x22,0xc4,0x04,0x10,0x00,0x03,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xy v255, off, s[8:11], s3 format:6 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x22,0xc4,0xff,0x10,0x00,0x03,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xy v4, off, s[12:15], s3 format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_SINT] offset:8388607
-// GFX12: encoding: [0x03,0x40,0x22,0xc4,0x04,0x18,0x00,0x03,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xy v4, off, s[12:15], s101 format:[BUF_FMT_16_UNORM] offset:8388607
-// GFX12: encoding: [0x65,0x40,0x22,0xc4,0x04,0x18,0x80,0x03,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xy v4, off, s[12:15], m0 format:7 offset:8388607
-// GFX12: encoding: [0x7d,0x40,0x22,0xc4,0x04,0x18,0x80,0x03,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xy v4, off, s[8:11], s0 format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_UNORM] offset:8388607
-// GFX12: encoding: [0x00,0x40,0x22,0xc4,0x04,0x10,0x80,0x03,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xy v4, off, s[8:11], s61 format:[BUF_FMT_16_SNORM] offset:8388607
-// GFX12: encoding: [0x3d,0x40,0x22,0xc4,0x04,0x10,0x00,0x04,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xy v4, off, ttmp[4:7], s61 format:8 offset:8388607
-// GFX12: encoding: [0x3d,0x40,0x22,0xc4,0x04,0xe0,0x00,0x04,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xy v4, v1, s[8:11], s3 format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_SNORM] offen offset:52
-// GFX12: encoding: [0x03,0x40,0x22,0xc4,0x04,0x10,0x00,0x44,0x01,0x34,0x00,0x00]
-
-tbuffer_load_format_d16_xy v4, v1, s[8:11], s3 format:[BUF_FMT_16_USCALED] idxen offset:52
-// GFX12: encoding: [0x03,0x40,0x22,0xc4,0x04,0x10,0x80,0x84,0x01,0x34,0x00,0x00]
-
-tbuffer_load_format_d16_xy v4, v[1:2], s[8:11], s0 format:9 idxen offen offset:52
-// GFX12: encoding: [0x00,0x40,0x22,0xc4,0x04,0x10,0x80,0xc4,0x01,0x34,0x00,0x00]
-
-tbuffer_load_format_d16_xy v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_USCALED] offset:8388607 th:TH_LOAD_NT_HT scope:SCOPE_DEV
-// GFX12: encoding: [0x03,0x40,0x22,0xc4,0x04,0xe0,0xe8,0x04,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xy v4, off, ttmp[4:7], s3 format:[BUF_FMT_16_SSCALED] offset:8388607 th:TH_LOAD_BYPASS scope:SCOPE_SYS
-// GFX12: encoding: [0x03,0x40,0x22,0xc4,0x04,0xe0,0x3c,0x05,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xy v4, off, ttmp[4:7], s3 format:10 offset:8388607 scope:SCOPE_SE
-// GFX12: encoding: [0x03,0x40,0x22,0xc4,0x04,0xe0,0x04,0x05,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xy v4, off, ttmp[4:7], 0 format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_SSCALED] offset:8388607
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
-
-tbuffer_load_format_d16_xy v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_SSCALED] offset:8388607 glc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
-tbuffer_load_format_d16_xy v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_SSCALED] offset:8388607 slc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
-tbuffer_load_format_d16_xy v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_SSCALED] offset:8388607 dlc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX12: tbuffer_load_d16_format_xy v4, off, s[8:11], s3 format:[BUF_FMT_8_SINT] offset:8388607 ; encoding: [0x03,0x40,0x22,0xc4,0x04,0x10,0x00,0x03,0x00,0xff,0xff,0x7f]
tbuffer_load_format_d16_xyz v[4:5], off, s[8:11], s3 format:[BUF_FMT_16_UINT] offset:8388607
-// GFX12: encoding: [0x03,0x80,0x22,0xc4,0x04,0x10,0x80,0x05,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xyz v[254:255], off, s[8:11], s3 format:11 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x22,0xc4,0xfe,0x10,0x80,0x05,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xyz v[4:5], off, s[12:15], s3 format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_UINT] offset:8388607
-// GFX12: encoding: [0x03,0x80,0x22,0xc4,0x04,0x18,0x80,0x05,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xyz v[4:5], off, s[12:15], s101 format:[BUF_FMT_16_SINT] offset:8388607
-// GFX12: encoding: [0x65,0x80,0x22,0xc4,0x04,0x18,0x00,0x06,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xyz v[4:5], off, s[12:15], m0 format:12 offset:8388607
-// GFX12: encoding: [0x7d,0x80,0x22,0xc4,0x04,0x18,0x00,0x06,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xyz v[4:5], off, s[8:11], s0 format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_SINT] offset:8388607
-// GFX12: encoding: [0x00,0x80,0x22,0xc4,0x04,0x10,0x00,0x06,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xyz v[4:5], off, s[8:11], s61 format:[BUF_FMT_16_FLOAT] offset:8388607
-// GFX12: encoding: [0x3d,0x80,0x22,0xc4,0x04,0x10,0x80,0x06,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], s61 format:13 offset:8388607
-// GFX12: encoding: [0x3d,0x80,0x22,0xc4,0x04,0xe0,0x80,0x06,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xyz v[4:5], v1, s[8:11], s3 format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_FLOAT] offen offset:52
-// GFX12: encoding: [0x03,0x80,0x22,0xc4,0x04,0x10,0x80,0x46,0x01,0x34,0x00,0x00]
-
-tbuffer_load_format_d16_xyz v[4:5], v1, s[8:11], s3 format:[BUF_FMT_8_8_UNORM] idxen offset:52
-// GFX12: encoding: [0x03,0x80,0x22,0xc4,0x04,0x10,0x00,0x87,0x01,0x34,0x00,0x00]
-
-tbuffer_load_format_d16_xyz v[4:5], v[1:2], s[8:11], s0 format:14 idxen offen offset:52
-// GFX12: encoding: [0x00,0x80,0x22,0xc4,0x04,0x10,0x00,0xc7,0x01,0x34,0x00,0x00]
-
-tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_UNORM] offset:8388607 th:TH_LOAD_NT_HT scope:SCOPE_DEV
-// GFX12: encoding: [0x03,0x80,0x22,0xc4,0x04,0xe0,0x68,0x07,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:[BUF_FMT_8_8_SNORM] offset:8388607 th:TH_LOAD_BYPASS scope:SCOPE_SYS
-// GFX12: encoding: [0x03,0x80,0x22,0xc4,0x04,0xe0,0xbc,0x07,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:15 offset:8388607 scope:SCOPE_SE
-// GFX12: encoding: [0x03,0x80,0x22,0xc4,0x04,0xe0,0x84,0x07,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], 0 format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_SNORM] offset:8388607
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
-
-tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_SNORM] offset:8388607 glc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
-tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_SNORM] offset:8388607 slc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
-tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_SNORM] offset:8388607 dlc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX12: tbuffer_load_d16_format_xyz v[4:5], off, s[8:11], s3 format:[BUF_FMT_16_UINT] offset:8388607 ; encoding: [0x03,0x80,0x22,0xc4,0x04,0x10,0x80,0x05,0x00,0xff,0xff,0x7f]
tbuffer_load_format_d16_xyzw v[4:5], off, s[8:11], s3 format:[BUF_FMT_8_8_USCALED] offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x22,0xc4,0x04,0x10,0x00,0x08,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xyzw v[254:255], off, s[8:11], s3 format:16 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x22,0xc4,0xfe,0x10,0x00,0x08,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xyzw v[4:5], off, s[12:15], s3 format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_USCALED] offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x22,0xc4,0x04,0x18,0x00,0x08,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xyzw v[4:5], off, s[12:15], s101 format:[BUF_FMT_8_8_SSCALED] offset:8388607
-// GFX12: encoding: [0x65,0xc0,0x22,0xc4,0x04,0x18,0x80,0x08,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xyzw v[4:5], off, s[12:15], m0 format:17 offset:8388607
-// GFX12: encoding: [0x7d,0xc0,0x22,0xc4,0x04,0x18,0x80,0x08,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xyzw v[4:5], off, s[8:11], s0 format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_SSCALED] offset:8388607
-// GFX12: encoding: [0x00,0xc0,0x22,0xc4,0x04,0x10,0x80,0x08,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xyzw v[4:5], off, s[8:11], s61 format:[BUF_FMT_8_8_UINT] offset:8388607
-// GFX12: encoding: [0x3d,0xc0,0x22,0xc4,0x04,0x10,0x00,0x09,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], s61 format:18 offset:8388607
-// GFX12: encoding: [0x3d,0xc0,0x22,0xc4,0x04,0xe0,0x00,0x09,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xyzw v[4:5], v1, s[8:11], s3 format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_UINT] offen offset:52
-// GFX12: encoding: [0x03,0xc0,0x22,0xc4,0x04,0x10,0x00,0x49,0x01,0x34,0x00,0x00]
-
-tbuffer_load_format_d16_xyzw v[4:5], v1, s[8:11], s3 format:[BUF_FMT_8_8_SINT] idxen offset:52
-// GFX12: encoding: [0x03,0xc0,0x22,0xc4,0x04,0x10,0x80,0x89,0x01,0x34,0x00,0x00]
-
-tbuffer_load_format_d16_xyzw v[4:5], v[1:2], s[8:11], s0 format:19 idxen offen offset:52
-// GFX12: encoding: [0x00,0xc0,0x22,0xc4,0x04,0x10,0x80,0xc9,0x01,0x34,0x00,0x00]
-
-tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_SINT] offset:8388607 th:TH_LOAD_NT_HT scope:SCOPE_DEV
-// GFX12: encoding: [0x03,0xc0,0x22,0xc4,0x04,0xe0,0xe8,0x09,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], s3 format:[BUF_FMT_32_UINT] offset:8388607 th:TH_LOAD_BYPASS scope:SCOPE_SYS
-// GFX12: encoding: [0x03,0xc0,0x22,0xc4,0x04,0xe0,0x3c,0x0a,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], s3 format:20 offset:8388607 scope:SCOPE_SE
-// GFX12: encoding: [0x03,0xc0,0x22,0xc4,0x04,0xe0,0x04,0x0a,0x00,0xff,0xff,0x7f]
-
-tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], 0 format:[BUF_DATA_FORMAT_32, BUF_NUM_FORMAT_UINT] offset:8388607
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
-
-tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_32, BUF_NUM_FORMAT_UINT] offset:8388607 glc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
-tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_32, BUF_NUM_FORMAT_UINT] offset:8388607 slc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
-tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_32, BUF_NUM_FORMAT_UINT] offset:8388607 dlc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX12: tbuffer_load_d16_format_xyzw v[4:5], off, s[8:11], s3 format:[BUF_FMT_8_8_USCALED] offset:8388607 ; encoding: [0x03,0xc0,0x22,0xc4,0x04,0x10,0x00,0x08,0x00,0xff,0xff,0x7f]
tbuffer_store_format_d16_x v4, off, s[8:11], s3 format:[BUF_FMT_2_10_10_10_SINT] offset:8388607
-// GFX12: encoding: [0x03,0x00,0x23,0xc4,0x04,0x10,0x80,0x14,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_x v255, off, s[8:11], s3 format:41 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x23,0xc4,0xff,0x10,0x80,0x14,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_x v4, off, s[12:15], s3 format:[BUF_DATA_FORMAT_2_10_10_10, BUF_NUM_FORMAT_SINT] offset:8388607
-// GFX12: encoding: [0x03,0x00,0x23,0xc4,0x04,0x18,0x80,0x14,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_x v4, off, s[12:15], s101 format:[BUF_FMT_8_8_8_8_UNORM] offset:8388607
-// GFX12: encoding: [0x65,0x00,0x23,0xc4,0x04,0x18,0x00,0x15,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_x v4, off, s[12:15], m0 format:42 offset:8388607
-// GFX12: encoding: [0x7d,0x00,0x23,0xc4,0x04,0x18,0x00,0x15,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_x v4, off, s[8:11], s0 format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_UNORM] offset:8388607
-// GFX12: encoding: [0x00,0x00,0x23,0xc4,0x04,0x10,0x00,0x15,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_x v4, off, s[8:11], s61 format:[BUF_FMT_8_8_8_8_SNORM] offset:8388607
-// GFX12: encoding: [0x3d,0x00,0x23,0xc4,0x04,0x10,0x80,0x15,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_x v4, off, ttmp[4:7], s61 format:43 offset:8388607
-// GFX12: encoding: [0x3d,0x00,0x23,0xc4,0x04,0xe0,0x80,0x15,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_x v4, v1, s[8:11], s3 format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_SNORM] offen offset:52
-// GFX12: encoding: [0x03,0x00,0x23,0xc4,0x04,0x10,0x80,0x55,0x01,0x34,0x00,0x00]
-
-tbuffer_store_format_d16_x v4, v1, s[8:11], s3 format:[BUF_FMT_8_8_8_8_USCALED] idxen offset:52
-// GFX12: encoding: [0x03,0x00,0x23,0xc4,0x04,0x10,0x00,0x96,0x01,0x34,0x00,0x00]
-
-tbuffer_store_format_d16_x v4, v[1:2], s[8:11], s0 format:44 idxen offen offset:52
-// GFX12: encoding: [0x00,0x00,0x23,0xc4,0x04,0x10,0x00,0xd6,0x01,0x34,0x00,0x00]
-
-tbuffer_store_format_d16_x v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_USCALED] offset:8388607 th:TH_STORE_NT_HT scope:SCOPE_DEV
-// GFX12: encoding: [0x03,0x00,0x23,0xc4,0x04,0xe0,0x68,0x16,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_x v4, off, ttmp[4:7], s3 format:[BUF_FMT_8_8_8_8_SSCALED] offset:8388607 th:TH_STORE_BYPASS scope:SCOPE_SYS
-// GFX12: encoding: [0x03,0x00,0x23,0xc4,0x04,0xe0,0xbc,0x16,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_x v4, off, ttmp[4:7], s3 format:45 offset:8388607 scope:SCOPE_SE
-// GFX12: encoding: [0x03,0x00,0x23,0xc4,0x04,0xe0,0x84,0x16,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_x v4, off, ttmp[4:7], 0 format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_SSCALED] offset:8388607
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
-
-tbuffer_store_format_d16_x v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_SSCALED] offset:8388607 glc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
-tbuffer_store_format_d16_x v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_SSCALED] offset:8388607 slc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
-tbuffer_store_format_d16_x v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_SSCALED] offset:8388607 dlc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX12: tbuffer_store_d16_format_x v4, off, s[8:11], s3 format:[BUF_FMT_2_10_10_10_SINT] offset:8388607 ; encoding: [0x03,0x00,0x23,0xc4,0x04,0x10,0x80,0x14,0x00,0xff,0xff,0x7f]
tbuffer_store_format_d16_xy v4, off, s[8:11], s3 format:[BUF_FMT_8_8_8_8_UINT] offset:8388607
-// GFX12: encoding: [0x03,0x40,0x23,0xc4,0x04,0x10,0x00,0x17,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xy v255, off, s[8:11], s3 format:46 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x23,0xc4,0xff,0x10,0x00,0x17,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xy v4, off, s[12:15], s3 format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_UINT] offset:8388607
-// GFX12: encoding: [0x03,0x40,0x23,0xc4,0x04,0x18,0x00,0x17,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xy v4, off, s[12:15], s101 format:[BUF_FMT_8_8_8_8_SINT] offset:8388607
-// GFX12: encoding: [0x65,0x40,0x23,0xc4,0x04,0x18,0x80,0x17,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xy v4, off, s[12:15], m0 format:47 offset:8388607
-// GFX12: encoding: [0x7d,0x40,0x23,0xc4,0x04,0x18,0x80,0x17,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xy v4, off, s[8:11], s0 format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_SINT] offset:8388607
-// GFX12: encoding: [0x00,0x40,0x23,0xc4,0x04,0x10,0x80,0x17,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xy v4, off, s[8:11], s61 format:[BUF_FMT_32_32_UINT] offset:8388607
-// GFX12: encoding: [0x3d,0x40,0x23,0xc4,0x04,0x10,0x00,0x18,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xy v4, off, ttmp[4:7], s61 format:48 offset:8388607
-// GFX12: encoding: [0x3d,0x40,0x23,0xc4,0x04,0xe0,0x00,0x18,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xy v4, v1, s[8:11], s3 format:[BUF_DATA_FORMAT_32_32, BUF_NUM_FORMAT_UINT] offen offset:52
-// GFX12: encoding: [0x03,0x40,0x23,0xc4,0x04,0x10,0x00,0x58,0x01,0x34,0x00,0x00]
-
-tbuffer_store_format_d16_xy v4, v1, s[8:11], s3 format:[BUF_FMT_32_32_SINT] idxen offset:52
-// GFX12: encoding: [0x03,0x40,0x23,0xc4,0x04,0x10,0x80,0x98,0x01,0x34,0x00,0x00]
-
-tbuffer_store_format_d16_xy v4, v[1:2], s[8:11], s0 format:49 idxen offen offset:52
-// GFX12: encoding: [0x00,0x40,0x23,0xc4,0x04,0x10,0x80,0xd8,0x01,0x34,0x00,0x00]
-
-tbuffer_store_format_d16_xy v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_32_32, BUF_NUM_FORMAT_SINT] offset:8388607 th:TH_STORE_NT_HT scope:SCOPE_DEV
-// GFX12: encoding: [0x03,0x40,0x23,0xc4,0x04,0xe0,0xe8,0x18,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xy v4, off, ttmp[4:7], s3 format:[BUF_FMT_32_32_FLOAT] offset:8388607 th:TH_STORE_BYPASS scope:SCOPE_SYS
-// GFX12: encoding: [0x03,0x40,0x23,0xc4,0x04,0xe0,0x3c,0x19,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xy v4, off, ttmp[4:7], s3 format:50 offset:8388607 scope:SCOPE_SE
-// GFX12: encoding: [0x03,0x40,0x23,0xc4,0x04,0xe0,0x04,0x19,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xy v4, off, ttmp[4:7], 0 format:[BUF_DATA_FORMAT_32_32, BUF_NUM_FORMAT_FLOAT] offset:8388607
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
-
-tbuffer_store_format_d16_xy v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_32_32, BUF_NUM_FORMAT_FLOAT] offset:8388607 glc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
-tbuffer_store_format_d16_xy v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_32_32, BUF_NUM_FORMAT_FLOAT] offset:8388607 slc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
-tbuffer_store_format_d16_xy v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_32_32, BUF_NUM_FORMAT_FLOAT] offset:8388607 dlc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX12: tbuffer_store_d16_format_xy v4, off, s[8:11], s3 format:[BUF_FMT_8_8_8_8_UINT] offset:8388607 ; encoding: [0x03,0x40,0x23,0xc4,0x04,0x10,0x00,0x17,0x00,0xff,0xff,0x7f]
tbuffer_store_format_d16_xyz v[4:5], off, s[8:11], s3 format:[BUF_FMT_16_16_16_16_UNORM] offset:8388607
-// GFX12: encoding: [0x03,0x80,0x23,0xc4,0x04,0x10,0x80,0x19,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xyz v[254:255], off, s[8:11], s3 format:51 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x23,0xc4,0xfe,0x10,0x80,0x19,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xyz v[4:5], off, s[12:15], s3 format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_UNORM] offset:8388607
-// GFX12: encoding: [0x03,0x80,0x23,0xc4,0x04,0x18,0x80,0x19,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xyz v[4:5], off, s[12:15], s101 format:[BUF_FMT_16_16_16_16_SNORM] offset:8388607
-// GFX12: encoding: [0x65,0x80,0x23,0xc4,0x04,0x18,0x00,0x1a,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xyz v[4:5], off, s[12:15], m0 format:52 offset:8388607
-// GFX12: encoding: [0x7d,0x80,0x23,0xc4,0x04,0x18,0x00,0x1a,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xyz v[4:5], off, s[8:11], s0 format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_SNORM] offset:8388607
-// GFX12: encoding: [0x00,0x80,0x23,0xc4,0x04,0x10,0x00,0x1a,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xyz v[4:5], off, s[8:11], s61 format:[BUF_FMT_16_16_16_16_USCALED] offset:8388607
-// GFX12: encoding: [0x3d,0x80,0x23,0xc4,0x04,0x10,0x80,0x1a,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], s61 format:53 offset:8388607
-// GFX12: encoding: [0x3d,0x80,0x23,0xc4,0x04,0xe0,0x80,0x1a,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xyz v[4:5], v1, s[8:11], s3 format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_USCALED] offen offset:52
-// GFX12: encoding: [0x03,0x80,0x23,0xc4,0x04,0x10,0x80,0x5a,0x01,0x34,0x00,0x00]
-
-tbuffer_store_format_d16_xyz v[4:5], v1, s[8:11], s3 format:[BUF_FMT_16_16_16_16_SSCALED] idxen offset:52
-// GFX12: encoding: [0x03,0x80,0x23,0xc4,0x04,0x10,0x00,0x9b,0x01,0x34,0x00,0x00]
-
-tbuffer_store_format_d16_xyz v[4:5], v[1:2], s[8:11], s0 format:54 idxen offen offset:52
-// GFX12: encoding: [0x00,0x80,0x23,0xc4,0x04,0x10,0x00,0xdb,0x01,0x34,0x00,0x00]
-
-tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_SSCALED] offset:8388607 th:TH_STORE_NT_HT scope:SCOPE_DEV
-// GFX12: encoding: [0x03,0x80,0x23,0xc4,0x04,0xe0,0x68,0x1b,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:[BUF_FMT_16_16_16_16_UINT] offset:8388607 th:TH_STORE_BYPASS scope:SCOPE_SYS
-// GFX12: encoding: [0x03,0x80,0x23,0xc4,0x04,0xe0,0xbc,0x1b,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:55 offset:8388607 scope:SCOPE_SE
-// GFX12: encoding: [0x03,0x80,0x23,0xc4,0x04,0xe0,0x84,0x1b,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], 0 format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_UINT] offset:8388607
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
-
-tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_UINT] offset:8388607 glc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
-tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_UINT] offset:8388607 slc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
-tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_UINT] offset:8388607 dlc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX12: tbuffer_store_d16_format_xyz v[4:5], off, s[8:11], s3 format:[BUF_FMT_16_16_16_16_UNORM] offset:8388607 ; encoding: [0x03,0x80,0x23,0xc4,0x04,0x10,0x80,0x19,0x00,0xff,0xff,0x7f]
tbuffer_store_format_d16_xyzw v[4:5], off, s[8:11], s3 format:[BUF_FMT_16_16_16_16_SINT] offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x23,0xc4,0x04,0x10,0x00,0x1c,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xyzw v[254:255], off, s[8:11], s3 format:56 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x23,0xc4,0xfe,0x10,0x00,0x1c,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xyzw v[4:5], off, s[12:15], s3 format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_SINT] offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x23,0xc4,0x04,0x18,0x00,0x1c,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xyzw v[4:5], off, s[12:15], s101 format:[BUF_FMT_16_16_16_16_FLOAT] offset:8388607
-// GFX12: encoding: [0x65,0xc0,0x23,0xc4,0x04,0x18,0x80,0x1c,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xyzw v[4:5], off, s[12:15], m0 format:57 offset:8388607
-// GFX12: encoding: [0x7d,0xc0,0x23,0xc4,0x04,0x18,0x80,0x1c,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xyzw v[4:5], off, s[8:11], s0 format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_FLOAT] offset:8388607
-// GFX12: encoding: [0x00,0xc0,0x23,0xc4,0x04,0x10,0x80,0x1c,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xyzw v[4:5], off, s[8:11], s61 format:[BUF_FMT_32_32_32_UINT] offset:8388607
-// GFX12: encoding: [0x3d,0xc0,0x23,0xc4,0x04,0x10,0x00,0x1d,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], s61 format:58 offset:8388607
-// GFX12: encoding: [0x3d,0xc0,0x23,0xc4,0x04,0xe0,0x00,0x1d,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xyzw v[4:5], v1, s[8:11], s3 format:[BUF_DATA_FORMAT_32_32_32, BUF_NUM_FORMAT_UINT] offen offset:52
-// GFX12: encoding: [0x03,0xc0,0x23,0xc4,0x04,0x10,0x00,0x5d,0x01,0x34,0x00,0x00]
-
-tbuffer_store_format_d16_xyzw v[4:5], v1, s[8:11], s3 format:[BUF_FMT_32_32_32_SINT] idxen offset:52
-// GFX12: encoding: [0x03,0xc0,0x23,0xc4,0x04,0x10,0x80,0x9d,0x01,0x34,0x00,0x00]
-
-tbuffer_store_format_d16_xyzw v[4:5], v[1:2], s[8:11], s0 format:59 idxen offen offset:52
-// GFX12: encoding: [0x00,0xc0,0x23,0xc4,0x04,0x10,0x80,0xdd,0x01,0x34,0x00,0x00]
-
-tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_32_32_32, BUF_NUM_FORMAT_SINT] offset:8388607 th:TH_STORE_NT_HT scope:SCOPE_DEV
-// GFX12: encoding: [0x03,0xc0,0x23,0xc4,0x04,0xe0,0xe8,0x1d,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], s3 format:[BUF_FMT_32_32_32_FLOAT] offset:8388607 th:TH_STORE_BYPASS scope:SCOPE_SYS
-// GFX12: encoding: [0x03,0xc0,0x23,0xc4,0x04,0xe0,0x3c,0x1e,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], s3 format:60 offset:8388607 scope:SCOPE_SE
-// GFX12: encoding: [0x03,0xc0,0x23,0xc4,0x04,0xe0,0x04,0x1e,0x00,0xff,0xff,0x7f]
-
-tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], 0 format:[BUF_DATA_FORMAT_32_32_32, BUF_NUM_FORMAT_FLOAT] offset:8388607
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
-
-tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_32_32_32, BUF_NUM_FORMAT_FLOAT] offset:8388607 glc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
-tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_32_32_32, BUF_NUM_FORMAT_FLOAT] offset:8388607 slc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
-tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_32_32_32, BUF_NUM_FORMAT_FLOAT] offset:8388607 dlc
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX12: tbuffer_store_d16_format_xyzw v[4:5], off, s[8:11], s3 format:[BUF_FMT_16_16_16_16_SINT] offset:8388607 ; encoding: [0x03,0xc0,0x23,0xc4,0x04,0x10,0x00,0x1c,0x00,0xff,0xff,0x7f]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_alias.s
index 6a8a4ef567b9..7363750900c0 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_alias.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_alias.s
@@ -1,2214 +1,193 @@
-// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s
-// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX12-ERR --implicit-check-not=error: %s
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s
buffer_load_dword v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dword v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x05,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dword v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x05,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dword v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x05,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dword v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x00,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dword v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x00,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dword v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x00,0x05,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_load_dword v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x00,0x05,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_load_dword v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x00,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_dword v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x00,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_dword v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x00,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_load_dword v5, off, s[8:11], s3 offset:8388607 lds
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX12: buffer_load_b32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx2 v[254:255], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x05,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx2 v[5:6], off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x05,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx2 v[5:6], off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x05,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx2 v[5:6], off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x40,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx2 v[5:6], off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x40,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx2 v[5:6], v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x05,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx2 v[5:6], v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x05,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx2 v[5:6], off, s[8:11], s3
-// GFX12: encoding: [0x03,0x40,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x40,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x40,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_load_b64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx3 v[253:255], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x05,0xc4,0xfd,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx3 v[5:7], off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x05,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx3 v[5:7], off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x05,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx3 v[5:7], off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x80,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx3 v[5:7], off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x80,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx3 v[5:7], v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x05,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx3 v[5:7], v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x05,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx3 v[5:7], off, s[8:11], s3
-// GFX12: encoding: [0x03,0x80,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x80,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x80,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_load_b96 v[5:7], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx4 v[252:255], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x05,0xc4,0xfc,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx4 v[5:8], off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x05,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx4 v[5:8], off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x05,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx4 v[5:8], off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0xc0,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx4 v[5:8], off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0xc0,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx4 v[5:8], v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x05,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx4 v[5:8], v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x05,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_load_dwordx4 v[5:8], off, s[8:11], s3
-// GFX12: encoding: [0x03,0xc0,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0xc0,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0xc0,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_load_b128 v[5:8], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_load_short_d16 v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_short_d16 v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x08,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_short_d16 v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x08,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_short_d16 v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x08,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_short_d16 v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x00,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_short_d16 v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x00,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_short_d16 v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x00,0x08,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_load_short_d16 v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x00,0x08,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_load_short_d16 v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x00,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_short_d16 v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x00,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_short_d16 v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x00,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_load_d16_b16 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_load_format_d16_x v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_x v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x02,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_x v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x02,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_x v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x02,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_x v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x00,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_x v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x00,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_x v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x00,0x02,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_x v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x00,0x02,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_x v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x00,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_format_d16_x v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x00,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_format_d16_x v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x00,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_load_d16_format_x v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_load_format_d16_xy v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xy v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x02,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xy v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x02,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xy v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x02,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xy v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x40,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xy v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x40,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xy v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x02,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xy v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x02,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xy v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x40,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_format_d16_xy v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x40,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_format_d16_xy v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x40,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_load_d16_format_xy v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xyz v[254:255], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x02,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xyz v[5:6], off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x02,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xyz v[5:6], off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x02,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xyz v[5:6], off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x80,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xyz v[5:6], off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x80,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xyz v[5:6], v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x02,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xyz v[5:6], v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x02,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3
-// GFX12: encoding: [0x03,0x80,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x80,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x80,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_load_d16_format_xyz v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xyzw v[254:255], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x02,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xyzw v[5:6], off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x02,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xyzw v[5:6], off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x02,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0xc0,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xyzw v[5:6], off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0xc0,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xyzw v[5:6], v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x02,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xyzw v[5:6], v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x02,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3
-// GFX12: encoding: [0x03,0xc0,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0xc0,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0xc0,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_load_d16_format_xyzw v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_load_short_d16_hi v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_short_d16_hi v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x08,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_short_d16_hi v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x08,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_short_d16_hi v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x08,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_short_d16_hi v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0xc0,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_short_d16_hi v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0xc0,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_short_d16_hi v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x08,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_load_short_d16_hi v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x08,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_load_short_d16_hi v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0xc0,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_short_d16_hi v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0xc0,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_short_d16_hi v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0xc0,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_load_d16_hi_b16 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_load_format_d16_hi_x v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x09,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_hi_x v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x09,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_hi_x v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x09,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_hi_x v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x09,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_hi_x v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x80,0x09,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_hi_x v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x80,0x09,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_hi_x v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x09,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_hi_x v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x09,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_load_format_d16_hi_x v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x80,0x09,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_format_d16_hi_x v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x80,0x09,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_format_d16_hi_x v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x80,0x09,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_load_d16_hi_format_x v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x09,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte_d16_hi v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x08,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte_d16_hi v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x08,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte_d16_hi v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x08,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte_d16_hi v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x80,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte_d16_hi v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x80,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte_d16_hi v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x08,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte_d16_hi v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x08,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte_d16_hi v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x80,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x80,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x80,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_load_d16_hi_i8 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte_d16_hi v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x08,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte_d16_hi v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x08,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte_d16_hi v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x08,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte_d16_hi v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x40,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte_d16_hi v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x40,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte_d16_hi v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x08,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte_d16_hi v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x08,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte_d16_hi v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x40,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x40,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x40,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_load_d16_hi_u8 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_load_sbyte_d16 v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte_d16 v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x07,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte_d16 v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x07,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte_d16 v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x07,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte_d16 v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0xc0,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte_d16 v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0xc0,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte_d16 v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x07,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte_d16 v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x07,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte_d16 v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0xc0,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_sbyte_d16 v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0xc0,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_sbyte_d16 v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0xc0,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_load_d16_i8 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_load_ubyte_d16 v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte_d16 v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x07,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte_d16 v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x07,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte_d16 v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x07,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte_d16 v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x80,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte_d16 v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x80,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte_d16 v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x07,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte_d16 v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x07,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte_d16 v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x80,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_ubyte_d16 v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x80,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_ubyte_d16 v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x80,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_load_d16_u8 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_load_sbyte v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x04,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x04,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x04,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x40,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x40,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x04,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x04,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_load_sbyte v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x40,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_sbyte v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x40,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_sbyte v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x40,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_load_i8 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_load_sshort v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sshort v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x04,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sshort v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x04,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sshort v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x04,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sshort v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0xc0,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sshort v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0xc0,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_sshort v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x04,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_load_sshort v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x04,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_load_sshort v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0xc0,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_sshort v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0xc0,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_sshort v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0xc0,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_load_i16 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_load_ubyte v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x04,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x04,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x04,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x00,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x00,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x00,0x04,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x00,0x04,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_load_ubyte v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x00,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_ubyte v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x00,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_ubyte v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x00,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_load_u8 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_load_ushort v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ushort v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x04,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ushort v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x04,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ushort v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x04,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ushort v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x80,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ushort v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x80,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_load_ushort v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x04,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_load_ushort v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x04,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_load_ushort v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x80,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_ushort v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x80,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_load_ushort v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x80,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
+// GFX12: buffer_load_u16 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_store_byte v1, off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0x00,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_byte v255, off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0x00,0x06,0xc4,0xff,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_byte v1, off, s[16:19], s4 offset:8388607
-// GFX12: encoding: [0x04,0x00,0x06,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_byte v1, off, s[96:99], s4 offset:8388607
-// GFX12: encoding: [0x04,0x00,0x06,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_byte v1, off, s[12:15], s101 offset:8388607
-// GFX12: encoding: [0x65,0x00,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_byte v1, off, s[12:15], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x00,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_byte v1, v0, s[12:15], s4 idxen offset:8388607
-// GFX12: encoding: [0x04,0x00,0x06,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_store_byte v1, v0, s[12:15], s4 offen offset:8388607
-// GFX12: encoding: [0x04,0x00,0x06,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_store_byte v1, off, s[12:15], s4
-// GFX12: encoding: [0x04,0x00,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_byte v1, off, s[12:15], s4 offset:0
-// GFX12: encoding: [0x04,0x00,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_byte v1, off, s[12:15], s4 offset:7
-// GFX12: encoding: [0x04,0x00,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_store_b8 v1, off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0x00,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_store_short v1, off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0x40,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_short v255, off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0x40,0x06,0xc4,0xff,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_short v1, off, s[16:19], s4 offset:8388607
-// GFX12: encoding: [0x04,0x40,0x06,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_short v1, off, s[96:99], s4 offset:8388607
-// GFX12: encoding: [0x04,0x40,0x06,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_short v1, off, s[12:15], s101 offset:8388607
-// GFX12: encoding: [0x65,0x40,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_short v1, off, s[12:15], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x40,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_short v1, v0, s[12:15], s4 idxen offset:8388607
-// GFX12: encoding: [0x04,0x40,0x06,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_store_short v1, v0, s[12:15], s4 offen offset:8388607
-// GFX12: encoding: [0x04,0x40,0x06,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_store_short v1, off, s[12:15], s4
-// GFX12: encoding: [0x04,0x40,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_short v1, off, s[12:15], s4 offset:0
-// GFX12: encoding: [0x04,0x40,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_short v1, off, s[12:15], s4 offset:7
-// GFX12: encoding: [0x04,0x40,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_store_b16 v1, off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0x40,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_store_dword v1, off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0x80,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dword v255, off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0x80,0x06,0xc4,0xff,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dword v1, off, s[16:19], s4 offset:8388607
-// GFX12: encoding: [0x04,0x80,0x06,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dword v1, off, s[96:99], s4 offset:8388607
-// GFX12: encoding: [0x04,0x80,0x06,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dword v1, off, s[12:15], s101 offset:8388607
-// GFX12: encoding: [0x65,0x80,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dword v1, off, s[12:15], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x80,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dword v1, v0, s[12:15], s4 idxen offset:8388607
-// GFX12: encoding: [0x04,0x80,0x06,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_store_dword v1, v0, s[12:15], s4 offen offset:8388607
-// GFX12: encoding: [0x04,0x80,0x06,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_store_dword v1, off, s[12:15], s4
-// GFX12: encoding: [0x04,0x80,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_dword v1, off, s[12:15], s4 offset:0
-// GFX12: encoding: [0x04,0x80,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_dword v1, off, s[12:15], s4 offset:7
-// GFX12: encoding: [0x04,0x80,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_store_b32 v1, off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0x80,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0xc0,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx2 v[254:255], off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0xc0,0x06,0xc4,0xfe,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx2 v[1:2], off, s[16:19], s4 offset:8388607
-// GFX12: encoding: [0x04,0xc0,0x06,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx2 v[1:2], off, s[96:99], s4 offset:8388607
-// GFX12: encoding: [0x04,0xc0,0x06,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx2 v[1:2], off, s[12:15], s101 offset:8388607
-// GFX12: encoding: [0x65,0xc0,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx2 v[1:2], off, s[12:15], m0 offset:8388607
-// GFX12: encoding: [0x7d,0xc0,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx2 v[1:2], v0, s[12:15], s4 idxen offset:8388607
-// GFX12: encoding: [0x04,0xc0,0x06,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx2 v[1:2], v0, s[12:15], s4 offen offset:8388607
-// GFX12: encoding: [0x04,0xc0,0x06,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx2 v[1:2], off, s[12:15], s4
-// GFX12: encoding: [0x04,0xc0,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:0
-// GFX12: encoding: [0x04,0xc0,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:7
-// GFX12: encoding: [0x04,0xc0,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_store_b64 v[1:2], off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0xc0,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0x00,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx3 v[253:255], off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0x00,0x07,0xc4,0xfd,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx3 v[1:3], off, s[16:19], s4 offset:8388607
-// GFX12: encoding: [0x04,0x00,0x07,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx3 v[1:3], off, s[96:99], s4 offset:8388607
-// GFX12: encoding: [0x04,0x00,0x07,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx3 v[1:3], off, s[12:15], s101 offset:8388607
-// GFX12: encoding: [0x65,0x00,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx3 v[1:3], off, s[12:15], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x00,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx3 v[1:3], v0, s[12:15], s4 idxen offset:8388607
-// GFX12: encoding: [0x04,0x00,0x07,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx3 v[1:3], v0, s[12:15], s4 offen offset:8388607
-// GFX12: encoding: [0x04,0x00,0x07,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx3 v[1:3], off, s[12:15], s4
-// GFX12: encoding: [0x04,0x00,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:0
-// GFX12: encoding: [0x04,0x00,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:7
-// GFX12: encoding: [0x04,0x00,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_store_b96 v[1:3], off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0x00,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0x40,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx4 v[252:255], off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0x40,0x07,0xc4,0xfc,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx4 v[1:4], off, s[16:19], s4 offset:8388607
-// GFX12: encoding: [0x04,0x40,0x07,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx4 v[1:4], off, s[96:99], s4 offset:8388607
-// GFX12: encoding: [0x04,0x40,0x07,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx4 v[1:4], off, s[12:15], s101 offset:8388607
-// GFX12: encoding: [0x65,0x40,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx4 v[1:4], off, s[12:15], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x40,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx4 v[1:4], v0, s[12:15], s4 idxen offset:8388607
-// GFX12: encoding: [0x04,0x40,0x07,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx4 v[1:4], v0, s[12:15], s4 offen offset:8388607
-// GFX12: encoding: [0x04,0x40,0x07,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_store_dwordx4 v[1:4], off, s[12:15], s4
-// GFX12: encoding: [0x04,0x40,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:0
-// GFX12: encoding: [0x04,0x40,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:7
-// GFX12: encoding: [0x04,0x40,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_store_b128 v[1:4], off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0x40,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_store_format_d16_x v1, off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0x00,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_x v255, off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0x00,0x03,0xc4,0xff,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_x v1, off, s[16:19], s4 offset:8388607
-// GFX12: encoding: [0x04,0x00,0x03,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_x v1, off, s[96:99], s4 offset:8388607
-// GFX12: encoding: [0x04,0x00,0x03,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_x v1, off, s[12:15], s101 offset:8388607
-// GFX12: encoding: [0x65,0x00,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_x v1, off, s[12:15], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x00,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_x v1, v0, s[12:15], s4 idxen offset:8388607
-// GFX12: encoding: [0x04,0x00,0x03,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_x v1, v0, s[12:15], s4 offen offset:8388607
-// GFX12: encoding: [0x04,0x00,0x03,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_x v1, off, s[12:15], s4
-// GFX12: encoding: [0x04,0x00,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_format_d16_x v1, off, s[12:15], s4 offset:0
-// GFX12: encoding: [0x04,0x00,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_format_d16_x v1, off, s[12:15], s4 offset:7
-// GFX12: encoding: [0x04,0x00,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_store_d16_format_x v1, off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0x00,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0x40,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xy v255, off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0x40,0x03,0xc4,0xff,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xy v1, off, s[16:19], s4 offset:8388607
-// GFX12: encoding: [0x04,0x40,0x03,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xy v1, off, s[96:99], s4 offset:8388607
-// GFX12: encoding: [0x04,0x40,0x03,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xy v1, off, s[12:15], s101 offset:8388607
-// GFX12: encoding: [0x65,0x40,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xy v1, off, s[12:15], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x40,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xy v1, v0, s[12:15], s4 idxen offset:8388607
-// GFX12: encoding: [0x04,0x40,0x03,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xy v1, v0, s[12:15], s4 offen offset:8388607
-// GFX12: encoding: [0x04,0x40,0x03,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xy v1, off, s[12:15], s4
-// GFX12: encoding: [0x04,0x40,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:0
-// GFX12: encoding: [0x04,0x40,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:7
-// GFX12: encoding: [0x04,0x40,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_store_d16_format_xy v1, off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0x40,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0x80,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xyz v[254:255], off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0x80,0x03,0xc4,0xfe,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xyz v[1:2], off, s[16:19], s4 offset:8388607
-// GFX12: encoding: [0x04,0x80,0x03,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xyz v[1:2], off, s[96:99], s4 offset:8388607
-// GFX12: encoding: [0x04,0x80,0x03,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xyz v[1:2], off, s[12:15], s101 offset:8388607
-// GFX12: encoding: [0x65,0x80,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xyz v[1:2], off, s[12:15], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x80,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xyz v[1:2], v0, s[12:15], s4 idxen offset:8388607
-// GFX12: encoding: [0x04,0x80,0x03,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xyz v[1:2], v0, s[12:15], s4 offen offset:8388607
-// GFX12: encoding: [0x04,0x80,0x03,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4
-// GFX12: encoding: [0x04,0x80,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:0
-// GFX12: encoding: [0x04,0x80,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:7
-// GFX12: encoding: [0x04,0x80,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_store_d16_format_xyz v[1:2], off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0x80,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0xc0,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xyzw v[254:255], off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0xc0,0x03,0xc4,0xfe,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xyzw v[1:2], off, s[16:19], s4 offset:8388607
-// GFX12: encoding: [0x04,0xc0,0x03,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xyzw v[1:2], off, s[96:99], s4 offset:8388607
-// GFX12: encoding: [0x04,0xc0,0x03,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s101 offset:8388607
-// GFX12: encoding: [0x65,0xc0,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xyzw v[1:2], off, s[12:15], m0 offset:8388607
-// GFX12: encoding: [0x7d,0xc0,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xyzw v[1:2], v0, s[12:15], s4 idxen offset:8388607
-// GFX12: encoding: [0x04,0xc0,0x03,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xyzw v[1:2], v0, s[12:15], s4 offen offset:8388607
-// GFX12: encoding: [0x04,0xc0,0x03,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4
-// GFX12: encoding: [0x04,0xc0,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:0
-// GFX12: encoding: [0x04,0xc0,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:7
-// GFX12: encoding: [0x04,0xc0,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_store_d16_format_xyzw v[1:2], off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0xc0,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0x00,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_byte_d16_hi v255, off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0x00,0x09,0xc4,0xff,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_byte_d16_hi v1, off, s[16:19], s4 offset:8388607
-// GFX12: encoding: [0x04,0x00,0x09,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_byte_d16_hi v1, off, s[96:99], s4 offset:8388607
-// GFX12: encoding: [0x04,0x00,0x09,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_byte_d16_hi v1, off, s[12:15], s101 offset:8388607
-// GFX12: encoding: [0x65,0x00,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_byte_d16_hi v1, off, s[12:15], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x00,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_byte_d16_hi v1, v0, s[12:15], s4 idxen offset:8388607
-// GFX12: encoding: [0x04,0x00,0x09,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_store_byte_d16_hi v1, v0, s[12:15], s4 offen offset:8388607
-// GFX12: encoding: [0x04,0x00,0x09,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_store_byte_d16_hi v1, off, s[12:15], s4
-// GFX12: encoding: [0x04,0x00,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:0
-// GFX12: encoding: [0x04,0x00,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:7
-// GFX12: encoding: [0x04,0x00,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_store_d16_hi_b8 v1, off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0x00,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0x40,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_short_d16_hi v255, off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0x40,0x09,0xc4,0xff,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_short_d16_hi v1, off, s[16:19], s4 offset:8388607
-// GFX12: encoding: [0x04,0x40,0x09,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_short_d16_hi v1, off, s[96:99], s4 offset:8388607
-// GFX12: encoding: [0x04,0x40,0x09,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_short_d16_hi v1, off, s[12:15], s101 offset:8388607
-// GFX12: encoding: [0x65,0x40,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_short_d16_hi v1, off, s[12:15], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x40,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_short_d16_hi v1, v0, s[12:15], s4 idxen offset:8388607
-// GFX12: encoding: [0x04,0x40,0x09,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_store_short_d16_hi v1, v0, s[12:15], s4 offen offset:8388607
-// GFX12: encoding: [0x04,0x40,0x09,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_store_short_d16_hi v1, off, s[12:15], s4
-// GFX12: encoding: [0x04,0x40,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:0
-// GFX12: encoding: [0x04,0x40,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:7
-// GFX12: encoding: [0x04,0x40,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_store_d16_hi_b16 v1, off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0x40,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0xc0,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_hi_x v255, off, s[12:15], s4 offset:8388607
-// GFX12: encoding: [0x04,0xc0,0x09,0xc4,0xff,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_hi_x v1, off, s[16:19], s4 offset:8388607
-// GFX12: encoding: [0x04,0xc0,0x09,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_hi_x v1, off, s[96:99], s4 offset:8388607
-// GFX12: encoding: [0x04,0xc0,0x09,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_hi_x v1, off, s[12:15], s101 offset:8388607
-// GFX12: encoding: [0x65,0xc0,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_hi_x v1, off, s[12:15], m0 offset:8388607
-// GFX12: encoding: [0x7d,0xc0,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_hi_x v1, v0, s[12:15], s4 idxen offset:8388607
-// GFX12: encoding: [0x04,0xc0,0x09,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_hi_x v1, v0, s[12:15], s4 offen offset:8388607
-// GFX12: encoding: [0x04,0xc0,0x09,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_store_format_d16_hi_x v1, off, s[12:15], s4
-// GFX12: encoding: [0x04,0xc0,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:0
-// GFX12: encoding: [0x04,0xc0,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:7
-// GFX12: encoding: [0x04,0xc0,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00]
+// GFX12: buffer_store_d16_hi_format_x v1, off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0xc0,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_add v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_add v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x0d,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_add v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x0d,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_add v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x0d,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_add v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x40,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_add v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x40,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_add v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x0d,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_add v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x0d,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_add v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x40,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_add v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x40,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_add v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x40,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_add v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x40,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_add_u32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_add_x2 v[254:255], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x10,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_add_x2 v[5:6], off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x10,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_add_x2 v[5:6], off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x10,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_add_x2 v[5:6], off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0xc0,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_add_x2 v[5:6], off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0xc0,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_add_x2 v[5:6], v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x10,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_add_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x10,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_add_x2 v[5:6], off, s[8:11], s3
-// GFX12: encoding: [0x03,0xc0,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0xc0,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0xc0,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x10,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_add_u64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_and v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_and v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x0f,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_and v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x0f,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_and v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x0f,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_and v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x00,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_and v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x00,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_and v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x00,0x0f,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_and v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x00,0x0f,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_and v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x00,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_and v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x00,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_and v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x00,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_and v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x00,0x0f,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_and_b32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_and_x2 v[254:255], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x12,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_and_x2 v[5:6], off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x12,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_and_x2 v[5:6], off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x12,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_and_x2 v[5:6], off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x40,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_and_x2 v[5:6], off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x40,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_and_x2 v[5:6], v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x12,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_and_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x12,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_and_x2 v[5:6], off, s[8:11], s3
-// GFX12: encoding: [0x03,0x40,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x40,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x40,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x40,0x12,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_and_b64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_cmpswap v[254:255], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x0d,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_cmpswap v[5:6], off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x0d,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_cmpswap v[5:6], off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x0d,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_cmpswap v[5:6], off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x00,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_cmpswap v[5:6], off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x00,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_cmpswap v[5:6], v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x00,0x0d,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_cmpswap v[5:6], v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x00,0x0d,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_cmpswap v[5:6], off, s[8:11], s3
-// GFX12: encoding: [0x03,0x00,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x00,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x00,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x00,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_cmpswap_b32 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_cmpswap_x2 v[252:255], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x10,0xc4,0xfc,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_cmpswap_x2 v[5:8], off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x10,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_cmpswap_x2 v[5:8], off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x10,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x80,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x80,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_cmpswap_x2 v[5:8], v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x10,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_cmpswap_x2 v[5:8], v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x10,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3
-// GFX12: encoding: [0x03,0x80,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x80,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x80,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x80,0x10,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_cmpswap_b64 v[5:8], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_csub v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_csub v255, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0xff,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_csub v5, off, s[12:15], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x18,0x90,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_csub v5, off, s[96:99], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0xc0,0x90,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_csub v5, off, s[8:11], s101 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x65,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_csub v5, off, s[8:11], m0 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x7d,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_csub v5, v0, s[8:11], s3 idxen offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_csub v5, v0, s[8:11], s3 offen offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_csub v5, off, s[8:11], s3 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_csub v5, off, s[8:11], s3 offset:0 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_csub v5, off, s[8:11], s3 offset:7 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_csub v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_sub_clamp_u32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_csub_u32 v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_csub_u32 v255, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0xff,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_csub_u32 v5, off, s[12:15], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x18,0x90,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_csub_u32 v5, off, s[96:99], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0xc0,0x90,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_csub_u32 v5, off, s[8:11], s101 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x65,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_csub_u32 v5, off, s[8:11], m0 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x7d,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_csub_u32 v5, v0, s[8:11], s3 idxen offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_csub_u32 v5, v0, s[8:11], s3 offen offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_csub_u32 v5, off, s[8:11], s3 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_csub_u32 v5, off, s[8:11], s3 offset:0 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_csub_u32 v5, off, s[8:11], s3 offset:7 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_csub_u32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_sub_clamp_u32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_dec v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_dec v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x10,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_dec v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x10,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_dec v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x10,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_dec v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x00,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_dec v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x00,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_dec v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x00,0x10,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_dec v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x00,0x10,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_dec v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x00,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_dec v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x00,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_dec v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x00,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_dec v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x00,0x10,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_dec_u32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_dec_x2 v[254:255], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x13,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_dec_x2 v[5:6], off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x13,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_dec_x2 v[5:6], off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x13,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_dec_x2 v[5:6], off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x40,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_dec_x2 v[5:6], off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x40,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_dec_x2 v[5:6], v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x13,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_dec_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x13,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3
-// GFX12: encoding: [0x03,0x40,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x40,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x40,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x40,0x13,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_dec_u64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_inc v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_inc v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x0f,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_inc v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x0f,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_inc v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x0f,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_inc v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0xc0,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_inc v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0xc0,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_inc v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x0f,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_inc v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x0f,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_inc v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0xc0,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_inc v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0xc0,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_inc v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0xc0,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_inc v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x0f,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_inc_u32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_inc_x2 v[254:255], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x13,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_inc_x2 v[5:6], off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x13,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_inc_x2 v[5:6], off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x13,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_inc_x2 v[5:6], off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x00,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_inc_x2 v[5:6], off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x00,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_inc_x2 v[5:6], v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x00,0x13,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_inc_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x00,0x13,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3
-// GFX12: encoding: [0x03,0x00,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x00,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x00,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x00,0x13,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_inc_u64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_fmax v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_fmax v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x14,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_fmax v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_fmax v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_fmax v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_fmax v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_fmax v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_fmax v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_fmax v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_fmax v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_fmax v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_fmax v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_max_num_f32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_max_f32 v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_max_f32 v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x14,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_max_f32 v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_max_f32 v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_max_f32 v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_max_f32 v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_max_f32 v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_max_f32 v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_max_f32 v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_max_f32 v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_max_f32 v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_max_f32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_max_num_f32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_smax v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smax v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x0e,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smax v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x0e,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smax v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x0e,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smax v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x80,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smax v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x80,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smax v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x0e,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smax v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x0e,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smax v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x80,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_smax v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x80,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_smax v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x80,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_smax v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x80,0x0e,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_max_i32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smax_x2 v[254:255], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x11,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smax_x2 v[5:6], off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x11,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smax_x2 v[5:6], off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x11,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smax_x2 v[5:6], off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0xc0,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smax_x2 v[5:6], off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0xc0,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smax_x2 v[5:6], v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x11,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smax_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x11,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3
-// GFX12: encoding: [0x03,0xc0,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0xc0,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0xc0,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x11,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_max_i64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_umax v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umax v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x0e,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umax v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x0e,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umax v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x0e,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umax v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0xc0,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umax v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0xc0,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umax v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x0e,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umax v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x0e,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umax v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0xc0,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_umax v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0xc0,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_umax v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0xc0,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_umax v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x0e,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_max_u32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umax_x2 v[254:255], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x12,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umax_x2 v[5:6], off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x12,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umax_x2 v[5:6], off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x12,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umax_x2 v[5:6], off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x00,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umax_x2 v[5:6], off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x00,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umax_x2 v[5:6], v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x00,0x12,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umax_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x00,0x12,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3
-// GFX12: encoding: [0x03,0x00,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x00,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x00,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x00,0x12,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_max_u64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_fmin v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_fmin v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x14,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_fmin v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_fmin v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_fmin v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_fmin v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_fmin v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_fmin v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_fmin v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_fmin v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_fmin v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_fmin v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_min_num_f32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_min_f32 v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x14,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_min_f32 v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_min_f32 v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_min_f32 v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_min_f32 v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_min_f32 v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_min_f32 v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_min_f32 v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_min_num_f32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_smin v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smin v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x0e,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smin v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x0e,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smin v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x0e,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smin v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x00,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smin v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x00,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smin v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x00,0x0e,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smin v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x00,0x0e,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smin v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x00,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_smin v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x00,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_smin v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x00,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_smin v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x00,0x0e,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_min_i32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smin_x2 v[254:255], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x11,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smin_x2 v[5:6], off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x11,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smin_x2 v[5:6], off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x11,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smin_x2 v[5:6], off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x40,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smin_x2 v[5:6], off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x40,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smin_x2 v[5:6], v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x11,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smin_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x11,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3
-// GFX12: encoding: [0x03,0x40,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x40,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x40,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x40,0x11,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_min_i64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_umin v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umin v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x0e,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umin v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x0e,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umin v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x0e,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umin v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x40,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umin v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x40,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umin v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x0e,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umin v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x0e,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umin v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x40,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_umin v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x40,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_umin v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x40,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_umin v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x40,0x0e,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_min_u32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umin_x2 v[254:255], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x11,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umin_x2 v[5:6], off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x11,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umin_x2 v[5:6], off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x11,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umin_x2 v[5:6], off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x80,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umin_x2 v[5:6], off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x80,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umin_x2 v[5:6], v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x11,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umin_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x11,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3
-// GFX12: encoding: [0x03,0x80,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x80,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x80,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x80,0x11,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_min_u64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_or v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_or v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x0f,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_or v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x0f,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_or v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x0f,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_or v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x40,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_or v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x40,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_or v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x0f,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_or v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x0f,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_or v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x40,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_or v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x40,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_or v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x40,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_or v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x40,0x0f,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_or_b32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_or_x2 v[254:255], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x12,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_or_x2 v[5:6], off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x12,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_or_x2 v[5:6], off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x12,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_or_x2 v[5:6], off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x80,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_or_x2 v[5:6], off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x80,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_or_x2 v[5:6], v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x12,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_or_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x12,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_or_x2 v[5:6], off, s[8:11], s3
-// GFX12: encoding: [0x03,0x80,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x80,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x80,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x80,0x12,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_or_b64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_sub v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_sub v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x0d,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_sub v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x0d,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_sub v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x0d,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_sub v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x80,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_sub v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x80,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_sub v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x0d,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_sub v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x0d,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_sub v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x80,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_sub v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x80,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_sub v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x80,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_sub v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x80,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_sub_u32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_sub_x2 v[254:255], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x11,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_sub_x2 v[5:6], off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x11,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_sub_x2 v[5:6], off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x00,0x11,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_sub_x2 v[5:6], off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x00,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_sub_x2 v[5:6], off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x00,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_sub_x2 v[5:6], v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x00,0x11,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_sub_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x00,0x11,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3
-// GFX12: encoding: [0x03,0x00,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x00,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x00,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x00,0x11,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_sub_u64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_swap v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x0c,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_swap v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x0c,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_swap v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x0c,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_swap v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x0c,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_swap v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0xc0,0x0c,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_swap v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0xc0,0x0c,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_swap v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x0c,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_swap v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x0c,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_swap v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0xc0,0x0c,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_swap v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0xc0,0x0c,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_swap v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0xc0,0x0c,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_swap v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x0c,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_swap_b32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x0c,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_swap_x2 v[254:255], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x10,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_swap_x2 v[5:6], off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x10,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_swap_x2 v[5:6], off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x40,0x10,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_swap_x2 v[5:6], off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x40,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_swap_x2 v[5:6], off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x40,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_swap_x2 v[5:6], v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x10,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_swap_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x40,0x10,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3
-// GFX12: encoding: [0x03,0x40,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x40,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x40,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x40,0x10,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_swap_b64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_xor v5, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_xor v255, off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x0f,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_xor v5, off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x0f,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_xor v5, off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0x80,0x0f,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_xor v5, off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0x80,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_xor v5, off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0x80,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_xor v5, v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x0f,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_xor v5, v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0x80,0x0f,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_xor v5, off, s[8:11], s3
-// GFX12: encoding: [0x03,0x80,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_xor v5, off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0x80,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_xor v5, off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0x80,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_xor v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0x80,0x0f,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_xor_b32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_xor_x2 v[254:255], off, s[8:11], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x12,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_xor_x2 v[5:6], off, s[12:15], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x12,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_xor_x2 v[5:6], off, s[96:99], s3 offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x12,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_xor_x2 v[5:6], off, s[8:11], s101 offset:8388607
-// GFX12: encoding: [0x65,0xc0,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_xor_x2 v[5:6], off, s[8:11], m0 offset:8388607
-// GFX12: encoding: [0x7d,0xc0,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_xor_x2 v[5:6], v0, s[8:11], s3 idxen offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x12,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_xor_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607
-// GFX12: encoding: [0x03,0xc0,0x12,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
-
-buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3
-// GFX12: encoding: [0x03,0xc0,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:0
-// GFX12: encoding: [0x03,0xc0,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
-
-buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:7
-// GFX12: encoding: [0x03,0xc0,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
-
-buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
-// GFX12: encoding: [0x03,0xc0,0x12,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+// GFX12: buffer_atomic_xor_b64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vflat_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_vflat_alias.s
index 3633f8815c00..df453d96117a 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vflat_alias.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vflat_alias.s
@@ -1,43 +1,34 @@
; RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck -check-prefix=GFX12 %s
global_atomic_csub v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN
-// GFX12: global_atomic_sub_clamp_u32 {{.*}} encoding: [0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
-
-global_atomic_csub v0, v2, s[0:1] offset:64
-// GFX12: global_atomic_sub_clamp_u32 {{.*}} encoding: [0x00,0xc0,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+// GFX12: global_atomic_sub_clamp_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
global_atomic_csub_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN
-// GFX12: global_atomic_sub_clamp_u32 {{.*}} encoding: [0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
-
-global_atomic_csub_u32 v0, v2, s[0:1] offset:64
-// GFX12: global_atomic_sub_clamp_u32 {{.*}} encoding: [0x00,0xc0,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+// GFX12: global_atomic_sub_clamp_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
flat_atomic_csub_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN
-// GFX12: flat_atomic_sub_clamp_u32 {{.*}} encoding: [0x7c,0xc0,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
-
-flat_atomic_csub_u32 v[0:1], v2 offset:64
-// GFX12: flat_atomic_sub_clamp_u32 {{.*}} encoding: [0x7c,0xc0,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+// GFX12: flat_atomic_sub_clamp_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
flat_atomic_fmax v[0:1], v2 offset:64
-// GFX12: flat_atomic_max_num_f32 {{.*}} encoding: [0x7c,0x80,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+// GFX12: flat_atomic_max_num_f32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x80,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
flat_atomic_max_f32 v[0:1], v2 offset:64
-// GFX12: flat_atomic_max_num_f32 {{.*}} encoding: [0x7c,0x80,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+// GFX12: flat_atomic_max_num_f32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x80,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
flat_atomic_fmin v[0:1], v2 offset:64
-// GFX12: flat_atomic_min_num_f32 {{.*}} encoding: [0x7c,0x40,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+// GFX12: flat_atomic_min_num_f32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x40,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
flat_atomic_min_f32 v[0:1], v2 offset:64
-// GFX12: flat_atomic_min_num_f32 {{.*}} encoding: [0x7c,0x40,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+// GFX12: flat_atomic_min_num_f32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x40,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
global_atomic_fmax v0, v2, s[0:1] offset:64
-// GFX12: global_atomic_max_num_f32 {{.*}} encoding: [0x00,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+// GFX12: global_atomic_max_num_f32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
global_atomic_max_f32 v0, v2, s[0:1] offset:64
-// GFX12: global_atomic_max_num_f32 {{.*}} encoding: [0x00,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+// GFX12: global_atomic_max_num_f32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
global_atomic_fmin v0, v2, s[0:1] offset:64
-// GFX12: global_atomic_min_num_f32 {{.*}} encoding: [0x00,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+// GFX12: global_atomic_min_num_f32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
global_atomic_min_f32 v0, v2, s[0:1] offset:64
-// GFX12: global_atomic_min_num_f32 {{.*}} encoding: [0x00,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+// GFX12: global_atomic_min_num_f32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s
index 9211877774aa..f2ae7dd8c35a 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s
@@ -43,7 +43,7 @@ v_min_f64 v[5:6], s[2:3], s[4:5]
// GFX12: v_min_num_f64_e64 v[5:6], s[2:3], s[4:5] ; encoding: [0x05,0x00,0x0d,0xd5,0x02,0x08,0x00,0x00]
v_cvt_pknorm_i16_f16 v5, v1, v2
-// GFX11: v_cvt_pk_norm_i16_f16 {{.*}} encoding: [0x05,0x00,0x12,0xd7,0x01,0x05,0x02,0x00]
+// GFX12: v_cvt_pk_norm_i16_f16 v5, v1, v2 ; encoding: [0x05,0x00,0x12,0xd7,0x01,0x05,0x02,0x00]
v_cvt_pknorm_u16_f16 v5, v1, v2
-// GFX11: v_cvt_pk_norm_u16_f16 {{.*}} encoding: [0x05,0x00,0x13,0xd7,0x01,0x05,0x02,0x00]
+// GFX12: v_cvt_pk_norm_u16_f16 v5, v1, v2 ; encoding: [0x05,0x00,0x13,0xd7,0x01,0x05,0x02,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx940_asm_features.s b/llvm/test/MC/AMDGPU/gfx940_asm_features.s
index 5ee9480677be..e208b6cf903d 100644
--- a/llvm/test/MC/AMDGPU/gfx940_asm_features.s
+++ b/llvm/test/MC/AMDGPU/gfx940_asm_features.s
@@ -197,23 +197,23 @@ scratch_load_lds_ushort v2, off
// GFX940: scratch_load_lds_sshort v2, off ; encoding: [0x00,0x60,0xa4,0xdc,0x02,0x00,0x7f,0x00]
scratch_load_lds_sshort v2, off
-// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
// GFX940: s_getreg_b32 s1, hwreg(HW_REG_XCC_ID) ; encoding: [0x14,0xf8,0x81,0xb8]
s_getreg_b32 s1, hwreg(HW_REG_XCC_ID)
-// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
// GFX940: s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA) ; encoding: [0x15,0xf8,0x81,0xb8]
s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA)
-// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
// GFX940: s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA1) ; encoding: [0x16,0xf8,0x81,0xb8]
s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA1)
-// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
// GFX940: s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_LO) ; encoding: [0x17,0xf8,0x81,0xb8]
s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_LO)
-// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
// GFX940: s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_HI) ; encoding: [0x18,0xf8,0x81,0xb8]
s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_HI)
diff --git a/llvm/test/MC/AMDGPU/gfx940_err.s b/llvm/test/MC/AMDGPU/gfx940_err.s
index 515b89513a80..000f3decf960 100644
--- a/llvm/test/MC/AMDGPU/gfx940_err.s
+++ b/llvm/test/MC/AMDGPU/gfx940_err.s
@@ -97,22 +97,22 @@ v_cvt_pk_fp8_f32 v1, v2, v3 mul:2
// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
s_getreg_b32 s1, hwreg(HW_REG_FLAT_SCR_LO)
-// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
s_getreg_b32 s1, hwreg(HW_REG_FLAT_SCR_HI)
-// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
s_getreg_b32 s1, hwreg(HW_REG_XNACK_MASK)
-// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
s_getreg_b32 s1, hwreg(HW_REG_HW_ID1)
-// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
s_getreg_b32 s1, hwreg(HW_REG_HW_ID2)
-// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
s_getreg_b32 s1, hwreg(HW_REG_POPS_PACKER)
-// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
ds_ordered_count v5, v1 offset:65535 gds
// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/sopk-err.s b/llvm/test/MC/AMDGPU/sopk-err.s
index 504ee1d11cbc..cd92343b0e7f 100644
--- a/llvm/test/MC/AMDGPU/sopk-err.s
+++ b/llvm/test/MC/AMDGPU/sopk-err.s
@@ -5,48 +5,127 @@
// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -show-encoding %s | FileCheck -check-prefix=GFX10 %s
// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck -check-prefix=GFX11 %s
-// RUN: not llvm-mc -triple=amdgcn %s 2>&1 | FileCheck -check-prefixes=GCN,SICIVI-ERR --implicit-check-not=error: %s
-// RUN: not llvm-mc -triple=amdgcn -mcpu=tahiti %s 2>&1 | FileCheck -check-prefixes=GCN,SICIVI-ERR --implicit-check-not=error: %s
-// RUN: not llvm-mc -triple=amdgcn -mcpu=tonga %s 2>&1 | FileCheck -check-prefixes=GCN,SICIVI-ERR --implicit-check-not=error: %s
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx900 %s 2>&1 | FileCheck -check-prefixes=GCN,GFX9-ERR --implicit-check-not=error: %s
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck -check-prefixes=GCN,GFX10-ERR --implicit-check-not=error: %s
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck -check-prefixes=GCN,GFX11-ERR --implicit-check-not=error: %s
+// RUN: not llvm-mc -triple=amdgcn %s 2>&1 | FileCheck -check-prefixes=GCN,SICIVI-ERR --implicit-check-not=error: --strict-whitespace %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=tahiti %s 2>&1 | FileCheck -check-prefixes=GCN,SICIVI-ERR --implicit-check-not=error: --strict-whitespace %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=tonga %s 2>&1 | FileCheck -check-prefixes=GCN,SICIVI-ERR --implicit-check-not=error: --strict-whitespace %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx900 %s 2>&1 | FileCheck -check-prefixes=GCN,GFX9-ERR --implicit-check-not=error: --strict-whitespace %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck -check-prefixes=GCN,GFX10-ERR --implicit-check-not=error: --strict-whitespace %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck -check-prefixes=GCN,GFX11-ERR --implicit-check-not=error: --strict-whitespace %s
s_setreg_b32 0x1f803, s2
// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid immediate: only 16-bit values are legal
+// GCN-NEXT: {{^}}s_setreg_b32 0x1f803, s2
+// GCN-NEXT: {{^}} ^
s_setreg_b32 typo(0x40), s2
-// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: expected a hwreg macro or an absolute expression
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: expected a hwreg macro, structured immediate or an absolute expression
+// GCN-NEXT: {{^}}s_setreg_b32 typo(0x40), s2
+// GCN-NEXT: {{^}} ^
s_setreg_b32 hwreg(0x40), s2
-// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid code of hardware register: only 6-bit values are legal
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: only 6-bit values are legal
+// GCN-NEXT: {{^}}s_setreg_b32 hwreg(0x40), s2
+// GCN-NEXT: {{^}} ^
+
+s_setreg_b32 {id: 0x40}, s2
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: only 6-bit values are legal
+// GCN-NEXT: {{^}}s_setreg_b32 {id: 0x40}, s2
+// GCN-NEXT: {{^}} ^
s_setreg_b32 hwreg(HW_REG_WRONG), s2
// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: expected a register name or an absolute expression
+// GCN-NEXT: {{^}}s_setreg_b32 hwreg(HW_REG_WRONG), s2
+// GCN-NEXT: {{^}} ^
s_setreg_b32 hwreg(1 2,3), s2
// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: expected a comma or a closing parenthesis
+// GCN-NEXT: {{^}}s_setreg_b32 hwreg(1 2,3), s2
+// GCN-NEXT: {{^}} ^
s_setreg_b32 hwreg(1,2 3), s2
// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: expected a comma
+// GCN-NEXT: {{^}}s_setreg_b32 hwreg(1,2 3), s2
+// GCN-NEXT: {{^}} ^
s_setreg_b32 hwreg(1,2,3, s2
// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: expected a closing parenthesis
+// GCN-NEXT: {{^}}s_setreg_b32 hwreg(1,2,3, s2
+// GCN-NEXT: {{^}} ^
+
+s_setreg_b32 {id: 1 offset: 2, size: 3}, s2
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: comma or closing brace expected
+// GCN-NEXT: {{^}}s_setreg_b32 {id: 1 offset: 2, size: 3}, s2
+// GCN-NEXT: {{^}} ^
+
+s_setreg_b32 {id: 1 offset: 2, size: 3}, s2
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: comma or closing brace expected
+// GCN-NEXT: {{^}}s_setreg_b32 {id: 1 offset: 2, size: 3}, s2
+// GCN-NEXT: {{^}} ^
+
+s_setreg_b32 {id 1, offset: 2, size: 3}, s2
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: colon expected
+// GCN-NEXT: {{^}}s_setreg_b32 {id 1, offset: 2, size: 3}, s2
+// GCN-NEXT: {{^}} ^
+
+s_setreg_b32 {id: 1, offset: 2, size: 3, s2
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: colon expected
+// GCN-NEXT: {{^}}s_setreg_b32 {id: 1, offset: 2, size: 3, s2
+// GCN-NEXT: {{^}} ^
+
+s_setreg_b32 {id: 1, offset: 2, blah: 3}, s2
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: unknown field
+// GCN-NEXT: {{^}}s_setreg_b32 {id: 1, offset: 2, blah: 3}, s2
+// GCN-NEXT: {{^}} ^
+
+s_setreg_b32 {id: 1, id: 2}, s2
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: duplicate field
+// GCN-NEXT: {{^}}s_setreg_b32 {id: 1, id: 2}, s2
+// GCN-NEXT: {{^}} ^
s_setreg_b32 hwreg(3,32,32), s2
// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bit offset: only 5-bit values are legal
+// GCN-NEXT: {{^}}s_setreg_b32 hwreg(3,32,32), s2
+// GCN-NEXT: {{^}} ^
+
+s_setreg_b32 {id: 3, offset: 32, size: 32}, s2
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bit offset: only 5-bit values are legal
+// GCN-NEXT: {{^}}s_setreg_b32 {id: 3, offset: 32, size: 32}, s2
+// GCN-NEXT: {{^}} ^
s_setreg_b32 hwreg(3,0,33), s2
// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bitfield width: only values from 1 to 32 are legal
+// GCN-NEXT: {{^}}s_setreg_b32 hwreg(3,0,33), s2
+// GCN-NEXT: {{^}} ^
+
+s_setreg_b32 {id: 3, offset: 0, size: 33}, s2
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bitfield width: only values from 1 to 32 are legal
+// GCN-NEXT: {{^}}s_setreg_b32 {id: 3, offset: 0, size: 33}, s2
+// GCN-NEXT: {{^}} ^
s_setreg_imm32_b32 0x1f803, 0xff
// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid immediate: only 16-bit values are legal
+// GCN-NEXT: {{^}}s_setreg_imm32_b32 0x1f803, 0xff
+// GCN-NEXT: {{^}} ^
s_setreg_imm32_b32 hwreg(3,0,33), 0xff
// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bitfield width: only values from 1 to 32 are legal
+// GCN-NEXT: {{^}}s_setreg_imm32_b32 hwreg(3,0,33), 0xff
+// GCN-NEXT: {{^}} ^
+
+s_setreg_imm32_b32 {id: 3, offset: 0, size: 33}, 0xff
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bitfield width: only values from 1 to 32 are legal
+// GCN-NEXT: {{^}}s_setreg_imm32_b32 {id: 3, offset: 0, size: 33}, 0xff
+// GCN-NEXT: {{^}} ^
s_getreg_b32 s2, hwreg(3,32,32)
// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bit offset: only 5-bit values are legal
+// GCN-NEXT: {{^}}s_getreg_b32 s2, hwreg(3,32,32)
+// GCN-NEXT: {{^}} ^
+
+s_getreg_b32 s2, {id: 3, offset: 32, size: 32}
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bit offset: only 5-bit values are legal
+// GCN-NEXT: {{^}}s_getreg_b32 s2, {id: 3, offset: 32, size: 32}
+// GCN-NEXT: {{^}} ^
s_cbranch_i_fork s[2:3], 0x6
// SICI: s_cbranch_i_fork s[2:3], 6 ; encoding: [0x06,0x00,0x82,0xb8]
@@ -57,69 +136,109 @@ s_cbranch_i_fork s[2:3], 0x6
s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES)
// GFX10: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES) ; encoding: [0x0f,0xf8,0x02,0xb9]
-// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES)
+// SICIVI-ERR-NEXT: {{^}} ^
// GFX9: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES) ; encoding: [0x0f,0xf8,0x82,0xb8]
// GFX11: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES) ; encoding: [0x0f,0xf8,0x82,0xb8]
s_getreg_b32 s2, hwreg(HW_REG_TBA_LO)
// GFX10: s_getreg_b32 s2, hwreg(HW_REG_TBA_LO) ; encoding: [0x10,0xf8,0x02,0xb9]
-// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// GFX11-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_TBA_LO)
+// SICIVI-ERR-NEXT: {{^}} ^
+// GFX11-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// GFX11-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_TBA_LO)
+// GFX11-ERR-NEXT: {{^}} ^
// GFX9: s_getreg_b32 s2, hwreg(HW_REG_TBA_LO) ; encoding: [0x10,0xf8,0x82,0xb8]
s_getreg_b32 s2, hwreg(HW_REG_TBA_HI)
// GFX10: s_getreg_b32 s2, hwreg(HW_REG_TBA_HI) ; encoding: [0x11,0xf8,0x02,0xb9]
-// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// GFX11-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_TBA_HI)
+// SICIVI-ERR-NEXT: {{^}} ^
+// GFX11-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// GFX11-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_TBA_HI)
+// GFX11-ERR-NEXT: {{^}} ^
// GFX9: s_getreg_b32 s2, hwreg(HW_REG_TBA_HI) ; encoding: [0x11,0xf8,0x82,0xb8]
s_getreg_b32 s2, hwreg(HW_REG_TMA_LO)
// GFX10: s_getreg_b32 s2, hwreg(HW_REG_TMA_LO) ; encoding: [0x12,0xf8,0x02,0xb9]
-// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// GFX11-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_TMA_LO)
+// SICIVI-ERR-NEXT: {{^}} ^
+// GFX11-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// GFX11-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_TMA_LO)
+// GFX11-ERR-NEXT: {{^}} ^
// GFX9: s_getreg_b32 s2, hwreg(HW_REG_TMA_LO) ; encoding: [0x12,0xf8,0x82,0xb8]
s_getreg_b32 s2, hwreg(HW_REG_TMA_HI)
// GFX10: s_getreg_b32 s2, hwreg(HW_REG_TMA_HI) ; encoding: [0x13,0xf8,0x02,0xb9]
-// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// GFX11-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_TMA_HI)
+// SICIVI-ERR-NEXT: {{^}} ^
+// GFX11-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// GFX11-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_TMA_HI)
+// GFX11-ERR-NEXT: {{^}} ^
// GFX9: s_getreg_b32 s2, hwreg(HW_REG_TMA_HI) ; encoding: [0x13,0xf8,0x82,0xb8]
s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_LO)
// GFX10: s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_LO) ; encoding: [0x14,0xf8,0x02,0xb9]
-// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// GFX9-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_LO)
+// SICIVI-ERR-NEXT: {{^}} ^
+// GFX9-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// GFX9-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_LO)
+// GFX9-ERR-NEXT: {{^}} ^
// GFX11: s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_LO) ; encoding: [0x14,0xf8,0x82,0xb8]
s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_HI)
// GFX10: s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_HI) ; encoding: [0x15,0xf8,0x02,0xb9]
-// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// GFX9-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_HI)
+// SICIVI-ERR-NEXT: {{^}} ^
+// GFX9-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// GFX9-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_HI)
+// GFX9-ERR-NEXT: {{^}} ^
// GFX11: s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_HI) ; encoding: [0x15,0xf8,0x82,0xb8]
s_getreg_b32 s2, hwreg(HW_REG_XNACK_MASK)
// GFX10: s_getreg_b32 s2, hwreg(HW_REG_XNACK_MASK) ; encoding: [0x16,0xf8,0x02,0xb9]
-// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// GFX9-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// GFX11-ERR: :[[@LINE-4]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_XNACK_MASK)
+// SICIVI-ERR-NEXT: {{^}} ^
+// GFX9-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// GFX11-ERR: :[[@LINE-6]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
s_getreg_b32 s2, hwreg(HW_REG_POPS_PACKER)
// GFX10: s_getreg_b32 s2, hwreg(HW_REG_POPS_PACKER) ; encoding: [0x19,0xf8,0x02,0xb9]
-// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// GFX9-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// GFX11-ERR: :[[@LINE-4]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_POPS_PACKER)
+// SICIVI-ERR-NEXT: {{^}} ^
+// GFX9-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// GFX11-ERR: :[[@LINE-6]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
s_cmpk_le_u32 s2, -1
// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GCN-NEXT: {{^}}s_cmpk_le_u32 s2, -1
+// GCN-NEXT: {{^}} ^
s_cmpk_le_u32 s2, 0x1ffff
// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GCN-NEXT: {{^}}s_cmpk_le_u32 s2, 0x1ffff
+// GCN-NEXT: {{^}} ^
s_cmpk_le_u32 s2, 0x10000
// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GCN-NEXT: {{^}}s_cmpk_le_u32 s2, 0x10000
+// GCN-NEXT: {{^}} ^
s_mulk_i32 s2, 0xFFFFFFFFFFFF0000
// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GCN-NEXT: {{^}}s_mulk_i32 s2, 0xFFFFFFFFFFFF0000
+// GCN-NEXT: {{^}} ^
s_mulk_i32 s2, 0x10000
// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GCN-NEXT: {{^}}s_mulk_i32 s2, 0x10000
+// GCN-NEXT: {{^}} ^
diff --git a/llvm/test/MC/AMDGPU/sopk.s b/llvm/test/MC/AMDGPU/sopk.s
index 2b20c35aa771..c912b83ca61c 100644
--- a/llvm/test/MC/AMDGPU/sopk.s
+++ b/llvm/test/MC/AMDGPU/sopk.s
@@ -158,6 +158,12 @@ s_getreg_b32 s2, hwreg(51, 1, 31)
// GFX10: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9]
// GFX11: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8]
+s_getreg_b32 s2, {id: 51, offset: 1, size: 31}
+// SICI: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9]
+// VI9: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8]
+// GFX10: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9]
+// GFX11: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8]
+
// HW register code of unknown HW register, default offset/width
s_getreg_b32 s2, hwreg(51)
// SICI: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x02,0xb9]
@@ -165,6 +171,27 @@ s_getreg_b32 s2, hwreg(51)
// GFX10: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x02,0xb9]
// GFX11: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x82,0xb8]
+// Structured form using default values.
+s_getreg_b32 s2, {id: 51}
+// SICI: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x02,0xb9]
+// VI9: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x82,0xb8]
+// GFX10: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x02,0xb9]
+// GFX11: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x82,0xb8]
+
+// Fields may come in any order.
+s_getreg_b32 s2, {size: 32, id: 51}
+// SICI: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x02,0xb9]
+// VI9: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x82,0xb8]
+// GFX10: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x02,0xb9]
+// GFX11: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x82,0xb8]
+
+// Empty field lists are allowed.
+s_getreg_b32 s2, {}
+// SICI: s_getreg_b32 s2, hwreg(0) ; encoding: [0x00,0xf8,0x02,0xb9]
+// VI9: s_getreg_b32 s2, hwreg(0) ; encoding: [0x00,0xf8,0x82,0xb8]
+// GFX10: s_getreg_b32 s2, hwreg(0) ; encoding: [0x00,0xf8,0x02,0xb9]
+// GFX11: s_getreg_b32 s2, hwreg(0) ; encoding: [0x00,0xf8,0x82,0xb8]
+
// HW register code of unknown HW register, valid symbolic name range but no name available
s_getreg_b32 s2, hwreg(10)
// SICI: s_getreg_b32 s2, hwreg(10) ; encoding: [0x0a,0xf8,0x02,0xb9]
@@ -271,17 +298,17 @@ s_setreg_b32 hwreg(HW_REG_HW_ID), s2
// SICI: s_setreg_b32 hwreg(HW_REG_HW_ID), s2 ; encoding: [0x04,0xf8,0x82,0xb9]
// VI9: s_setreg_b32 hwreg(HW_REG_HW_ID), s2 ; encoding: [0x04,0xf8,0x02,0xb9]
// GFX10: s_setreg_b32 hwreg(HW_REG_HW_ID1), s2 ; encoding: [0x17,0xf8,0x82,0xb9]
-// NOGFX11: :[[@LINE-4]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// NOGFX11: :[[@LINE-4]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
s_setreg_b32 hwreg(HW_REG_HW_ID1), s2
-// NOSICIVI: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// NOGFX9: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// NOSICIVI: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// NOGFX9: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
// GFX10: s_setreg_b32 hwreg(HW_REG_HW_ID1), s2 ; encoding: [0x17,0xf8,0x82,0xb9]
// GFX11: s_setreg_b32 hwreg(HW_REG_HW_ID1), s2 ; encoding: [0x17,0xf8,0x02,0xb9]
s_setreg_b32 hwreg(HW_REG_HW_ID2), s2
-// NOSICIVI: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// NOGFX9: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// NOSICIVI: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// NOGFX9: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
// GFX10: s_setreg_b32 hwreg(HW_REG_HW_ID2), s2 ; encoding: [0x18,0xf8,0x82,0xb9]
// GFX11: s_setreg_b32 hwreg(HW_REG_HW_ID2), s2 ; encoding: [0x18,0xf8,0x02,0xb9]
@@ -427,12 +454,24 @@ s_getreg_b32 s2, hwreg(reg + 1, offset - 1, width + 1)
// GFX10: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9]
// GFX11: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8]
+s_getreg_b32 s2, {id: reg + 1, offset: offset - 1, size: width + 1}
+// SICI: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9]
+// VI9: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8]
+// GFX10: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9]
+// GFX11: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8]
+
s_getreg_b32 s2, hwreg(1 + reg, -1 + offset, 1 + width)
// SICI: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9]
// VI9: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8]
// GFX10: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9]
// GFX11: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8]
+s_getreg_b32 s2, {id: 1 + reg, offset: -1 + offset, size: 1 + width}
+// SICI: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9]
+// VI9: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8]
+// GFX10: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9]
+// GFX11: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8]
+
//===----------------------------------------------------------------------===//
// Instructions
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/MC/ARM/thumbv8m.s b/llvm/test/MC/ARM/thumbv8m.s
index 0e9ab4a9b3bf..f03dd03dae3a 100644
--- a/llvm/test/MC/ARM/thumbv8m.s
+++ b/llvm/test/MC/ARM/thumbv8m.s
@@ -184,13 +184,13 @@ ttat r0, r1
// 'Lazy Load/Store Multiple'
// UNDEF-BASELINE: error: instruction requires: armv8m.main
-// CHECK-MAINLINE: vlldm r5 @ encoding: [0x35,0xec,0x00,0x0a]
-// CHECK-MAINLINE_DSP: vlldm r5 @ encoding: [0x35,0xec,0x00,0x0a]
+// CHECK-MAINLINE: vlldm r5, {d0 - d15} @ encoding: [0x35,0xec,0x00,0x0a]
+// CHECK-MAINLINE_DSP: vlldm r5, {d0 - d15} @ encoding: [0x35,0xec,0x00,0x0a]
vlldm r5
// UNDEF-BASELINE: error: instruction requires: armv8m.main
-// CHECK-MAINLINE: vlstm r10 @ encoding: [0x2a,0xec,0x00,0x0a]
-// CHECK-MAINLINE_DSP: vlstm r10 @ encoding: [0x2a,0xec,0x00,0x0a]
+// CHECK-MAINLINE: vlstm r10, {d0 - d15} @ encoding: [0x2a,0xec,0x00,0x0a]
+// CHECK-MAINLINE_DSP: vlstm r10, {d0 - d15} @ encoding: [0x2a,0xec,0x00,0x0a]
vlstm r10
// New SYSm's
diff --git a/llvm/test/MC/ARM/vlstm-vlldm-8.1m.s b/llvm/test/MC/ARM/vlstm-vlldm-8.1m.s
new file mode 100644
index 000000000000..4e35883ffe43
--- /dev/null
+++ b/llvm/test/MC/ARM/vlstm-vlldm-8.1m.s
@@ -0,0 +1,11 @@
+// RUN: llvm-mc -triple=armv8.1m.main-arm-none-eabi -mcpu=generic -show-encoding %s \
+// RUN: | FileCheck --check-prefixes=CHECK %s
+
+// RUN: llvm-mc -triple=thumbv8.1m.main-none-eabi -mcpu=generic -show-encoding %s \
+// RUN: | FileCheck --check-prefixes=CHECK %s
+
+vlstm r8, {d0 - d31}
+// CHECK: vlstm r8, {d0 - d31} @ encoding: [0x28,0xec,0x80,0x0a]
+
+vlldm r8, {d0 - d31}
+// CHECK: vlldm r8, {d0 - d31} @ encoding: [0x38,0xec,0x80,0x0a]
diff --git a/llvm/test/MC/ARM/vlstm-vlldm-8m.s b/llvm/test/MC/ARM/vlstm-vlldm-8m.s
new file mode 100644
index 000000000000..bbc95318aeb3
--- /dev/null
+++ b/llvm/test/MC/ARM/vlstm-vlldm-8m.s
@@ -0,0 +1,17 @@
+// RUN: llvm-mc -triple=armv8m.main-arm-none-eabi -mcpu=generic -show-encoding %s \
+// RUN: | FileCheck --check-prefixes=CHECK %s
+
+// RUN: llvm-mc -triple=thumbv8m.main-none-eabi -mcpu=generic -show-encoding %s \
+// RUN: | FileCheck --check-prefixes=CHECK %s
+
+vlstm r8, {d0 - d15}
+// CHECK: vlstm r8, {d0 - d15} @ encoding: [0x28,0xec,0x00,0x0a]
+
+vlldm r8, {d0 - d15}
+// CHECK: vlldm r8, {d0 - d15} @ encoding: [0x38,0xec,0x00,0x0a]
+
+vlstm r8
+// CHECK: vlstm r8, {d0 - d15} @ encoding: [0x28,0xec,0x00,0x0a]
+
+vlldm r8
+// CHECK: vlldm r8, {d0 - d15} @ encoding: [0x38,0xec,0x00,0x0a]
diff --git a/llvm/test/MC/ARM/vlstm-vlldm-diag.s b/llvm/test/MC/ARM/vlstm-vlldm-diag.s
new file mode 100644
index 000000000000..b57f535c6a25
--- /dev/null
+++ b/llvm/test/MC/ARM/vlstm-vlldm-diag.s
@@ -0,0 +1,61 @@
+// RUN: not llvm-mc -triple=armv8.1m.main-arm-none-eabi -mcpu=generic -show-encoding %s 2>&1 >/dev/null \
+// RUN: | FileCheck --check-prefixes=ERR %s
+
+// RUN: not llvm-mc -triple=armv8.1m.main-arm-none-eabi -mcpu=generic -show-encoding %s 2>&1 >/dev/null \
+// RUN: | FileCheck --check-prefixes=ERRT2 %s
+
+vlstm r8, {d0 - d11}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlstm r8, {d0 - d11}
+
+vlldm r8, {d0 - d11}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlldm r8, {d0 - d11}
+
+vlstm r8, {d3 - d15}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlstm r8, {d3 - d15}
+
+vlldm r8, {d3 - d15}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlldm r8, {d3 - d15}
+
+vlstm r8, {d0 - d29}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlstm r8, {d0 - d29}
+
+vlldm r8, {d0 - d29}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlldm r8, {d0 - d29}
+
+vlstm r8, {d3 - d31}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlstm r8, {d3 - d31}
+
+vlldm r8, {d3 - d31}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlldm r8, {d3 - d31}
+
+vlstm r8, {d0 - d35}
+// ERR: error: register expected
+// ERR-NEXT: vlstm r8, {d0 - d35}
+
+vlldm r8, {d0 - d35}
+// ERR: error: register expected
+// ERR-NEXT: vlldm r8, {d0 - d35}
+
+vlstm pc
+// ERR: error: operand must be a register in range [r0, r14]
+// ERR-NEXT: vlstm pc
+
+vlldm pc
+// ERR: error: operand must be a register in range [r0, r14]
+// ERR-NEXT: vlldm pc
+
+vlstm pc
+// ERRT2: error: operand must be a register in range [r0, r14]
+// ERRT2-NEXT: vlstm pc
+
+vlldm pc
+// ERRT2: error: operand must be a register in range [r0, r14]
+// ERRT2-NEXT: vlldm pc \ No newline at end of file
diff --git a/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.1.main.txt b/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.1.main.txt
new file mode 100644
index 000000000000..6b9882454c06
--- /dev/null
+++ b/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.1.main.txt
@@ -0,0 +1,11 @@
+// RUN: llvm-mc -triple=armv8.1m.main-arm-none-eabi -mcpu=generic -show-encoding -disassemble %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-DISS
+
+// RUN: llvm-mc -triple=thumbv8.1m.main-none-eabi -mcpu=generic -show-encoding -disassemble %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-DISS
+
+[0x28,0xec,0x80,0x0a]
+// CHECK-DISS: vlstm r8, {d0 - d31} @ encoding: [0x28,0xec,0x80,0x0a]
+
+[0x38,0xec,0x80,0x0a]
+// CHECK-DISS: vlldm r8, {d0 - d31} @ encoding: [0x38,0xec,0x80,0x0a] \ No newline at end of file
diff --git a/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.main.txt b/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.main.txt
new file mode 100644
index 000000000000..1e28d5284c5b
--- /dev/null
+++ b/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.main.txt
@@ -0,0 +1,17 @@
+// RUN: llvm-mc -triple=armv8m.main-arm-none-eabi -mcpu=generic -show-encoding -disassemble %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-DISS
+
+// RUN: llvm-mc -triple=thumbv8m.main-none-eabi -mcpu=generic -show-encoding -disassemble %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-DISS
+
+[0x28,0xec,0x00,0x0a]
+// CHECK-DISS: vlstm r8, {d0 - d15} @ encoding: [0x28,0xec,0x00,0x0a]
+
+[0x38,0xec,0x00,0x0a]
+// CHECK-DISS: vlldm r8, {d0 - d15} @ encoding: [0x38,0xec,0x00,0x0a]
+
+[0x28,0xec,0x00,0x0a]
+// CHECK-DISS: vlstm r8, {d0 - d15} @ encoding: [0x28,0xec,0x00,0x0a]
+
+[0x38,0xec,0x00,0x0a]
+// CHECK-DISS: vlldm r8, {d0 - d15} @ encoding: [0x38,0xec,0x00,0x0a] \ No newline at end of file
diff --git a/llvm/test/MC/Mips/cpsetup.s b/llvm/test/MC/Mips/cpsetup.s
index 8e587aea3e7e..4a027c6e796a 100644
--- a/llvm/test/MC/Mips/cpsetup.s
+++ b/llvm/test/MC/Mips/cpsetup.s
@@ -4,8 +4,6 @@
# RUN: llvm-mc -triple mips-unknown-linux -target-abi o32 %s | \
# RUN: FileCheck -check-prefixes=ASM,ASM-O32 %s
-# FIXME: Now we check .cpsetup expansion for `-mno-shared` case only.
-# We also need to implement/check the `-mshared` case.
# RUN: llvm-mc -triple mips64-unknown-linux -target-abi n32 -filetype=obj -o - %s | \
# RUN: llvm-objdump --no-print-imm-hex -d -r -z - | \
# RUN: FileCheck -check-prefixes=ALL,NXX,N32 %s
@@ -35,11 +33,16 @@ t1:
# NXX-NEXT: sd $gp, 8($sp)
# NXX-NEXT: lui $gp, 0
-# N32-NEXT: R_MIPS_HI16 __gnu_local_gp
# N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 __cerror
+# N32-NEXT: R_MIPS_GPREL16 __cerror
+# N32-NEXT: R_MIPS_SUB
+# N32-NEXT: R_MIPS_HI16
# NXX-NEXT: addiu $gp, $gp, 0
-# N32-NEXT: R_MIPS_LO16 __gnu_local_gp
# N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 __cerror
+# N32-NEXT: R_MIPS_GPREL16 __cerror
+# N32-NEXT: R_MIPS_SUB
+# N32-NEXT: R_MIPS_LO16
+# N32-NEXT: addu $gp, $gp, $25
# N64-NEXT: daddu $gp, $gp, $25
# ASM-NEXT: .cpsetup $25, 8, __cerror
@@ -64,11 +67,16 @@ t2:
# NXX-NEXT: move $2, $gp
# NXX-NEXT: lui $gp, 0
-# N32-NEXT: R_MIPS_HI16 __gnu_local_gp
# N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 __cerror
+# N32-NEXT: R_MIPS_GPREL16 __cerror
+# N32-NEXT: R_MIPS_SUB
+# N32-NEXT: R_MIPS_HI16
# NXX-NEXT: addiu $gp, $gp, 0
-# N32-NEXT: R_MIPS_LO16 __gnu_local_gp
# N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 __cerror
+# N32-NEXT: R_MIPS_GPREL16 __cerror
+# N32-NEXT: R_MIPS_SUB
+# N32-NEXT: R_MIPS_LO16
+# N32-NEXT: addu $gp, $gp, $25
# N64-NEXT: daddu $gp, $gp, $25
# ASM-NEXT: .cpsetup $25, $2, __cerror
@@ -101,11 +109,16 @@ t3:
# NXX-NEXT: move $2, $gp
# NXX-NEXT: lui $gp, 0
-# N32-NEXT: {{^ *0+}}38: R_MIPS_HI16 __gnu_local_gp
# N64-NEXT: {{^ *0+}}40: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 .text
+# N32-NEXT: {{^ *0+}}40: R_MIPS_GPREL16 .text
+# N32-NEXT: R_MIPS_SUB
+# N32-NEXT: R_MIPS_HI16
# NXX-NEXT: addiu $gp, $gp, 0
-# N32-NEXT: {{^ *0+}}3c: R_MIPS_LO16 __gnu_local_gp
# N64-NEXT: {{^ *0+}}44: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 .text
+# N32-NEXT: {{^ *0+}}44: R_MIPS_GPREL16 .text
+# N32-NEXT: R_MIPS_SUB
+# N32-NEXT: R_MIPS_LO16
+# N32-NEXT: addu $gp, $gp, $25
# N64-NEXT: daddu $gp, $gp, $25
# NXX-NEXT: nop
# NXX-NEXT: sub $3, $3, $2
@@ -158,11 +171,16 @@ t5:
# NXX-NEXT: sd $gp, 8($sp)
# NXX-NEXT: lui $gp, 0
-# N32-NEXT: R_MIPS_HI16 __gnu_local_gp
# N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 __cerror
+# N32-NEXT: R_MIPS_GPREL16 __cerror
+# N32-NEXT: R_MIPS_SUB
+# N32-NEXT: R_MIPS_HI16
# NXX-NEXT: addiu $gp, $gp, 0
-# N32-NEXT: R_MIPS_LO16 __gnu_local_gp
# N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 __cerror
+# N32-NEXT: R_MIPS_GPREL16 __cerror
+# N32-NEXT: R_MIPS_SUB
+# N32-NEXT: R_MIPS_LO16
+# N32-NEXT: addu $gp, $gp, $25
# N64-NEXT: daddu $gp, $gp, $25
# ASM-NEXT: .cpsetup $25, 8, __cerror
@@ -184,11 +202,16 @@ IMM_8 = 8
# NXX-NEXT: sd $gp, 8($sp)
# NXX-NEXT: lui $gp, 0
-# N32-NEXT: R_MIPS_HI16 __gnu_local_gp
# N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 __cerror
+# N32-NEXT: R_MIPS_GPREL16 __cerror
+# N32-NEXT: R_MIPS_SUB
+# N32-NEXT: R_MIPS_HI16
# NXX-NEXT: addiu $gp, $gp, 0
-# N32-NEXT: R_MIPS_LO16 __gnu_local_gp
# N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 __cerror
+# N32-NEXT: R_MIPS_GPREL16 __cerror
+# N32-NEXT: R_MIPS_SUB
+# N32-NEXT: R_MIPS_LO16
+# N32-NEXT: addu $gp, $gp, $25
# N64-NEXT: daddu $gp, $gp, $25
# ASM-NEXT: .cpsetup $25, 8, __cerror
diff --git a/llvm/test/TableGen/HwModeEncodeDecode3.td b/llvm/test/TableGen/HwModeEncodeDecode3.td
new file mode 100644
index 000000000000..406e52d25be7
--- /dev/null
+++ b/llvm/test/TableGen/HwModeEncodeDecode3.td
@@ -0,0 +1,168 @@
+// RUN: llvm-tblgen -gen-emitter -I %p/../../include %s | \
+// RUN: FileCheck %s --check-prefix=ENCODER
+// RUN: llvm-tblgen -gen-disassembler -I %p/../../include %s | \
+// RUN: FileCheck %s --check-prefix=DECODER
+// RUN: llvm-tblgen -gen-disassembler --suppress-per-hwmode-duplicates -I \
+// RUN: %p/../../include %s | FileCheck %s --check-prefix=DECODER-SUPPRESS
+
+include "llvm/Target/Target.td"
+
+def archInstrInfo : InstrInfo { }
+
+def arch : Target {
+ let InstructionSet = archInstrInfo;
+}
+
+def Myi32 : Operand<i32> {
+ let DecoderMethod = "DecodeMyi32";
+}
+
+def HasA : Predicate<"Subtarget->hasA()">;
+def HasB : Predicate<"Subtarget->hasB()">;
+
+def ModeA : HwMode<"+a", [HasA]>;
+def ModeB : HwMode<"+b", [HasB]>;
+
+
+def fooTypeEncDefault : InstructionEncoding {
+ let Size = 8;
+ field bits<64> SoftFail = 0;
+ bits<64> Inst;
+ bits<8> factor;
+ let Inst{7...0} = factor;
+ let Inst{3...2} = 0b10;
+ let Inst{1...0} = 0b00;
+}
+
+def fooTypeEncA : InstructionEncoding {
+ let Size = 4;
+ field bits<32> SoftFail = 0;
+ bits<32> Inst;
+ bits<8> factor;
+ let Inst{7...0} = factor;
+ let Inst{3...2} = 0b11;
+ let Inst{1...0} = 0b00;
+}
+
+def fooTypeEncB : InstructionEncoding {
+ let Size = 4;
+ field bits<32> SoftFail = 0;
+ bits<32> Inst;
+ bits<8> factor;
+ let Inst{15...8} = factor;
+ let Inst{1...0} = 0b11;
+}
+
+// Test for DefaultMode as a selector.
+def foo : Instruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins i32imm:$factor);
+ let EncodingInfos = EncodingByHwMode<
+ [ModeA, ModeB, DefaultMode], [fooTypeEncA, fooTypeEncB, fooTypeEncDefault]
+ >;
+ let AsmString = "foo $factor";
+}
+
+def bar: Instruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins i32imm:$factor);
+ let Size = 4;
+ bits<32> Inst;
+ bits<32> SoftFail;
+ bits<8> factor;
+ let Inst{31...24} = factor;
+ let Inst{1...0} = 0b10;
+ let AsmString = "bar $factor";
+}
+
+def baz : Instruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins i32imm:$factor);
+ bits<32> Inst;
+ let EncodingInfos = EncodingByHwMode<
+ [ModeB], [fooTypeEncA]
+ >;
+ let AsmString = "foo $factor";
+}
+
+def unrelated: Instruction {
+ let OutOperandList = (outs);
+ let DecoderNamespace = "Alt";
+ let InOperandList = (ins i32imm:$factor);
+ let Size = 4;
+ bits<32> Inst;
+ bits<32> SoftFail;
+ bits<8> factor;
+ let Inst{31...24} = factor;
+ let Inst{1...0} = 0b10;
+ let AsmString = "unrelated $factor";
+}
+
+
+// DECODER-LABEL: DecoderTableAlt_DefaultMode32[] =
+// DECODER-DAG: Opcode: unrelated
+// DECODER-LABEL: DecoderTableAlt_ModeA32[] =
+// DECODER-DAG: Opcode: unrelated
+// DECODER-LABEL: DecoderTableAlt_ModeB32[] =
+// DECODER-DAG: Opcode: unrelated
+// DECODER-LABEL: DecoderTable_DefaultMode32[] =
+// DECODER-DAG: Opcode: bar
+// DECODER-LABEL: DecoderTable_DefaultMode64[] =
+// DECODER-DAG: Opcode: fooTypeEncDefault:foo
+// DECODER-LABEL: DecoderTable_ModeA32[] =
+// DECODER-DAG: Opcode: fooTypeEncA:foo
+// DECODER-DAG: Opcode: bar
+// DECODER-LABEL: DecoderTable_ModeB32[] =
+// DECODER-DAG: Opcode: fooTypeEncB:foo
+// DECODER-DAG: Opcode: fooTypeEncA:baz
+// DECODER-DAG: Opcode: bar
+
+
+// DECODER-SUPPRESS-LABEL: DecoderTableAlt_AllModes32[] =
+// DECODER-SUPPRESS-DAG: Opcode: unrelated
+// DECODER-SUPPRESS-LABEL: DecoderTable_AllModes32[] =
+// DECODER-SUPPRESS-DAG: Opcode: bar
+// DECODER-SUPPRESS-LABEL: DecoderTable_DefaultMode64[] =
+// DECODER-SUPPRESS-NOT: Opcode: bar
+// DECODER-SUPPRESS-DAG: Opcode: fooTypeEncDefault:foo
+// DECODER-SUPPRESS-LABEL: DecoderTable_ModeA32[] =
+// DECODER-SUPPRESS-DAG: Opcode: fooTypeEncA:foo
+// DECODER-SUPPRESS-NOT: Opcode: bar
+// DECODER-SUPPRESS-LABEL: DecoderTable_ModeB32[] =
+// DECODER-SUPPRESS-DAG: Opcode: fooTypeEncB:foo
+// DECODER-SUPPRESS-DAG: Opcode: fooTypeEncA:baz
+// DECODER-SUPPRESS-NOT: Opcode: bar
+
+// ENCODER-LABEL: static const uint64_t InstBits_DefaultMode[] = {
+// ENCODER: UINT64_C(2), // bar
+// ENCODER: UINT64_C(0), // baz
+// ENCODER: UINT64_C(8), // foo
+// ENCODER: UINT64_C(2), // unrelated
+
+// ENCODER-LABEL: static const uint64_t InstBits_ModeA[] = {
+// ENCODER: UINT64_C(2), // bar
+// ENCODER: UINT64_C(0), // baz
+// ENCODER: UINT64_C(12), // foo
+// ENCODER: UINT64_C(2), // unrelated
+
+// ENCODER-LABEL: static const uint64_t InstBits_ModeB[] = {
+// ENCODER: UINT64_C(2), // bar
+// ENCODER: UINT64_C(12), // baz
+// ENCODER: UINT64_C(3), // foo
+// ENCODER: UINT64_C(2), // unrelated
+
+// ENCODER: unsigned HwMode = STI.getHwMode();
+// ENCODER: switch (HwMode) {
+// ENCODER: default: llvm_unreachable("Unknown hardware mode!"); break;
+// ENCODER: case 0: InstBits = InstBits_DefaultMode; break;
+// ENCODER: case 1: InstBits = InstBits_ModeA; break;
+// ENCODER: case 2: InstBits = InstBits_ModeB; break;
+// ENCODER: };
+
+// ENCODER: case ::foo: {
+// ENCODER: switch (HwMode) {
+// ENCODER: default: llvm_unreachable("Unhandled HwMode");
+// ENCODER: case 0: {
+// ENCODER: case 1: {
+// ENCODER: case 2: {
+
diff --git a/llvm/test/TableGen/ReadAdvanceInvalidWrite.td b/llvm/test/TableGen/ReadAdvanceInvalidWrite.td
new file mode 100644
index 000000000000..d185cbd56f8e
--- /dev/null
+++ b/llvm/test/TableGen/ReadAdvanceInvalidWrite.td
@@ -0,0 +1,29 @@
+// RUN: not llvm-tblgen -gen-subtarget -I %p/../../include %s 2>&1 | FileCheck %s
+
+// Make sure we don't form ReadAdvances with ValidWrites entries that are not
+// associated with any instructions.
+
+include "llvm/Target/Target.td"
+
+def TargetX : Target;
+
+def WriteX : SchedWrite;
+def WriteY : SchedWrite;
+def ReadX : SchedRead;
+
+def InstX : Instruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins);
+ let SchedRW = [WriteX, ReadX];
+}
+
+def SchedModelX: SchedMachineModel {
+ let CompleteModel = 0;
+}
+
+let SchedModel = SchedModelX in {
+ def : ReadAdvance<ReadX, 1, [WriteX, WriteY]>;
+ // CHECK: error: ReadAdvance referencing a ValidWrite that is not used by any instruction (WriteY)
+}
+
+def ProcessorX: ProcessorModel<"ProcessorX", SchedModelX, []>;
diff --git a/llvm/test/Transforms/AggressiveInstCombine/vector-or-load.ll b/llvm/test/Transforms/AggressiveInstCombine/vector-or-load.ll
index f7d48ce1e099..170fde4ef700 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/vector-or-load.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/vector-or-load.ll
@@ -35,7 +35,7 @@ define <vscale x 8 x i16> @or-load-scalable-vector(ptr %p1) {
; CHECK-NEXT: [[L2:%.*]] = load <vscale x 8 x i8>, ptr [[P2]], align 1
; CHECK-NEXT: [[E1:%.*]] = zext <vscale x 8 x i8> [[L1]] to <vscale x 8 x i16>
; CHECK-NEXT: [[E2:%.*]] = zext <vscale x 8 x i8> [[L2]] to <vscale x 8 x i16>
-; CHECK-NEXT: [[S2:%.*]] = shl <vscale x 8 x i16> [[E2]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 8, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: [[S2:%.*]] = shl <vscale x 8 x i16> [[E2]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 8, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
; CHECK-NEXT: [[OR:%.*]] = or <vscale x 8 x i16> [[E1]], [[S2]]
; CHECK-NEXT: ret <vscale x 8 x i16> [[OR]]
;
@@ -44,7 +44,7 @@ define <vscale x 8 x i16> @or-load-scalable-vector(ptr %p1) {
%l2 = load <vscale x 8 x i8>, ptr %p2, align 1
%e1 = zext <vscale x 8 x i8> %l1 to <vscale x 8 x i16>
%e2 = zext <vscale x 8 x i8> %l2 to <vscale x 8 x i16>
- %s2 = shl <vscale x 8 x i16> %e2, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 8, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+ %s2 = shl <vscale x 8 x i16> %e2, splat (i16 8)
%or = or <vscale x 8 x i16> %e1, %s2
ret <vscale x 8 x i16> %or
}
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll
index 83226e56b5ac..e0777f9ecee8 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll
@@ -30,8 +30,7 @@ define internal void @bar(%p_t %p) {
; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
; CGSCC-LABEL: define {{[^@]+}}@bar
; CGSCC-SAME: (ptr nocapture nofree readnone [[P:%.*]]) #[[ATTR0]] {
-; CGSCC-NEXT: call void @llvm.dbg.value(metadata ptr [[P]], metadata [[META3:![0-9]+]], metadata !DIExpression())
-; CGSCC-SAME: !dbg [[DBG5:![0-9]+]]
+; CGSCC-NEXT: tail call void @llvm.dbg.value(metadata ptr [[P]], metadata [[META3:![0-9]+]], metadata !DIExpression()), !dbg [[DBG5:![0-9]+]]
; CGSCC-NEXT: ret void
;
call void @llvm.dbg.value(metadata %p_t %p, metadata !4, metadata !5), !dbg !6
@@ -52,21 +51,20 @@ declare void @llvm.dbg.value(metadata, metadata, metadata)
!6 = !DILocation(line: 1, column: 1, scope: !3)
;.
; TUNIT: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
-; TUNIT: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;.
; CGSCC: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
; CGSCC: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;.
-; TUNIT: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug)
-; TUNIT: [[META1:![0-9]+]] = !DIFile(filename: "test.c", directory: "")
+; TUNIT: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug)
+; TUNIT: [[META1]] = !DIFile(filename: "test.c", directory: "")
; TUNIT: [[META2:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
;.
-; CGSCC: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug)
-; CGSCC: [[META1:![0-9]+]] = !DIFile(filename: "test.c", directory: "")
+; CGSCC: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug)
+; CGSCC: [[META1]] = !DIFile(filename: "test.c", directory: "")
; CGSCC: [[META2:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
-; CGSCC: [[META3]] = !DILocalVariable(name: "p", scope: !4)
-; CGSCC: [[META4:![0-9]+]] = distinct !DISubprogram(name: "bar", scope: null, spFlags: DISPFlagDefinition, unit: !0)
-; CGSCC: [[DBG5]] = !DILocation(line: 1, column: 1, scope: !4)
+; CGSCC: [[META3]] = !DILocalVariable(name: "p", scope: [[META4:![0-9]+]])
+; CGSCC: [[META4]] = distinct !DISubprogram(name: "bar", scope: null, spFlags: DISPFlagDefinition, unit: [[META0]])
+; CGSCC: [[DBG5]] = !DILocation(line: 1, column: 1, scope: [[META4]])
;.
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; TUNIT: {{.*}}
diff --git a/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll b/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll
index e3f2a54f321e..279238bea184 100644
--- a/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll
+++ b/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll
@@ -93,9 +93,8 @@ define void @some_checks_in_loops_removable(ptr %ptr, ptr %lower, ptr %upper, i8
; CHECK: loop.body:
; CHECK-NEXT: [[IV_1:%.*]] = add nuw nsw i16 [[IV]], 1
; CHECK-NEXT: [[PTR_IV_1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i16 [[IV_1]]
-; CHECK-NEXT: [[CMP_PTR_IV_1_LOWER:%.*]] = icmp ugt ptr [[LOWER]], [[PTR_IV_1]]
; CHECK-NEXT: [[CMP_PTR_IV_1_UPPER:%.*]] = icmp ule ptr [[UPPER]], [[PTR_IV_1]]
-; CHECK-NEXT: [[OR_1:%.*]] = or i1 [[CMP_PTR_IV_1_LOWER]], [[CMP_PTR_IV_1_UPPER]]
+; CHECK-NEXT: [[OR_1:%.*]] = or i1 false, [[CMP_PTR_IV_1_UPPER]]
; CHECK-NEXT: br i1 [[OR]], label [[TRAP]], label [[LOOP_LATCH]]
; CHECK: loop.latch:
; CHECK-NEXT: store i8 0, ptr [[PTR_IV]], align 4
@@ -171,9 +170,8 @@ define void @no_checks_in_loops_removable(ptr %ptr, ptr %lower, ptr %upper, i8 %
; CHECK: loop.body:
; CHECK-NEXT: [[IV_1:%.*]] = add nuw nsw i16 [[IV]], 1
; CHECK-NEXT: [[PTR_IV_1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i16 [[IV_1]]
-; CHECK-NEXT: [[CMP_PTR_IV_1_LOWER:%.*]] = icmp ugt ptr [[LOWER]], [[PTR_IV_1]]
; CHECK-NEXT: [[CMP_PTR_IV_1_UPPER:%.*]] = icmp ule ptr [[UPPER]], [[PTR_IV_1]]
-; CHECK-NEXT: [[OR_1:%.*]] = or i1 [[CMP_PTR_IV_1_LOWER]], [[CMP_PTR_IV_1_UPPER]]
+; CHECK-NEXT: [[OR_1:%.*]] = or i1 false, [[CMP_PTR_IV_1_UPPER]]
; CHECK-NEXT: br i1 [[OR]], label [[TRAP]], label [[LOOP_LATCH]]
; CHECK: loop.latch:
; CHECK-NEXT: store i8 0, ptr [[PTR_IV]], align 4
diff --git a/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll b/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll
index 66ce1ffc6ebc..1842ca2d8254 100644
--- a/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll
+++ b/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll
@@ -27,18 +27,16 @@ define void @test1(ptr %src, ptr noundef %lower, ptr noundef %upper, i8 %N) {
; CHECK-NEXT: store i32 0, ptr [[PTR_SRC_IV]], align 4
; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i8 [[IV]], 1
; CHECK-NEXT: [[SRC_IV_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[ADD_1]]
-; CHECK-NEXT: [[CMP_IV_1_START:%.*]] = icmp ult ptr [[SRC_IV_1]], [[LOWER]]
; CHECK-NEXT: [[CMP_IV_1_END:%.*]] = icmp uge ptr [[SRC_IV_1]], [[UPPER]]
-; CHECK-NEXT: [[OR_2:%.*]] = or i1 [[CMP_IV_1_START]], [[CMP_IV_1_END]]
+; CHECK-NEXT: [[OR_2:%.*]] = or i1 false, [[CMP_IV_1_END]]
; CHECK-NEXT: br i1 [[OR_2]], label [[TRAP_BB]], label [[LOOP_BODY_2:%.*]]
; CHECK: loop.body.2:
; CHECK-NEXT: [[PTR_SRC_IV_1:%.*]] = bitcast ptr [[SRC_IV_1]] to ptr
; CHECK-NEXT: store i32 0, ptr [[PTR_SRC_IV_1]], align 4
; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i8 [[IV]], 2
; CHECK-NEXT: [[SRC_IV_2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[ADD_2]]
-; CHECK-NEXT: [[CMP_IV_2_START:%.*]] = icmp ult ptr [[SRC_IV_2]], [[LOWER]]
; CHECK-NEXT: [[CMP_IV_2_END:%.*]] = icmp uge ptr [[SRC_IV_2]], [[UPPER]]
-; CHECK-NEXT: [[OR_3:%.*]] = or i1 [[CMP_IV_2_START]], [[CMP_IV_2_END]]
+; CHECK-NEXT: [[OR_3:%.*]] = or i1 false, [[CMP_IV_2_END]]
; CHECK-NEXT: br i1 [[OR_3]], label [[TRAP_BB]], label [[LOOP_LATCH]]
; CHECK: loop.latch:
; CHECK-NEXT: [[PTR_SRC_IV_2:%.*]] = bitcast ptr [[SRC_IV_2]] to ptr
@@ -125,16 +123,14 @@ define void @test2(ptr %src, ptr %lower, ptr %upper, i8 %N) {
; CHECK: loop.body.1:
; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i8 [[IV]], 1
; CHECK-NEXT: [[SRC_IV_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[ADD_1]]
-; CHECK-NEXT: [[CMP_IV_1_START:%.*]] = icmp ult ptr [[SRC_IV_1]], [[LOWER]]
; CHECK-NEXT: [[CMP_IV_1_END:%.*]] = icmp uge ptr [[SRC_IV_1]], [[UPPER]]
-; CHECK-NEXT: [[OR_2:%.*]] = or i1 [[CMP_IV_1_START]], [[CMP_IV_1_END]]
+; CHECK-NEXT: [[OR_2:%.*]] = or i1 false, [[CMP_IV_1_END]]
; CHECK-NEXT: br i1 [[OR_2]], label [[TRAP_BB]], label [[LOOP_BODY_2:%.*]]
; CHECK: loop.body.2:
; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i8 [[IV]], 2
; CHECK-NEXT: [[SRC_IV_2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[ADD_2]]
-; CHECK-NEXT: [[CMP_IV_2_START:%.*]] = icmp ult ptr [[SRC_IV_2]], [[LOWER]]
; CHECK-NEXT: [[CMP_IV_2_END:%.*]] = icmp uge ptr [[SRC_IV_2]], [[UPPER]]
-; CHECK-NEXT: [[OR_3:%.*]] = or i1 [[CMP_IV_2_START]], [[CMP_IV_2_END]]
+; CHECK-NEXT: [[OR_3:%.*]] = or i1 false, [[CMP_IV_2_END]]
; CHECK-NEXT: br i1 [[OR_3]], label [[TRAP_BB]], label [[LOOP_LATCH]]
; CHECK: loop.latch:
; CHECK-NEXT: [[PTR:%.*]] = bitcast ptr [[SRC_IV]] to ptr
@@ -221,16 +217,14 @@ define void @test2_with_ne(ptr %src, ptr %lower, ptr %upper, i8 %N) {
; CHECK: loop.body.1:
; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i8 [[IV]], 1
; CHECK-NEXT: [[SRC_IV_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[ADD_1]]
-; CHECK-NEXT: [[CMP_IV_1_START:%.*]] = icmp ult ptr [[SRC_IV_1]], [[LOWER]]
; CHECK-NEXT: [[CMP_IV_1_END:%.*]] = icmp uge ptr [[SRC_IV_1]], [[UPPER]]
-; CHECK-NEXT: [[OR_2:%.*]] = or i1 [[CMP_IV_1_START]], [[CMP_IV_1_END]]
+; CHECK-NEXT: [[OR_2:%.*]] = or i1 false, [[CMP_IV_1_END]]
; CHECK-NEXT: br i1 [[OR_2]], label [[TRAP_BB]], label [[LOOP_BODY_2:%.*]]
; CHECK: loop.body.2:
; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i8 [[IV]], 2
; CHECK-NEXT: [[SRC_IV_2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[ADD_2]]
-; CHECK-NEXT: [[CMP_IV_2_START:%.*]] = icmp ult ptr [[SRC_IV_2]], [[LOWER]]
; CHECK-NEXT: [[CMP_IV_2_END:%.*]] = icmp uge ptr [[SRC_IV_2]], [[UPPER]]
-; CHECK-NEXT: [[OR_3:%.*]] = or i1 [[CMP_IV_2_START]], [[CMP_IV_2_END]]
+; CHECK-NEXT: [[OR_3:%.*]] = or i1 false, [[CMP_IV_2_END]]
; CHECK-NEXT: br i1 [[OR_3]], label [[TRAP_BB]], label [[LOOP_LATCH]]
; CHECK: loop.latch:
; CHECK-NEXT: [[PTR:%.*]] = bitcast ptr [[SRC_IV]] to ptr
@@ -316,16 +310,14 @@ define void @test3(ptr %src, ptr %lower, ptr %upper, i8 %N) {
; CHECK-NEXT: br i1 [[OR_1]], label [[TRAP_BB]], label [[LOOP_BODY_1:%.*]]
; CHECK: loop.body.1:
; CHECK-NEXT: [[SRC_IV_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[NEXT]]
-; CHECK-NEXT: [[CMP_IV_1_START:%.*]] = icmp ult ptr [[SRC_IV_1]], [[LOWER]]
; CHECK-NEXT: [[CMP_IV_1_END:%.*]] = icmp uge ptr [[SRC_IV_1]], [[UPPER]]
-; CHECK-NEXT: [[OR_2:%.*]] = or i1 [[CMP_IV_1_START]], [[CMP_IV_1_END]]
+; CHECK-NEXT: [[OR_2:%.*]] = or i1 false, [[CMP_IV_1_END]]
; CHECK-NEXT: br i1 [[OR_2]], label [[TRAP_BB]], label [[LOOP_BODY_2:%.*]]
; CHECK: loop.body.2:
; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i8 [[IV]], 2
; CHECK-NEXT: [[SRC_IV_2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[ADD_2]]
-; CHECK-NEXT: [[CMP_IV_2_START:%.*]] = icmp ult ptr [[SRC_IV_2]], [[LOWER]]
; CHECK-NEXT: [[CMP_IV_2_END:%.*]] = icmp uge ptr [[SRC_IV_2]], [[UPPER]]
-; CHECK-NEXT: [[OR_3:%.*]] = or i1 [[CMP_IV_2_START]], [[CMP_IV_2_END]]
+; CHECK-NEXT: [[OR_3:%.*]] = or i1 false, [[CMP_IV_2_END]]
; CHECK-NEXT: br i1 [[OR_3]], label [[TRAP_BB]], label [[LOOP_LATCH]]
; CHECK: loop.latch:
; CHECK-NEXT: [[PTR:%.*]] = bitcast ptr [[SRC_IV]] to ptr
diff --git a/llvm/test/Transforms/ConstraintElimination/sext.ll b/llvm/test/Transforms/ConstraintElimination/sext-signed-predicates.ll
index 5a8a37d0d570..34f023e4ba5f 100644
--- a/llvm/test/Transforms/ConstraintElimination/sext.ll
+++ b/llvm/test/Transforms/ConstraintElimination/sext-signed-predicates.ll
@@ -127,36 +127,6 @@ else:
; Negative tests
-define i1 @cmp_sext_unsigned(i32 %a, i32 %b){
-; CHECK-LABEL: define i1 @cmp_sext_unsigned(
-; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]]
-; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
-; CHECK: then:
-; CHECK-NEXT: [[SA:%.*]] = sext i32 [[A]] to i64
-; CHECK-NEXT: [[SB:%.*]] = sext i32 [[B]] to i64
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[SA]], 1
-; CHECK-NEXT: [[CMP2:%.*]] = icmp uge i64 [[SB]], [[ADD]]
-; CHECK-NEXT: ret i1 [[CMP2]]
-; CHECK: else:
-; CHECK-NEXT: ret i1 false
-;
-entry:
- %cmp = icmp slt i32 %a, %b
- br i1 %cmp, label %then, label %else
-
-then:
- %sa = sext i32 %a to i64
- %sb = sext i32 %b to i64
- %add = add nsw i64 %sa, 1
- %cmp2 = icmp uge i64 %sb, %add
- ret i1 %cmp2
-
-else:
- ret i1 false
-}
-
define i1 @cmp_zext(i32 %a, i32 %b){
; CHECK-LABEL: define i1 @cmp_zext(
; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
@@ -246,3 +216,55 @@ then:
else:
ret i1 false
}
+
+declare void @use(i1)
+
+define void @sge_sext(i16 %x, i32 %y) {
+; CHECK-LABEL: define void @sge_sext(
+; CHECK-SAME: i16 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[X_EXT:%.*]] = sext i16 [[X]] to i32
+; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[X_EXT]], [[Y]]
+; CHECK-NEXT: [[C_2:%.*]] = icmp sge i32 [[Y]], -10
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[C_1]], [[C_2]]
+; CHECK-NEXT: br i1 [[AND]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: call void @use(i1 true)
+; CHECK-NEXT: call void @use(i1 true)
+; CHECK-NEXT: [[T_3:%.*]] = icmp sge i32 [[X_EXT]], -9
+; CHECK-NEXT: call void @use(i1 [[T_3]])
+; CHECK-NEXT: [[C_3:%.*]] = icmp sge i32 [[X_EXT]], -9
+; CHECK-NEXT: call void @use(i1 [[C_3]])
+; CHECK-NEXT: [[C_4:%.*]] = icmp sge i32 [[Y]], [[X_EXT]]
+; CHECK-NEXT: call void @use(i1 [[C_4]])
+; CHECK-NEXT: [[C_5:%.*]] = icmp sge i16 [[X]], -9
+; CHECK-NEXT: call void @use(i1 [[C_5]])
+; CHECK-NEXT: ret void
+; CHECK: bb2:
+; CHECK-NEXT: ret void
+;
+entry:
+ %x.ext = sext i16 %x to i32
+ %c.1 = icmp sge i32 %x.ext, %y
+ %c.2 = icmp sge i32 %y, -10
+ %and = and i1 %c.1, %c.2
+ br i1 %and, label %bb1, label %bb2
+
+bb1:
+ %t.1 = icmp sge i32 %x.ext, %y
+ call void @use(i1 %t.1)
+ %t.2 = icmp sge i16 %x, -10
+ call void @use(i1 %t.2)
+ %t.3 = icmp sge i32 %x.ext, -9
+ call void @use(i1 %t.3)
+ %c.3 = icmp sge i32 %x.ext, -9
+ call void @use(i1 %c.3)
+ %c.4 = icmp sge i32 %y, %x.ext
+ call void @use(i1 %c.4)
+ %c.5 = icmp sge i16 %x, -9
+ call void @use(i1 %c.5)
+ ret void
+
+bb2:
+ ret void
+}
diff --git a/llvm/test/Transforms/ConstraintElimination/sext-unsigned-predicates.ll b/llvm/test/Transforms/ConstraintElimination/sext-unsigned-predicates.ll
new file mode 100644
index 000000000000..ac3e57768ae5
--- /dev/null
+++ b/llvm/test/Transforms/ConstraintElimination/sext-unsigned-predicates.ll
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=constraint-elimination -S %s | FileCheck %s
+
+declare void @use(i1)
+
+define void @uge_sext(i16 %x, i32 %y) {
+; CHECK-LABEL: define void @uge_sext(
+; CHECK-SAME: i16 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[X_EXT:%.*]] = sext i16 [[X]] to i32
+; CHECK-NEXT: [[C_1:%.*]] = icmp uge i32 [[X_EXT]], [[Y]]
+; CHECK-NEXT: [[C_2:%.*]] = icmp uge i32 [[Y]], -10
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[C_1]], [[C_2]]
+; CHECK-NEXT: br i1 [[AND]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: call void @use(i1 true)
+; CHECK-NEXT: [[C_3:%.*]] = icmp uge i16 [[X]], -10
+; CHECK-NEXT: call void @use(i1 [[C_3]])
+; CHECK-NEXT: [[C_4:%.*]] = icmp uge i32 [[X_EXT]], -9
+; CHECK-NEXT: call void @use(i1 [[C_4]])
+; CHECK-NEXT: [[C_5:%.*]] = icmp uge i32 [[X_EXT]], -9
+; CHECK-NEXT: call void @use(i1 [[C_5]])
+; CHECK-NEXT: [[C_6:%.*]] = icmp uge i32 [[Y]], [[X_EXT]]
+; CHECK-NEXT: call void @use(i1 [[C_6]])
+; CHECK-NEXT: [[C_7:%.*]] = icmp uge i16 [[X]], -9
+; CHECK-NEXT: call void @use(i1 [[C_7]])
+; CHECK-NEXT: ret void
+; CHECK: bb2:
+; CHECK-NEXT: ret void
+;
+entry:
+ %x.ext = sext i16 %x to i32
+ %c.1 = icmp uge i32 %x.ext, %y
+ %c.2 = icmp uge i32 %y, -10
+ %and = and i1 %c.1, %c.2
+ br i1 %and, label %bb1, label %bb2
+
+bb1:
+ %t.1 = icmp uge i32 %x.ext, %y
+ call void @use(i1 %t.1)
+ %c.3 = icmp uge i16 %x, -10
+ call void @use(i1 %c.3)
+ %c.4 = icmp uge i32 %x.ext, -9
+ call void @use(i1 %c.4)
+ %c.5 = icmp uge i32 %x.ext, -9
+ call void @use(i1 %c.5)
+ %c.6 = icmp uge i32 %y, %x.ext
+ call void @use(i1 %c.6)
+ %c.7 = icmp uge i16 %x, -9
+ call void @use(i1 %c.7)
+ ret void
+
+bb2:
+ ret void
+}
+
+define void @uge_sext_known_positive(i16 %x, i32 %y) {
+; CHECK-LABEL: define void @uge_sext_known_positive(
+; CHECK-SAME: i16 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[X_EXT:%.*]] = sext i16 [[X]] to i32
+; CHECK-NEXT: [[C_2:%.*]] = icmp sge i16 [[X]], 0
+; CHECK-NEXT: [[C_1:%.*]] = icmp uge i32 [[X_EXT]], 10
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[C_2]], [[C_1]]
+; CHECK-NEXT: br i1 [[AND]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: call void @use(i1 true)
+; CHECK-NEXT: [[T_2:%.*]] = icmp uge i16 [[X]], 10
+; CHECK-NEXT: call void @use(i1 [[T_2]])
+; CHECK-NEXT: [[C_3:%.*]] = icmp uge i32 [[X_EXT]], 11
+; CHECK-NEXT: call void @use(i1 [[C_3]])
+; CHECK-NEXT: [[C_4:%.*]] = icmp uge i32 [[X_EXT]], 11
+; CHECK-NEXT: call void @use(i1 [[C_4]])
+; CHECK-NEXT: [[C_5:%.*]] = icmp uge i32 [[Y]], [[X_EXT]]
+; CHECK-NEXT: call void @use(i1 [[C_5]])
+; CHECK-NEXT: [[C_6:%.*]] = icmp uge i16 [[X]], 11
+; CHECK-NEXT: call void @use(i1 [[C_6]])
+; CHECK-NEXT: ret void
+; CHECK: bb2:
+; CHECK-NEXT: ret void
+;
+entry:
+ %x.ext = sext i16 %x to i32
+ %c.2 = icmp sge i16 %x, 0
+ %c.1 = icmp uge i32 %x.ext, 10
+ %and = and i1 %c.2, %c.1
+ br i1 %and, label %bb1, label %bb2
+
+bb1:
+ %t.1 = icmp uge i32 %x.ext, 10
+ call void @use(i1 %t.1)
+ %t.2 = icmp uge i16 %x, 10
+ call void @use(i1 %t.2)
+ %c.3 = icmp uge i32 %x.ext, 11
+ call void @use(i1 %c.3)
+ %c.4 = icmp uge i32 %x.ext, 11
+ call void @use(i1 %c.4)
+ %c.5 = icmp uge i32 %y, %x.ext
+ call void @use(i1 %c.5)
+ %c.6 = icmp uge i16 %x, 11
+ call void @use(i1 %c.6)
+ ret void
+
+bb2:
+ ret void
+}
+
+define i1 @cmp_sext_unsigned(i32 %a, i32 %b){
+; CHECK-LABEL: define i1 @cmp_sext_unsigned(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]]
+; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK: then:
+; CHECK-NEXT: [[SA:%.*]] = sext i32 [[A]] to i64
+; CHECK-NEXT: [[SB:%.*]] = sext i32 [[B]] to i64
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[SA]], 1
+; CHECK-NEXT: [[CMP2:%.*]] = icmp uge i64 [[SB]], [[ADD]]
+; CHECK-NEXT: ret i1 [[CMP2]]
+; CHECK: else:
+; CHECK-NEXT: ret i1 false
+;
+entry:
+ %cmp = icmp slt i32 %a, %b
+ br i1 %cmp, label %then, label %else
+
+then:
+ %sa = sext i32 %a to i64
+ %sb = sext i32 %b to i64
+ %add = add nsw i64 %sa, 1
+ %cmp2 = icmp uge i64 %sb, %add
+ ret i1 %cmp2
+
+else:
+ ret i1 false
+}
diff --git a/llvm/test/Transforms/ConstraintElimination/zext-for-per-formula-reasoning.ll b/llvm/test/Transforms/ConstraintElimination/zext-for-per-formula-reasoning.ll
index 63f5d4d4ba34..7844651a01f9 100644
--- a/llvm/test/Transforms/ConstraintElimination/zext-for-per-formula-reasoning.ll
+++ b/llvm/test/Transforms/ConstraintElimination/zext-for-per-formula-reasoning.ll
@@ -90,11 +90,9 @@ define i1 @gep_zext_idx_adds(ptr %p, i8 %cnt, i8 %off) {
; CHECK-NEXT: [[EXT:%.*]] = zext i8 [[CNT]] to i16
; CHECK-NEXT: [[EXT_1:%.*]] = add nuw nsw i16 [[EXT]], 1
; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i16 [[EXT_1]]
-; CHECK-NEXT: [[T_1:%.*]] = icmp uge ptr [[ADD_PTR]], [[P]]
-; CHECK-NEXT: [[F_1:%.*]] = icmp ult ptr [[ADD_PTR]], [[P]]
; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr inbounds i32, ptr [[P]], i16 11
; CHECK-NEXT: [[C_1:%.*]] = icmp uge ptr [[ADD_PTR]], [[GEP_11]]
-; CHECK-NEXT: [[RES_1:%.*]] = xor i1 [[T_1]], [[F_1]]
+; CHECK-NEXT: [[RES_1:%.*]] = xor i1 true, false
; CHECK-NEXT: [[RES_2:%.*]] = xor i1 [[RES_1]], [[C_1]]
; CHECK-NEXT: ret i1 [[RES_2]]
;
diff --git a/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll b/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll
index 0dee72f4f6b6..5e49437db0a7 100644
--- a/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll
+++ b/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll
@@ -8,12 +8,21 @@
define ptr @vfs_addname(ptr %name, i32 %len, i32 %hash, i32 %flags) nounwind ssp !dbg !1 {
;
+; CHECK-LABEL: define {{[^@]+}}@vfs_addname
+; CHECK-SAME: (ptr [[NAME:%.*]], i32 [[LEN:%.*]], i32 [[HASH:%.*]], i32 [[FLAGS:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG4:![0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: tail call void @llvm.dbg.value(metadata ptr [[NAME]], metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12:![0-9]+]]
+; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[LEN]], metadata [[META13:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12]]
+; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[HASH]], metadata [[META14:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12]]
+; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[FLAGS]], metadata [[META15:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12]]
+; CHECK-NEXT: [[TMP0:%.*]] = call fastcc ptr @add_name_internal(ptr [[NAME]], i32 [[HASH]]) #[[ATTR3:[0-9]+]], !dbg [[DBG16:![0-9]+]]
+; CHECK-NEXT: ret ptr [[TMP0]], !dbg [[DBG16]]
+;
entry:
call void @llvm.dbg.value(metadata ptr %name, metadata !0, metadata !DIExpression()), !dbg !DILocation(scope: !1)
call void @llvm.dbg.value(metadata i32 %len, metadata !10, metadata !DIExpression()), !dbg !DILocation(scope: !1)
call void @llvm.dbg.value(metadata i32 %hash, metadata !11, metadata !DIExpression()), !dbg !DILocation(scope: !1)
call void @llvm.dbg.value(metadata i32 %flags, metadata !12, metadata !DIExpression()), !dbg !DILocation(scope: !1)
-; CHECK: call fastcc ptr @add_name_internal(ptr %name, i32 %hash) [[NUW:#[0-9]+]], !dbg !{{[0-9]+}}
%0 = call fastcc ptr @add_name_internal(ptr %name, i32 %len, i32 %hash, i8 zeroext 0, i32 %flags) nounwind, !dbg !13 ; <ptr> [#uses=1]
ret ptr %0, !dbg !13
}
@@ -22,6 +31,24 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
define internal fastcc ptr @add_name_internal(ptr %name, i32 %len, i32 %hash, i8 zeroext %extra, i32 %flags) noinline nounwind ssp !dbg !16 {
;
+; CHECK-LABEL: define {{[^@]+}}@add_name_internal
+; CHECK-SAME: (ptr [[NAME:%.*]], i32 [[HASH:%.*]]) #[[ATTR1:[0-9]+]] !dbg [[DBG18:![0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: tail call void @llvm.dbg.value(metadata ptr [[NAME]], metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23:![0-9]+]]
+; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 poison, metadata [[META24:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]]
+; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[HASH]], metadata [[META25:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]]
+; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i8 poison, metadata [[META26:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]]
+; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 poison, metadata [[META27:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[HASH]], 0, !dbg [[DBG28:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP0]], label [[BB:%.*]], label [[BB1:%.*]], !dbg [[DBG28]]
+; CHECK: bb:
+; CHECK-NEXT: br label [[BB2:%.*]], !dbg [[DBG30:![0-9]+]]
+; CHECK: bb1:
+; CHECK-NEXT: br label [[BB2]], !dbg [[DBG31:![0-9]+]]
+; CHECK: bb2:
+; CHECK-NEXT: [[DOT0:%.*]] = phi ptr [ @.str, [[BB]] ], [ [[NAME]], [[BB1]] ]
+; CHECK-NEXT: ret ptr [[DOT0]], !dbg [[DBG31]]
+;
entry:
call void @llvm.dbg.value(metadata ptr %name, metadata !15, metadata !DIExpression()), !dbg !DILocation(scope: !16)
call void @llvm.dbg.value(metadata i32 %len, metadata !20, metadata !DIExpression()), !dbg !DILocation(scope: !16)
@@ -47,7 +74,6 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone
; CHECK: attributes #0 = { nounwind ssp }
; CHECK: attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; CHECK: attributes #2 = { noinline nounwind ssp }
-; CHECK: attributes [[NUW]] = { nounwind }
!llvm.dbg.cu = !{!3}
!llvm.module.flags = !{!30}
@@ -68,9 +94,7 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone
!14 = distinct !DILexicalBlock(line: 12, column: 0, file: !28, scope: !1)
!15 = !DILocalVariable(name: "name", line: 17, arg: 1, scope: !16, file: !2, type: !6)
; CHECK: !DISubprogram(name: "add_name_internal"
-; CHECK-SAME: type: ![[MD:[0-9]+]]
!16 = distinct !DISubprogram(name: "add_name_internal", linkageName: "add_name_internal", line: 22, isLocal: true, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, file: !28, scope: !2, type: !17)
-; CHECK: ![[MD]] = !DISubroutineType(cc: DW_CC_nocall, types: !{{[0-9]+}})
!17 = !DISubroutineType(types: !18)
!18 = !{!6, !6, !9, !9, !19, !9}
!19 = !DIBasicType(tag: DW_TAG_base_type, name: "unsigned char", size: 8, align: 8, encoding: DW_ATE_unsigned_char)
diff --git a/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll b/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll
index f932788c73e2..bf846c310a52 100644
--- a/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll
+++ b/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll
@@ -51,14 +51,7 @@ entry:
ret void
}
-; CHECK: define internal void @outlined_ir_func_0(ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], ptr [[ARG2:%.*]]) #1 {
; CHECK: entry_to_outline:
-; CHECK-NEXT: store i32 2, ptr [[ARG0]], align 4
-; CHECK-NEXT: store i32 3, ptr [[ARG1]], align 4
-; CHECK-NEXT: store i32 4, ptr [[ARG2]], align 4
-; CHECK-NEXT: [[AL:%.*]] = load i32, ptr [[ARG0]], align 4
-; CHECK-NEXT: [[BL:%.*]] = load i32, ptr [[ARG1]], align 4
-; CHECK-NEXT: [[CL:%.*]] = load i32, ptr [[ARG2]], align 4
!0 = !DIFile(filename: "foo.c", directory: "/tmp")
!1 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
diff --git a/llvm/test/Transforms/InstCombine/AArch64/VectorUtils_heuristics.ll b/llvm/test/Transforms/InstCombine/AArch64/VectorUtils_heuristics.ll
index b98036693ba0..cf5fc5ed7da0 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/VectorUtils_heuristics.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/VectorUtils_heuristics.ll
@@ -10,9 +10,9 @@ target triple = "aarch64-unknown-linux-gnu"
; CHECK: unreachable
define void @novel_algorithm() {
entry:
- %a = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr undef, i32 1, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> undef, i1 true, i32 0), <vscale x 16 x i1> undef, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i8> undef)
+ %a = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr undef, i32 1, <vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> undef)
%b = add <vscale x 16 x i8> undef, %a
- call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %b, ptr undef, i32 1, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> undef, i1 true, i32 0), <vscale x 16 x i1> undef, <vscale x 16 x i32> zeroinitializer))
+ call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %b, ptr undef, i32 1, <vscale x 16 x i1> splat (i1 true))
unreachable
}
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-sdiv.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-sdiv.ll
index 6d91fd6bb850..68d871355e2a 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-sdiv.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-sdiv.ll
@@ -8,7 +8,7 @@ define <vscale x 4 x i32> @sdiv_i32(<vscale x 4 x i32> %a, <vscale x 4 x i1> %pg
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.asrd.nxv4i32(<vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]], i32 23)
; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
;
- %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sdiv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 8388608, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sdiv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 8388608))
ret <vscale x 4 x i32> %out
}
@@ -18,7 +18,7 @@ define <vscale x 4 x i32> @sdiv_i32_neg(<vscale x 4 x i32> %a, <vscale x 4 x i1>
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.neg.nxv4i32(<vscale x 4 x i32> [[TMP1]], <vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[TMP1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
;
- %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sdiv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -8388608, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sdiv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 -8388608))
ret <vscale x 4 x i32> %out
}
@@ -27,7 +27,7 @@ define <vscale x 2 x i64> @sdiv_i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> %pg
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.asrd.nxv2i64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]], i32 23)
; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
;
- %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sdiv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 8388608, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer))
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sdiv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 8388608))
ret <vscale x 2 x i64> %out
}
@@ -37,25 +37,25 @@ define <vscale x 2 x i64> @sdiv_i64_neg(<vscale x 2 x i64> %a, <vscale x 2 x i1>
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> [[TMP1]], <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[TMP1]])
; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
;
- %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sdiv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 -8388608, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer))
+ %out = call <vscale x 2 x i64> @llvm.aarch64.sve.sdiv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 -8388608))
ret <vscale x 2 x i64> %out
}
define <vscale x 4 x i32> @sdiv_i32_not_base2(<vscale x 4 x i32> %a, <vscale x 4 x i1> %pg) #0 {
; CHECK-LABEL: @sdiv_i32_not_base2(
-; CHECK-NEXT: [[OUT:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.sdiv.nxv4i32(<vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 8388607, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT: [[OUT:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.sdiv.nxv4i32(<vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 8388607, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 4 x i32> [[OUT]]
;
- %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sdiv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 8388607, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sdiv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 8388607))
ret <vscale x 4 x i32> %out
}
define <vscale x 4 x i32> @sdiv_i32_not_base2_neg(<vscale x 4 x i32> %a, <vscale x 4 x i1> %pg) #0 {
; CHECK-LABEL: @sdiv_i32_not_base2_neg(
-; CHECK-NEXT: [[OUT:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.sdiv.nxv4i32(<vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -8388607, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT: [[OUT:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.sdiv.nxv4i32(<vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -8388607, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 4 x i32> [[OUT]]
;
- %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sdiv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -8388607, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sdiv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 -8388607))
ret <vscale x 4 x i32> %out
}
@@ -64,7 +64,7 @@ define <vscale x 4 x i32> @sdiv_i32_not_zero(<vscale x 4 x i32> %a, <vscale x 4
; CHECK-NEXT: [[OUT:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.sdiv.nxv4i32(<vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: ret <vscale x 4 x i32> [[OUT]]
;
- %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sdiv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 0, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
+ %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sdiv.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 0))
ret <vscale x 4 x i32> %out
}
diff --git a/llvm/test/Transforms/InstCombine/add.ll b/llvm/test/Transforms/InstCombine/add.ll
index 6242fc6f528a..522dcf8db27f 100644
--- a/llvm/test/Transforms/InstCombine/add.ll
+++ b/llvm/test/Transforms/InstCombine/add.ll
@@ -2574,12 +2574,12 @@ define i16 @add_sub_zext_constant(i8 %x) {
define <vscale x 1 x i32> @add_to_or_scalable(<vscale x 1 x i32> %in) {
; CHECK-LABEL: @add_to_or_scalable(
-; CHECK-NEXT: [[SHL:%.*]] = shl <vscale x 1 x i32> [[IN:%.*]], shufflevector (<vscale x 1 x i32> insertelement (<vscale x 1 x i32> poison, i32 1, i32 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer)
-; CHECK-NEXT: [[ADD:%.*]] = or disjoint <vscale x 1 x i32> [[SHL]], shufflevector (<vscale x 1 x i32> insertelement (<vscale x 1 x i32> poison, i32 1, i32 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer)
+; CHECK-NEXT: [[SHL:%.*]] = shl <vscale x 1 x i32> [[IN:%.*]], shufflevector (<vscale x 1 x i32> insertelement (<vscale x 1 x i32> poison, i32 1, i64 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer)
+; CHECK-NEXT: [[ADD:%.*]] = or disjoint <vscale x 1 x i32> [[SHL]], shufflevector (<vscale x 1 x i32> insertelement (<vscale x 1 x i32> poison, i32 1, i64 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer)
; CHECK-NEXT: ret <vscale x 1 x i32> [[ADD]]
;
- %shl = shl <vscale x 1 x i32> %in, shufflevector (<vscale x 1 x i32> insertelement (<vscale x 1 x i32> poison, i32 1, i32 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer)
- %add = add <vscale x 1 x i32> %shl, shufflevector (<vscale x 1 x i32> insertelement (<vscale x 1 x i32> poison, i32 1, i32 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer)
+ %shl = shl <vscale x 1 x i32> %in, splat (i32 1)
+ %add = add <vscale x 1 x i32> %shl, splat (i32 1)
ret <vscale x 1 x i32> %add
}
diff --git a/llvm/test/Transforms/InstCombine/bitcast.ll b/llvm/test/Transforms/InstCombine/bitcast.ll
index 58bd81297b0d..176b432ea0b5 100644
--- a/llvm/test/Transforms/InstCombine/bitcast.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast.ll
@@ -577,7 +577,7 @@ define <vscale x 1 x i32> @ScalableAll111(<vscale x 1 x i32> %in) {
; CHECK-LABEL: @ScalableAll111(
; CHECK-NEXT: ret <vscale x 1 x i32> [[IN:%.*]]
;
- %out = and <vscale x 1 x i32> %in, bitcast (<vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> undef, i16 -1, i32 0), <vscale x 2 x i16> undef, <vscale x 2 x i32> zeroinitializer) to <vscale x 1 x i32>)
+ %out = and <vscale x 1 x i32> %in, bitcast (<vscale x 2 x i16> splat (i16 -1) to <vscale x 1 x i32>)
ret <vscale x 1 x i32> %out
}
diff --git a/llvm/test/Transforms/InstCombine/div.ll b/llvm/test/Transforms/InstCombine/div.ll
index 80847dd588ea..1309dee817cf 100644
--- a/llvm/test/Transforms/InstCombine/div.ll
+++ b/llvm/test/Transforms/InstCombine/div.ll
@@ -1065,7 +1065,7 @@ define <vscale x 2 x i8> @sdiv_by_negconst_nxv2i8(<vscale x 2 x i8> %x) {
; CHECK-NEXT: [[DIV_NEG:%.*]] = sdiv <vscale x 2 x i8> [[X:%.*]], shufflevector (<vscale x 2 x i8> insertelement (<vscale x 2 x i8> poison, i8 108, i64 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: ret <vscale x 2 x i8> [[DIV_NEG]]
;
- %div = sdiv <vscale x 2 x i8> %x, shufflevector (<vscale x 2 x i8> insertelement (<vscale x 2 x i8> poison, i8 -108, i32 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer)
+ %div = sdiv <vscale x 2 x i8> %x, splat (i8 -108)
%sub = sub <vscale x 2 x i8> zeroinitializer, %div
ret <vscale x 2 x i8> %sub
}
@@ -1083,11 +1083,11 @@ define <2 x i8> @sdiv_by_minSigned_v2i8(<2 x i8> %x) {
define <vscale x 2 x i8> @sdiv_by_minSigned_nxv2i8(<vscale x 2 x i8> %x) {
; CHECK-LABEL: @sdiv_by_minSigned_nxv2i8(
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <vscale x 2 x i8> [[X:%.*]], shufflevector (<vscale x 2 x i8> insertelement (<vscale x 2 x i8> poison, i8 -128, i32 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <vscale x 2 x i8> [[X:%.*]], shufflevector (<vscale x 2 x i8> insertelement (<vscale x 2 x i8> poison, i8 -128, i64 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[DIV_NEG:%.*]] = sext <vscale x 2 x i1> [[TMP1]] to <vscale x 2 x i8>
; CHECK-NEXT: ret <vscale x 2 x i8> [[DIV_NEG]]
;
- %div = sdiv <vscale x 2 x i8> %x, shufflevector (<vscale x 2 x i8> insertelement (<vscale x 2 x i8> poison, i8 -128, i32 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer)
+ %div = sdiv <vscale x 2 x i8> %x, splat (i8 -128)
%sub = sub <vscale x 2 x i8> zeroinitializer, %div
ret <vscale x 2 x i8> %sub
}
diff --git a/llvm/test/Transforms/InstCombine/fdiv.ll b/llvm/test/Transforms/InstCombine/fdiv.ll
index 448f9f8d7dd6..a0710c2bb048 100644
--- a/llvm/test/Transforms/InstCombine/fdiv.ll
+++ b/llvm/test/Transforms/InstCombine/fdiv.ll
@@ -90,7 +90,7 @@ define <vscale x 2 x float> @exact_inverse_scalable_splat(<vscale x 2 x float> %
; CHECK-NEXT: [[DIV:%.*]] = fmul <vscale x 2 x float> [[X:%.*]], shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 2.500000e-01, i64 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: ret <vscale x 2 x float> [[DIV]]
;
- %div = fdiv <vscale x 2 x float> %x, shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 4.0, i64 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer)
+ %div = fdiv <vscale x 2 x float> %x, splat (float 4.0)
ret <vscale x 2 x float> %div
}
diff --git a/llvm/test/Transforms/InstCombine/fmul.ll b/llvm/test/Transforms/InstCombine/fmul.ll
index fe99d7284487..7e7373e6ef5b 100644
--- a/llvm/test/Transforms/InstCombine/fmul.ll
+++ b/llvm/test/Transforms/InstCombine/fmul.ll
@@ -794,12 +794,12 @@ define <2 x float> @fmul_fadd_distribute_vec(<2 x float> %x) {
define <vscale x 2 x float> @fmul_fadd_distribute_scalablevec(<vscale x 2 x float> %x) {
; CHECK-LABEL: @fmul_fadd_distribute_scalablevec(
-; CHECK-NEXT: [[TMP1:%.*]] = fmul reassoc <vscale x 2 x float> [[X:%.*]], shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 6.000000e+03, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP1:%.*]] = fmul reassoc <vscale x 2 x float> [[X:%.*]], shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 6.000000e+03, i64 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[T3:%.*]] = fadd reassoc <vscale x 2 x float> [[TMP1]], shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 1.200000e+07, i64 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: ret <vscale x 2 x float> [[T3]]
;
- %t1 = fadd <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 2.0e+3, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer), %x
- %t3 = fmul reassoc <vscale x 2 x float> %t1, shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 6.0e+3, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer)
+ %t1 = fadd <vscale x 2 x float> splat (float 2.0e+3), %x
+ %t3 = fmul reassoc <vscale x 2 x float> %t1, splat (float 6.0e+3)
ret <vscale x 2 x float> %t3
diff --git a/llvm/test/Transforms/InstCombine/icmp-vec.ll b/llvm/test/Transforms/InstCombine/icmp-vec.ll
index 38e30334342b..63dd535c527e 100644
--- a/llvm/test/Transforms/InstCombine/icmp-vec.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-vec.ll
@@ -394,11 +394,11 @@ define <2 x i1> @icmp_logical_or_vec(<2 x i64> %x, <2 x i64> %y, <2 x i1> %false
define <vscale x 2 x i1> @icmp_logical_or_scalablevec(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y, <vscale x 2 x i1> %falseval) {
; CHECK-LABEL: @icmp_logical_or_scalablevec(
; CHECK-NEXT: [[CMP_NE:%.*]] = icmp ne <vscale x 2 x i64> [[X:%.*]], zeroinitializer
-; CHECK-NEXT: [[SEL:%.*]] = select <vscale x 2 x i1> [[CMP_NE]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> [[FALSEVAL:%.*]]
+; CHECK-NEXT: [[SEL:%.*]] = select <vscale x 2 x i1> [[CMP_NE]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> [[FALSEVAL:%.*]]
; CHECK-NEXT: ret <vscale x 2 x i1> [[SEL]]
;
%cmp.ne = icmp ne <vscale x 2 x i64> %x, zeroinitializer
- %sel = select <vscale x 2 x i1> %cmp.ne, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %falseval
+ %sel = select <vscale x 2 x i1> %cmp.ne, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i1> %falseval
ret <vscale x 2 x i1> %sel
}
diff --git a/llvm/test/Transforms/InstCombine/intrinsics.ll b/llvm/test/Transforms/InstCombine/intrinsics.ll
index 3ac7c728e198..d90b0ebd400c 100644
--- a/llvm/test/Transforms/InstCombine/intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/intrinsics.ll
@@ -129,9 +129,9 @@ define <vscale x 1 x i1> @cttz_knownbits_scalable_vec(<vscale x 1 x i32> %arg) {
; CHECK-LABEL: @cttz_knownbits_scalable_vec(
; CHECK-NEXT: ret <vscale x 1 x i1> zeroinitializer
;
- %or = or <vscale x 1 x i32> %arg, shufflevector (<vscale x 1 x i32> insertelement (<vscale x 1 x i32> poison, i32 4, i32 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer)
+ %or = or <vscale x 1 x i32> %arg, splat (i32 4)
%cnt = call <vscale x 1 x i32> @llvm.cttz.nxv1i32(<vscale x 1 x i32> %or, i1 true) nounwind readnone
- %res = icmp eq <vscale x 1 x i32> %cnt, shufflevector (<vscale x 1 x i32> insertelement (<vscale x 1 x i32> poison, i32 4, i32 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer)
+ %res = icmp eq <vscale x 1 x i32> %cnt, splat (i32 4)
ret <vscale x 1 x i1> %res
}
diff --git a/llvm/test/Transforms/InstCombine/load-store-forward.ll b/llvm/test/Transforms/InstCombine/load-store-forward.ll
index aa593b95b360..dbc68044c11a 100644
--- a/llvm/test/Transforms/InstCombine/load-store-forward.ll
+++ b/llvm/test/Transforms/InstCombine/load-store-forward.ll
@@ -108,7 +108,7 @@ define i32 @load_i32_store_nxv4i32(ptr %a) {
; CHECK-NEXT: ret i32 [[TMP0]]
;
entry:
- store <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), ptr %a, align 16
+ store <vscale x 4 x i32> splat (i32 1), ptr %a, align 16
%0 = load i32, ptr %a, align 4
ret i32 %0
}
@@ -116,12 +116,12 @@ entry:
define i64 @load_i64_store_nxv8i8(ptr %a) {
; CHECK-LABEL: @load_i64_store_nxv8i8(
; CHECK-NEXT: entry:
-; CHECK-NEXT: store <vscale x 8 x i8> shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 1, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer), ptr [[A:%.*]], align 16
+; CHECK-NEXT: store <vscale x 8 x i8> shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 1, i64 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer), ptr [[A:%.*]], align 16
; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[A]], align 8
; CHECK-NEXT: ret i64 [[LOAD]]
;
entry:
- store <vscale x 8 x i8> shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 1, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer), ptr %a, align 16
+ store <vscale x 8 x i8> splat (i8 1), ptr %a, align 16
%load = load i64, ptr %a, align 8
ret i64 %load
}
@@ -134,7 +134,7 @@ define i64 @load_i64_store_nxv4i32(ptr %a) {
; CHECK-NEXT: ret i64 [[LOAD]]
;
entry:
- store <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), ptr %a, align 16
+ store <vscale x 4 x i32> splat (i32 1), ptr %a, align 16
%load = load i64, ptr %a, align 8
ret i64 %load
}
@@ -147,7 +147,7 @@ define i8 @load_i8_store_nxv4i32(ptr %a) {
; CHECK-NEXT: ret i8 [[LOAD]]
;
entry:
- store <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), ptr %a, align 16
+ store <vscale x 4 x i32> splat (i32 1), ptr %a, align 16
%load = load i8, ptr %a, align 1
ret i8 %load
}
@@ -160,7 +160,7 @@ define float @load_f32_store_nxv4f32(ptr %a) {
; CHECK-NEXT: ret float [[TMP0]]
;
entry:
- store <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.0, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), ptr %a, align 16
+ store <vscale x 4 x float> splat (float 1.0), ptr %a, align 16
%0 = load float, ptr %a, align 4
ret float %0
}
@@ -173,7 +173,7 @@ define i32 @load_i32_store_nxv4f32(ptr %a) {
; CHECK-NEXT: ret i32 [[LOAD]]
;
entry:
- store <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.0, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), ptr %a, align 16
+ store <vscale x 4 x float> splat (float 1.0), ptr %a, align 16
%load = load i32, ptr %a, align 4
ret i32 %load
}
@@ -186,7 +186,7 @@ define <4 x i32> @load_v4i32_store_nxv4i32(ptr %a) {
; CHECK-NEXT: ret <4 x i32> [[TMP0]]
;
entry:
- store <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), ptr %a, align 16
+ store <vscale x 4 x i32> splat (i32 1), ptr %a, align 16
%0 = load <4 x i32>, ptr %a, align 16
ret <4 x i32> %0
}
@@ -199,7 +199,7 @@ define <4 x i16> @load_v4i16_store_nxv4i32(ptr %a) {
; CHECK-NEXT: ret <4 x i16> [[TMP0]]
;
entry:
- store <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), ptr %a, align 16
+ store <vscale x 4 x i32> splat (i32 1), ptr %a, align 16
%0 = load <4 x i16>, ptr %a, align 16
ret <4 x i16> %0
}
@@ -208,12 +208,12 @@ entry:
define i64 @load_i64_store_nxv4i8(ptr %a) {
; CHECK-LABEL: @load_i64_store_nxv4i8(
; CHECK-NEXT: entry:
-; CHECK-NEXT: store <vscale x 4 x i8> shufflevector (<vscale x 4 x i8> insertelement (<vscale x 4 x i8> poison, i8 1, i32 0), <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer), ptr [[A:%.*]], align 16
+; CHECK-NEXT: store <vscale x 4 x i8> shufflevector (<vscale x 4 x i8> insertelement (<vscale x 4 x i8> poison, i8 1, i64 0), <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer), ptr [[A:%.*]], align 16
; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[A]], align 8
; CHECK-NEXT: ret i64 [[LOAD]]
;
entry:
- store <vscale x 4 x i8> shufflevector (<vscale x 4 x i8> insertelement (<vscale x 4 x i8> poison, i8 1, i32 0), <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer), ptr %a, align 16
+ store <vscale x 4 x i8> splat (i8 1), ptr %a, align 16
%load = load i64, ptr %a, align 8
ret i64 %load
}
@@ -228,7 +228,7 @@ define <vscale x 4 x i8> @load_nxv4i8_store_nxv4i32(ptr %a) {
; CHECK-NEXT: ret <vscale x 4 x i8> [[TMP0]]
;
entry:
- store <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), ptr %a, align 16
+ store <vscale x 4 x i32> splat (i32 1), ptr %a, align 16
%0 = load <vscale x 4 x i8>, ptr %a, align 16
ret <vscale x 4 x i8> %0
}
diff --git a/llvm/test/Transforms/InstCombine/logical-select.ll b/llvm/test/Transforms/InstCombine/logical-select.ll
index af1a3e1455e4..f0ea09c08847 100644
--- a/llvm/test/Transforms/InstCombine/logical-select.ll
+++ b/llvm/test/Transforms/InstCombine/logical-select.ll
@@ -480,7 +480,7 @@ define <vscale x 1 x i1> @vec_of_bools_scalable(<vscale x 1 x i1> %a, <vscale x
; CHECK-NEXT: [[R:%.*]] = select <vscale x 1 x i1> [[A:%.*]], <vscale x 1 x i1> [[C:%.*]], <vscale x 1 x i1> [[D:%.*]]
; CHECK-NEXT: ret <vscale x 1 x i1> [[R]]
;
- %b = xor <vscale x 1 x i1> %a, shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
+ %b = xor <vscale x 1 x i1> %a, splat (i1 true)
%t11 = and <vscale x 1 x i1> %a, %c
%t12 = and <vscale x 1 x i1> %b, %d
%r = or <vscale x 1 x i1> %t11, %t12
@@ -513,7 +513,7 @@ define <vscale x 1 x i64> @vec_of_casted_bools_scalable(<vscale x 1 x i64> %a, <
; CHECK-NEXT: ret <vscale x 1 x i64> [[OR]]
;
%scond = sext <vscale x 8 x i1> %cond to <vscale x 8 x i8>
- %notcond = xor <vscale x 8 x i1> %cond, shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i32 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+ %notcond = xor <vscale x 8 x i1> %cond, splat (i1 true)
%snotcond = sext <vscale x 8 x i1> %notcond to <vscale x 8 x i8>
%bc1 = bitcast <vscale x 8 x i8> %scond to <vscale x 1 x i64>
%bc2 = bitcast <vscale x 8 x i8> %snotcond to <vscale x 1 x i64>
@@ -750,7 +750,7 @@ define <vscale x 2 x i64> @bitcast_vec_cond_scalable(<vscale x 16 x i1> %cond, <
;
%s = sext <vscale x 16 x i1> %cond to <vscale x 16 x i8>
%t9 = bitcast <vscale x 16 x i8> %s to <vscale x 2 x i64>
- %nott9 = xor <vscale x 2 x i64> %t9, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 -1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %nott9 = xor <vscale x 2 x i64> %t9, splat (i64 -1)
%t11 = and <vscale x 2 x i64> %nott9, %c
%t12 = and <vscale x 2 x i64> %t9, %d
%r = or <vscale x 2 x i64> %t11, %t12
diff --git a/llvm/test/Transforms/InstCombine/masked_intrinsics.ll b/llvm/test/Transforms/InstCombine/masked_intrinsics.ll
index 2704905f7a35..15ffc881b573 100644
--- a/llvm/test/Transforms/InstCombine/masked_intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/masked_intrinsics.ll
@@ -300,7 +300,7 @@ entry:
%broadcast.splat = shufflevector <vscale x 4 x ptr> %broadcast.splatinsert, <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
%broadcast.value = insertelement <vscale x 4 x i16> poison, i16 %val, i32 0
%broadcast.splatvalue = shufflevector <vscale x 4 x i16> %broadcast.value, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
- call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> %broadcast.splatvalue, <vscale x 4 x ptr> %broadcast.splat, i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> zeroinitializer , i1 true, i32 0), <vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> zeroinitializer))
+ call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> %broadcast.splatvalue, <vscale x 4 x ptr> %broadcast.splat, i32 2, <vscale x 4 x i1> splat (i1 true))
ret void
}
@@ -336,7 +336,7 @@ entry:
%broadcast.splatinsert = insertelement <vscale x 4 x ptr> poison, ptr %dst, i32 0
%broadcast.splat = shufflevector <vscale x 4 x ptr> %broadcast.splatinsert, <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
%wide.load = load <vscale x 4 x i16>, ptr %src, align 2
- call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> %wide.load, <vscale x 4 x ptr> %broadcast.splat, i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+ call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> %wide.load, <vscale x 4 x ptr> %broadcast.splat, i32 2, <vscale x 4 x i1> splat (i1 true))
ret void
}
@@ -389,7 +389,7 @@ define <vscale x 2 x i64> @gather_nxv2i64_uniform_ptrs_all_active_mask(ptr %src)
;
%broadcast.splatinsert = insertelement <vscale x 2 x ptr> poison, ptr %src, i32 0
%broadcast.splat = shufflevector <vscale x 2 x ptr> %broadcast.splatinsert, <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
- %res = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x ptr> %broadcast.splat, i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> undef)
+ %res = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x ptr> %broadcast.splat, i32 8, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> undef)
ret <vscale x 2 x i64> %res
}
diff --git a/llvm/test/Transforms/InstCombine/mul-masked-bits.ll b/llvm/test/Transforms/InstCombine/mul-masked-bits.ll
index bc0bee8fc603..da7cc2db0978 100644
--- a/llvm/test/Transforms/InstCombine/mul-masked-bits.ll
+++ b/llvm/test/Transforms/InstCombine/mul-masked-bits.ll
@@ -86,7 +86,7 @@ define <vscale x 2 x i32> @combine_mul_self_demandedbits_vector2(<vscale x 2 x i
;
%1 = freeze <vscale x 2 x i32> %x
%2 = mul <vscale x 2 x i32> %1, %1
- %3 = and <vscale x 2 x i32> %2, shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 -3, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+ %3 = and <vscale x 2 x i32> %2, splat (i32 -3)
ret <vscale x 2 x i32> %3
}
diff --git a/llvm/test/Transforms/InstCombine/rem-mul-shl.ll b/llvm/test/Transforms/InstCombine/rem-mul-shl.ll
index 75f4d3f4fb07..9e2df157c2c8 100644
--- a/llvm/test/Transforms/InstCombine/rem-mul-shl.ll
+++ b/llvm/test/Transforms/InstCombine/rem-mul-shl.ll
@@ -35,8 +35,8 @@ define <vscale x 16 x i8> @urem_XY_XZ_with_CY_rem_CZ_eq_0_scalable(<vscale x 16
; CHECK-LABEL: @urem_XY_XZ_with_CY_rem_CZ_eq_0_scalable(
; CHECK-NEXT: ret <vscale x 16 x i8> zeroinitializer
;
- %BO0 = mul nuw <vscale x 16 x i8> %X, shufflevector(<vscale x 16 x i8> insertelement(<vscale x 16 x i8> poison, i8 15, i64 0) , <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
- %BO1 = mul <vscale x 16 x i8> %X, shufflevector(<vscale x 16 x i8> insertelement(<vscale x 16 x i8> poison, i8 5, i64 0) , <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+ %BO0 = mul nuw <vscale x 16 x i8> %X, splat (i8 15)
+ %BO1 = mul <vscale x 16 x i8> %X, splat (i8 5)
%r = urem <vscale x 16 x i8> %BO0, %BO1
ret <vscale x 16 x i8> %r
}
@@ -253,8 +253,8 @@ define <vscale x 16 x i8> @srem_XY_XZ_with_CY_rem_CZ_eq_0_scalable(<vscale x 16
; CHECK-LABEL: @srem_XY_XZ_with_CY_rem_CZ_eq_0_scalable(
; CHECK-NEXT: ret <vscale x 16 x i8> zeroinitializer
;
- %BO0 = mul nsw <vscale x 16 x i8> %X, shufflevector(<vscale x 16 x i8> insertelement(<vscale x 16 x i8> poison, i8 15, i64 0) , <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
- %BO1 = mul <vscale x 16 x i8> %X, shufflevector(<vscale x 16 x i8> insertelement(<vscale x 16 x i8> poison, i8 5, i64 0) , <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+ %BO0 = mul nsw <vscale x 16 x i8> %X, splat (i8 15)
+ %BO1 = mul <vscale x 16 x i8> %X, splat (i8 5)
%r = srem <vscale x 16 x i8> %BO0, %BO1
ret <vscale x 16 x i8> %r
}
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index c5f1b77c6d74..82baf05977db 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -3400,11 +3400,11 @@ define i32 @select_cond_not_cond_cond2(i1 %cond) {
; scalable vector splat ConstantExprs.
define <vscale x 2 x i32> @and_constant_select_svec(<vscale x 2 x i32> %x, <vscale x 2 x i1> %cond) {
; CHECK-LABEL: @and_constant_select_svec(
-; CHECK-NEXT: [[A:%.*]] = and <vscale x 2 x i32> [[X:%.*]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: [[A:%.*]] = and <vscale x 2 x i32> [[X:%.*]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[B:%.*]] = select <vscale x 2 x i1> [[COND:%.*]], <vscale x 2 x i32> [[A]], <vscale x 2 x i32> [[X]]
; CHECK-NEXT: ret <vscale x 2 x i32> [[B]]
;
- %a = and <vscale x 2 x i32> %x, shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+ %a = and <vscale x 2 x i32> %x, splat (i32 1)
%b = select <vscale x 2 x i1> %cond, <vscale x 2 x i32> %a, <vscale x 2 x i32> %x
ret <vscale x 2 x i32> %b
}
@@ -3412,23 +3412,23 @@ define <vscale x 2 x i32> @and_constant_select_svec(<vscale x 2 x i32> %x, <vsca
define <vscale x 2 x i32> @scalable_sign_bits(<vscale x 2 x i8> %x) {
; CHECK-LABEL: @scalable_sign_bits(
; CHECK-NEXT: [[A:%.*]] = sext <vscale x 2 x i8> [[X:%.*]] to <vscale x 2 x i32>
-; CHECK-NEXT: [[B:%.*]] = shl nsw <vscale x 2 x i32> [[A]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 16, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: [[B:%.*]] = shl nsw <vscale x 2 x i32> [[A]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 16, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: ret <vscale x 2 x i32> [[B]]
;
%a = sext <vscale x 2 x i8> %x to <vscale x 2 x i32>
- %b = shl <vscale x 2 x i32> %a, shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 16, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+ %b = shl <vscale x 2 x i32> %a, splat (i32 16)
ret <vscale x 2 x i32> %b
}
define <vscale x 2 x i1> @scalable_non_zero(<vscale x 2 x i32> %x) {
; CHECK-LABEL: @scalable_non_zero(
-; CHECK-NEXT: [[A:%.*]] = or <vscale x 2 x i32> [[X:%.*]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: [[A:%.*]] = or <vscale x 2 x i32> [[X:%.*]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[CMP:%.*]] = icmp ule <vscale x 2 x i32> [[A]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 56, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: ret <vscale x 2 x i1> [[CMP]]
;
- %a = or <vscale x 2 x i32> %x, shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
- %b = add <vscale x 2 x i32> %a, shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 -1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
- %cmp = icmp ult <vscale x 2 x i32> %b, shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 56, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+ %a = or <vscale x 2 x i32> %x, splat (i32 1)
+ %b = add <vscale x 2 x i32> %a, splat (i32 -1)
+ %cmp = icmp ult <vscale x 2 x i32> %b, splat (i32 56)
ret <vscale x 2 x i1> %cmp
}
diff --git a/llvm/test/Transforms/InstCombine/shift.ll b/llvm/test/Transforms/InstCombine/shift.ll
index d783adbe9386..62f32c286837 100644
--- a/llvm/test/Transforms/InstCombine/shift.ll
+++ b/llvm/test/Transforms/InstCombine/shift.ll
@@ -1361,11 +1361,11 @@ define <2 x i8> @ashr_demanded_bits_splat(<2 x i8> %x) {
define <vscale x 8 x i8> @ashr_demanded_bits_splat2(<vscale x 8 x i8> %x) {
; CHECK-LABEL: @ashr_demanded_bits_splat2(
-; CHECK-NEXT: [[SHR:%.*]] = ashr <vscale x 8 x i8> [[X:%.*]], shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 7, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: [[SHR:%.*]] = ashr <vscale x 8 x i8> [[X:%.*]], shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 7, i64 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
; CHECK-NEXT: ret <vscale x 8 x i8> [[SHR]]
;
- %and = and <vscale x 8 x i8> %x, shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 128, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
- %shr = ashr <vscale x 8 x i8> %and, shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 7, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
+ %and = and <vscale x 8 x i8> %x, splat (i8 128)
+ %shr = ashr <vscale x 8 x i8> %and, splat (i8 7)
ret <vscale x 8 x i8> %shr
}
@@ -1381,11 +1381,11 @@ define <2 x i8> @lshr_demanded_bits_splat(<2 x i8> %x) {
define <vscale x 8 x i8> @lshr_demanded_bits_splat2(<vscale x 8 x i8> %x) {
; CHECK-LABEL: @lshr_demanded_bits_splat2(
-; CHECK-NEXT: [[SHR:%.*]] = lshr <vscale x 8 x i8> [[X:%.*]], shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 7, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: [[SHR:%.*]] = lshr <vscale x 8 x i8> [[X:%.*]], shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 7, i64 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
; CHECK-NEXT: ret <vscale x 8 x i8> [[SHR]]
;
- %and = and <vscale x 8 x i8> %x, shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 128, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
- %shr = lshr <vscale x 8 x i8> %and, shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 7, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
+ %and = and <vscale x 8 x i8> %x, splat (i8 128)
+ %shr = lshr <vscale x 8 x i8> %and, splat (i8 7)
ret <vscale x 8 x i8> %shr
}
diff --git a/llvm/test/Transforms/InstCombine/sub.ll b/llvm/test/Transforms/InstCombine/sub.ll
index 494cdf62c797..76cd7ab5c10c 100644
--- a/llvm/test/Transforms/InstCombine/sub.ll
+++ b/llvm/test/Transforms/InstCombine/sub.ll
@@ -844,7 +844,7 @@ define <vscale x 2 x i32> @test44scalablevec(<vscale x 2 x i32> %x) {
; CHECK-NEXT: [[SUB:%.*]] = add nsw <vscale x 2 x i32> [[X:%.*]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 -32768, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: ret <vscale x 2 x i32> [[SUB]]
;
- %sub = sub nsw <vscale x 2 x i32> %x, shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> undef, i32 32768, i32 0), <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer)
+ %sub = sub nsw <vscale x 2 x i32> %x, splat (i32 32768)
ret <vscale x 2 x i32> %sub
}
@@ -864,7 +864,7 @@ define <vscale x 2 x i16> @test44scalablevecminval(<vscale x 2 x i16> %x) {
; CHECK-NEXT: [[SUB:%.*]] = add <vscale x 2 x i16> [[X:%.*]], shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 -32768, i64 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: ret <vscale x 2 x i16> [[SUB]]
;
- %sub = sub nsw <vscale x 2 x i16> %x, shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> undef, i16 -32768, i32 0), <vscale x 2 x i16> undef, <vscale x 2 x i32> zeroinitializer)
+ %sub = sub nsw <vscale x 2 x i16> %x, splat (i16 -32768)
ret <vscale x 2 x i16> %sub
}
diff --git a/llvm/test/Transforms/InstCombine/udiv-simplify.ll b/llvm/test/Transforms/InstCombine/udiv-simplify.ll
index 41a4e642b4e8..bd6e5efc05f1 100644
--- a/llvm/test/Transforms/InstCombine/udiv-simplify.ll
+++ b/llvm/test/Transforms/InstCombine/udiv-simplify.ll
@@ -157,11 +157,11 @@ define i8 @udiv_exact_demanded_low_bits_clear(i8 %a) {
define <vscale x 1 x i32> @udiv_demanded3(<vscale x 1 x i32> %a) {
; CHECK-LABEL: @udiv_demanded3(
-; CHECK-NEXT: [[U:%.*]] = udiv <vscale x 1 x i32> [[A:%.*]], shufflevector (<vscale x 1 x i32> insertelement (<vscale x 1 x i32> poison, i32 12, i32 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer)
+; CHECK-NEXT: [[U:%.*]] = udiv <vscale x 1 x i32> [[A:%.*]], shufflevector (<vscale x 1 x i32> insertelement (<vscale x 1 x i32> poison, i32 12, i64 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer)
; CHECK-NEXT: ret <vscale x 1 x i32> [[U]]
;
- %o = or <vscale x 1 x i32> %a, shufflevector (<vscale x 1 x i32> insertelement (<vscale x 1 x i32> poison, i32 3, i32 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer)
- %u = udiv <vscale x 1 x i32> %o, shufflevector (<vscale x 1 x i32> insertelement (<vscale x 1 x i32> poison, i32 12, i32 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer)
+ %o = or <vscale x 1 x i32> %a, splat (i32 3)
+ %u = udiv <vscale x 1 x i32> %o, splat (i32 12)
ret <vscale x 1 x i32> %u
}
diff --git a/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll
index 81f0a966dbc9..ef085d3e7b50 100644
--- a/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll
@@ -1468,14 +1468,14 @@ define <4 x i32> @splat_assoc_add(<4 x i32> %x, <4 x i32> %y) {
define <vscale x 4 x i32> @vsplat_assoc_add(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
; CHECK-LABEL: @vsplat_assoc_add(
-; CHECK-NEXT: [[TMP1:%.*]] = add <vscale x 4 x i32> [[X:%.*]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 317426, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP1:%.*]] = add <vscale x 4 x i32> [[X:%.*]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 317426, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[R:%.*]] = add <vscale x 4 x i32> [[TMP2]], [[Y:%.*]]
; CHECK-NEXT: ret <vscale x 4 x i32> [[R]]
;
%splatx = shufflevector <vscale x 4 x i32> %x, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
- %a = add <vscale x 4 x i32> %y, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 317426, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer)
+ %a = add <vscale x 4 x i32> %y, splat (i32 317426)
%r = add <vscale x 4 x i32> %splatx, %a
ret <vscale x 4 x i32> %r
}
diff --git a/llvm/test/Transforms/InstCombine/vec_shuffle.ll b/llvm/test/Transforms/InstCombine/vec_shuffle.ll
index 250a175ad0eb..919e30f672e4 100644
--- a/llvm/test/Transforms/InstCombine/vec_shuffle.ll
+++ b/llvm/test/Transforms/InstCombine/vec_shuffle.ll
@@ -1478,14 +1478,14 @@ define <4 x i32> @splat_assoc_add(<4 x i32> %x, <4 x i32> %y) {
define <vscale x 4 x i32> @vsplat_assoc_add(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
; CHECK-LABEL: @vsplat_assoc_add(
-; CHECK-NEXT: [[TMP1:%.*]] = add <vscale x 4 x i32> [[X:%.*]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 317426, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP1:%.*]] = add <vscale x 4 x i32> [[X:%.*]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 317426, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[R:%.*]] = add <vscale x 4 x i32> [[TMP2]], [[Y:%.*]]
; CHECK-NEXT: ret <vscale x 4 x i32> [[R]]
;
%splatx = shufflevector <vscale x 4 x i32> %x, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
- %a = add <vscale x 4 x i32> %y, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 317426, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer)
+ %a = add <vscale x 4 x i32> %y, splat (i32 317426)
%r = add <vscale x 4 x i32> %splatx, %a
ret <vscale x 4 x i32> %r
}
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll
index 63b9e12abfde..e1c46a653f8d 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll
@@ -45,7 +45,7 @@ define <vscale x 4 x i32> @sub_splat() {
; CHECK-LABEL: @sub_splat(
; CHECK-NEXT: ret <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -16, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
;
- %r = sub <vscale x 4 x i32> zeroinitializer, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 16, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer)
+ %r = sub <vscale x 4 x i32> zeroinitializer, splat (i32 16)
ret <vscale x 4 x i32> %r
}
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vscale.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vscale.ll
index 585e736650b7..8ace257a8cae 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/vscale.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vscale.ll
@@ -45,7 +45,7 @@ define <vscale x 4 x i32> @sub_splat() {
; CHECK-LABEL: @sub_splat(
; CHECK-NEXT: ret <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -16, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
;
- %r = sub <vscale x 4 x i32> zeroinitializer, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 16, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer)
+ %r = sub <vscale x 4 x i32> zeroinitializer, splat (i32 16)
ret <vscale x 4 x i32> %r
}
diff --git a/llvm/test/Transforms/InstSimplify/cmp-vec-fast-path.ll b/llvm/test/Transforms/InstSimplify/cmp-vec-fast-path.ll
index a07e6fae5d4f..169ecc3c2d37 100644
--- a/llvm/test/Transforms/InstSimplify/cmp-vec-fast-path.ll
+++ b/llvm/test/Transforms/InstSimplify/cmp-vec-fast-path.ll
@@ -31,7 +31,7 @@ define <vscale x 2 x i1> @i32cmp_eq_scalable_one() {
; CHECK-LABEL: @i32cmp_eq_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
;
- %res = icmp eq <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> undef, i32 1, i32 0), <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> undef, i32 1, i32 0), <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = icmp eq <vscale x 2 x i32> splat (i32 1), splat (i32 1)
ret <vscale x 2 x i1> %res
}
@@ -63,7 +63,7 @@ define <vscale x 2 x i1> @i32cmp_ne_scalable_one() {
; CHECK-LABEL: @i32cmp_ne_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> zeroinitializer
;
- %res = icmp ne <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> undef, i32 1, i32 0), <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> undef, i32 1, i32 0), <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = icmp ne <vscale x 2 x i32> splat (i32 1), splat (i32 1)
ret <vscale x 2 x i1> %res
}
@@ -95,7 +95,7 @@ define <vscale x 2 x i1> @i32cmp_ugt_scalable_one() {
; CHECK-LABEL: @i32cmp_ugt_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> zeroinitializer
;
- %res = icmp ugt <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> undef, i32 1, i32 0), <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> undef, i32 1, i32 0), <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = icmp ugt <vscale x 2 x i32> splat (i32 1), splat (i32 1)
ret <vscale x 2 x i1> %res
}
@@ -127,7 +127,7 @@ define <vscale x 2 x i1> @i32cmp_uge_scalable_one() {
; CHECK-LABEL: @i32cmp_uge_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
;
- %res = icmp uge <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> undef, i32 1, i32 0), <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> undef, i32 1, i32 0), <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = icmp uge <vscale x 2 x i32> splat (i32 1), splat (i32 1)
ret <vscale x 2 x i1> %res
}
@@ -159,7 +159,7 @@ define <vscale x 2 x i1> @i32cmp_ult_scalable_one() {
; CHECK-LABEL: @i32cmp_ult_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> zeroinitializer
;
- %res = icmp ult <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> undef, i32 1, i32 0), <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> undef, i32 1, i32 0), <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = icmp ult <vscale x 2 x i32> splat (i32 1), splat (i32 1)
ret <vscale x 2 x i1> %res
}
@@ -191,7 +191,7 @@ define <vscale x 2 x i1> @i32cmp_ule_scalable_one() {
; CHECK-LABEL: @i32cmp_ule_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
;
- %res = icmp ule <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> undef, i32 1, i32 0), <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> undef, i32 1, i32 0), <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = icmp ule <vscale x 2 x i32> splat (i32 1), splat (i32 1)
ret <vscale x 2 x i1> %res
}
@@ -223,7 +223,7 @@ define <vscale x 2 x i1> @i32cmp_sgt_scalable_one() {
; CHECK-LABEL: @i32cmp_sgt_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> zeroinitializer
;
- %res = icmp sgt <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> undef, i32 1, i32 0), <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> undef, i32 1, i32 0), <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = icmp sgt <vscale x 2 x i32> splat (i32 1), splat (i32 1)
ret <vscale x 2 x i1> %res
}
@@ -255,7 +255,7 @@ define <vscale x 2 x i1> @i32cmp_sge_scalable_one() {
; CHECK-LABEL: @i32cmp_sge_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
;
- %res = icmp sge <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> undef, i32 1, i32 0), <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> undef, i32 1, i32 0), <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = icmp sge <vscale x 2 x i32> splat (i32 1), splat (i32 1)
ret <vscale x 2 x i1> %res
}
@@ -287,7 +287,7 @@ define <vscale x 2 x i1> @i32cmp_slt_scalable_one() {
; CHECK-LABEL: @i32cmp_slt_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> zeroinitializer
;
- %res = icmp slt <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> undef, i32 1, i32 0), <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> undef, i32 1, i32 0), <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = icmp slt <vscale x 2 x i32> splat (i32 1), splat (i32 1)
ret <vscale x 2 x i1> %res
}
@@ -319,7 +319,7 @@ define <vscale x 2 x i1> @i32cmp_sle_scalable_one() {
; CHECK-LABEL: @i32cmp_sle_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
;
- %res = icmp sle <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> undef, i32 1, i32 0), <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> undef, i32 1, i32 0), <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = icmp sle <vscale x 2 x i32> splat (i32 1), splat (i32 1)
ret <vscale x 2 x i1> %res
}
@@ -351,7 +351,7 @@ define <vscale x 2 x i1> @floatcmp_false_scalable_one() {
; CHECK-LABEL: @floatcmp_false_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> zeroinitializer
;
- %res = fcmp false <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = fcmp false <vscale x 2 x float> splat (float 1.0), splat (float 1.0)
ret <vscale x 2 x i1> %res
}
@@ -383,7 +383,7 @@ define <vscale x 2 x i1> @floatcmp_oeq_scalable_one() {
; CHECK-LABEL: @floatcmp_oeq_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
;
- %res = fcmp oeq <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = fcmp oeq <vscale x 2 x float> splat (float 1.0), splat (float 1.0)
ret <vscale x 2 x i1> %res
}
@@ -415,7 +415,7 @@ define <vscale x 2 x i1> @floatcmp_ogt_scalable_one() {
; CHECK-LABEL: @floatcmp_ogt_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> zeroinitializer
;
- %res = fcmp ogt <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = fcmp ogt <vscale x 2 x float> splat (float 1.0), splat (float 1.0)
ret <vscale x 2 x i1> %res
}
@@ -447,7 +447,7 @@ define <vscale x 2 x i1> @floatcmp_oge_scalable_one() {
; CHECK-LABEL: @floatcmp_oge_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
;
- %res = fcmp oge <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = fcmp oge <vscale x 2 x float> splat (float 1.0), splat (float 1.0)
ret <vscale x 2 x i1> %res
}
@@ -479,7 +479,7 @@ define <vscale x 2 x i1> @floatcmp_olt_scalable_one() {
; CHECK-LABEL: @floatcmp_olt_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> zeroinitializer
;
- %res = fcmp olt <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = fcmp olt <vscale x 2 x float> splat (float 1.0), splat (float 1.0)
ret <vscale x 2 x i1> %res
}
@@ -511,7 +511,7 @@ define <vscale x 2 x i1> @floatcmp_ole_scalable_one() {
; CHECK-LABEL: @floatcmp_ole_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
;
- %res = fcmp ole <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = fcmp ole <vscale x 2 x float> splat (float 1.0), splat (float 1.0)
ret <vscale x 2 x i1> %res
}
@@ -543,7 +543,7 @@ define <vscale x 2 x i1> @floatcmp_one_scalable_one() {
; CHECK-LABEL: @floatcmp_one_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> zeroinitializer
;
- %res = fcmp one <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = fcmp one <vscale x 2 x float> splat (float 1.0), splat (float 1.0)
ret <vscale x 2 x i1> %res
}
@@ -575,7 +575,7 @@ define <vscale x 2 x i1> @floatcmp_ord_scalable_one() {
; CHECK-LABEL: @floatcmp_ord_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
;
- %res = fcmp ord <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = fcmp ord <vscale x 2 x float> splat (float 1.0), splat (float 1.0)
ret <vscale x 2 x i1> %res
}
@@ -607,7 +607,7 @@ define <vscale x 2 x i1> @floatcmp_ueq_scalable_one() {
; CHECK-LABEL: @floatcmp_ueq_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
;
- %res = fcmp ueq <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = fcmp ueq <vscale x 2 x float> splat (float 1.0), splat (float 1.0)
ret <vscale x 2 x i1> %res
}
@@ -639,7 +639,7 @@ define <vscale x 2 x i1> @floatcmp_ugt_scalable_one() {
; CHECK-LABEL: @floatcmp_ugt_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> zeroinitializer
;
- %res = fcmp ugt <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = fcmp ugt <vscale x 2 x float> splat (float 1.0), splat (float 1.0)
ret <vscale x 2 x i1> %res
}
@@ -671,7 +671,7 @@ define <vscale x 2 x i1> @floatcmp_uge_scalable_one() {
; CHECK-LABEL: @floatcmp_uge_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
;
- %res = fcmp uge <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = fcmp uge <vscale x 2 x float> splat (float 1.0), splat (float 1.0)
ret <vscale x 2 x i1> %res
}
@@ -703,7 +703,7 @@ define <vscale x 2 x i1> @floatcmp_ult_scalable_one() {
; CHECK-LABEL: @floatcmp_ult_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> zeroinitializer
;
- %res = fcmp ult <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = fcmp ult <vscale x 2 x float> splat (float 1.0), splat (float 1.0)
ret <vscale x 2 x i1> %res
}
@@ -735,7 +735,7 @@ define <vscale x 2 x i1> @floatcmp_ule_scalable_one() {
; CHECK-LABEL: @floatcmp_ule_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
;
- %res = fcmp ule <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = fcmp ule <vscale x 2 x float> splat (float 1.0), splat (float 1.0)
ret <vscale x 2 x i1> %res
}
@@ -767,7 +767,7 @@ define <vscale x 2 x i1> @floatcmp_une_scalable_one() {
; CHECK-LABEL: @floatcmp_une_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> zeroinitializer
;
- %res = fcmp une <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = fcmp une <vscale x 2 x float> splat (float 1.0), splat (float 1.0)
ret <vscale x 2 x i1> %res
}
@@ -799,7 +799,7 @@ define <vscale x 2 x i1> @floatcmp_uno_scalable_one() {
; CHECK-LABEL: @floatcmp_uno_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> zeroinitializer
;
- %res = fcmp uno <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = fcmp uno <vscale x 2 x float> splat (float 1.0), splat (float 1.0)
ret <vscale x 2 x i1> %res
}
@@ -831,7 +831,7 @@ define <vscale x 2 x i1> @floatcmp_true_scalable_one() {
; CHECK-LABEL: @floatcmp_true_scalable_one(
; CHECK-NEXT: ret <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
;
- %res = fcmp true <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer), shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 1.0, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer)
+ %res = fcmp true <vscale x 2 x float> splat (float 1.0), splat (float 1.0)
ret <vscale x 2 x i1> %res
}
diff --git a/llvm/test/Transforms/InstSimplify/fp-nan.ll b/llvm/test/Transforms/InstSimplify/fp-nan.ll
index fad7a0593e37..cb0bed379078 100644
--- a/llvm/test/Transforms/InstSimplify/fp-nan.ll
+++ b/llvm/test/Transforms/InstSimplify/fp-nan.ll
@@ -53,7 +53,7 @@ define <vscale x 1 x float> @fsub_nan_op1_scalable_vec_0(<vscale x 1 x float> %x
; CHECK-LABEL: @fsub_nan_op1_scalable_vec_0(
; CHECK-NEXT: ret <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 0x7FF9000000000000, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer)
;
- %r = fsub <vscale x 1 x float> %x, shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 0x7FF1000000000000, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer)
+ %r = fsub <vscale x 1 x float> %x, splat (float 0x7FF1000000000000)
ret <vscale x 1 x float> %r
}
@@ -61,7 +61,7 @@ define <vscale x 1 x float> @fsub_nan_op1_scalable_vec_1(<vscale x 1 x float> %x
; CHECK-LABEL: @fsub_nan_op1_scalable_vec_1(
; CHECK-NEXT: ret <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 0xFFF9000000000000, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer)
;
- %r = fsub <vscale x 1 x float> %x, shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 0xFFF1000000000000, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer)
+ %r = fsub <vscale x 1 x float> %x, splat (float 0xFFF1000000000000)
ret <vscale x 1 x float> %r
}
@@ -87,7 +87,7 @@ define <vscale x 1 x double> @fmul_nan_op0_scalable_vec_0(<vscale x 1 x double>
; CHECK-LABEL: @fmul_nan_op0_scalable_vec_0(
; CHECK-NEXT: ret <vscale x 1 x double> shufflevector (<vscale x 1 x double> insertelement (<vscale x 1 x double> poison, double 0xFFF8000000000001, i64 0), <vscale x 1 x double> poison, <vscale x 1 x i32> zeroinitializer)
;
- %r = fmul <vscale x 1 x double> shufflevector (<vscale x 1 x double> insertelement (<vscale x 1 x double> poison, double 0xFFF0000000000001, i64 0), <vscale x 1 x double> poison, <vscale x 1 x i32> zeroinitializer), %x
+ %r = fmul <vscale x 1 x double> splat (double 0xFFF0000000000001), %x
ret <vscale x 1 x double> %r
}
@@ -95,7 +95,7 @@ define <vscale x 1 x double> @fmul_nan_op0_scalable_vec_1(<vscale x 1 x double>
; CHECK-LABEL: @fmul_nan_op0_scalable_vec_1(
; CHECK-NEXT: ret <vscale x 1 x double> shufflevector (<vscale x 1 x double> insertelement (<vscale x 1 x double> poison, double 0xFFF8DEADDEADDEAD, i64 0), <vscale x 1 x double> poison, <vscale x 1 x i32> zeroinitializer)
;
- %r = fmul <vscale x 1 x double> shufflevector (<vscale x 1 x double> insertelement (<vscale x 1 x double> poison, double 0xFFF0DEADDEADDEAD, i64 0), <vscale x 1 x double> poison, <vscale x 1 x i32> zeroinitializer), %x
+ %r = fmul <vscale x 1 x double> splat (double 0xFFF0DEADDEADDEAD), %x
ret <vscale x 1 x double> %r
}
@@ -113,7 +113,7 @@ define <vscale x 1 x double> @fmul_nan_op1_scalable_vec(<vscale x 1 x double> %x
; CHECK-LABEL: @fmul_nan_op1_scalable_vec(
; CHECK-NEXT: ret <vscale x 1 x double> shufflevector (<vscale x 1 x double> insertelement (<vscale x 1 x double> poison, double 0x7FF8000000000000, i64 0), <vscale x 1 x double> poison, <vscale x 1 x i32> zeroinitializer)
;
- %r = fmul <vscale x 1 x double> %x, shufflevector (<vscale x 1 x double> insertelement (<vscale x 1 x double> poison, double 0x7FF8000000000000, i64 0), <vscale x 1 x double> poison, <vscale x 1 x i32> zeroinitializer)
+ %r = fmul <vscale x 1 x double> %x, splat (double 0x7FF8000000000000)
ret <vscale x 1 x double> %r
}
@@ -131,7 +131,7 @@ define <vscale x 1 x double> @fdivl_nan_op0_scalable_vec(<vscale x 1 x double> %
; CHECK-LABEL: @fdivl_nan_op0_scalable_vec(
; CHECK-NEXT: ret <vscale x 1 x double> shufflevector (<vscale x 1 x double> insertelement (<vscale x 1 x double> poison, double 0xFFF800000000000F, i64 0), <vscale x 1 x double> poison, <vscale x 1 x i32> zeroinitializer)
;
- %r = fdiv <vscale x 1 x double> shufflevector (<vscale x 1 x double> insertelement (<vscale x 1 x double> poison, double 0xFFF800000000000F, i64 0), <vscale x 1 x double> poison, <vscale x 1 x i32> zeroinitializer), %x
+ %r = fdiv <vscale x 1 x double> splat (double 0xFFF800000000000F), %x
ret <vscale x 1 x double> %r
}
@@ -149,7 +149,7 @@ define <vscale x 1 x half> @fdiv_nan_op1_scalable_vec(<vscale x 1 x half> %x) {
; CHECK-LABEL: @fdiv_nan_op1_scalable_vec(
; CHECK-NEXT: ret <vscale x 1 x half> shufflevector (<vscale x 1 x half> insertelement (<vscale x 1 x half> poison, half 0xH7FFF, i64 0), <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer)
;
- %r = fdiv <vscale x 1 x half> %x, shufflevector (<vscale x 1 x half> insertelement (<vscale x 1 x half> poison, half 0xH7FFF, i64 0), <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer)
+ %r = fdiv <vscale x 1 x half> %x, splat (half 0xH7FFF)
ret <vscale x 1 x half> %r
}
@@ -221,7 +221,7 @@ define <vscale x 1 x double> @fneg_nan_2_scalable_vec() {
; CHECK-LABEL: @fneg_nan_2_scalable_vec(
; CHECK-NEXT: ret <vscale x 1 x double> shufflevector (<vscale x 1 x double> insertelement (<vscale x 1 x double> poison, double 0xFFF9234567890ABC, i64 0), <vscale x 1 x double> poison, <vscale x 1 x i32> zeroinitializer)
;
- %r = fsub <vscale x 1 x double> shufflevector (<vscale x 1 x double> insertelement (<vscale x 1 x double> poison, double -0.0, i64 0), <vscale x 1 x double> poison, <vscale x 1 x i32> zeroinitializer), shufflevector (<vscale x 1 x double> insertelement (<vscale x 1 x double> poison, double 0xFFF1234567890ABC, i64 0), <vscale x 1 x double> poison, <vscale x 1 x i32> zeroinitializer)
+ %r = fsub <vscale x 1 x double> splat (double -0.0), splat (double 0xFFF1234567890ABC)
ret <vscale x 1 x double> %r
}
@@ -239,7 +239,7 @@ define <vscale x 1 x double> @unary_fneg_nan_2_scalable_vec_0() {
; CHECK-NEXT: [[R:%.*]] = fneg <vscale x 1 x double> shufflevector (<vscale x 1 x double> insertelement (<vscale x 1 x double> poison, double 0xFFF1234567890ABC, i64 0), <vscale x 1 x double> poison, <vscale x 1 x i32> zeroinitializer)
; CHECK-NEXT: ret <vscale x 1 x double> [[R]]
;
- %r = fneg <vscale x 1 x double> shufflevector (<vscale x 1 x double> insertelement (<vscale x 1 x double> poison, double 0xFFF1234567890ABC, i64 0), <vscale x 1 x double> poison, <vscale x 1 x i32> zeroinitializer)
+ %r = fneg <vscale x 1 x double> splat (double 0xFFF1234567890ABC)
ret <vscale x 1 x double> %r
}
@@ -249,7 +249,7 @@ define <vscale x 1 x double> @unary_fneg_nan_2_scalable_vec_1() {
; CHECK-NEXT: [[R:%.*]] = fneg <vscale x 1 x double> shufflevector (<vscale x 1 x double> insertelement (<vscale x 1 x double> poison, double 0x7FF0000000000001, i64 0), <vscale x 1 x double> poison, <vscale x 1 x i32> zeroinitializer)
; CHECK-NEXT: ret <vscale x 1 x double> [[R]]
;
- %r = fneg <vscale x 1 x double> shufflevector (<vscale x 1 x double> insertelement (<vscale x 1 x double> poison, double 0x7FF0000000000001, i64 0), <vscale x 1 x double> poison, <vscale x 1 x i32> zeroinitializer)
+ %r = fneg <vscale x 1 x double> splat (double 0x7FF0000000000001)
ret <vscale x 1 x double> %r
}
diff --git a/llvm/test/Transforms/InstSimplify/shift.ll b/llvm/test/Transforms/InstSimplify/shift.ll
index 83751765df83..b562c3c164d5 100644
--- a/llvm/test/Transforms/InstSimplify/shift.ll
+++ b/llvm/test/Transforms/InstSimplify/shift.ll
@@ -347,7 +347,7 @@ define <vscale x 4 x i16> @lshr_scalable_overshift(<vscale x 4 x i16> %va) {
; CHECK-LABEL: @lshr_scalable_overshift(
; CHECK-NEXT: ret <vscale x 4 x i16> poison
;
- %vc = lshr <vscale x 4 x i16> %va, shufflevector (<vscale x 4 x i16> insertelement (<vscale x 4 x i16> poison, i16 16, i32 0), <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer)
+ %vc = lshr <vscale x 4 x i16> %va, splat (i16 16)
ret <vscale x 4 x i16> %vc
}
diff --git a/llvm/test/Transforms/InstSimplify/vscale-inseltpoison.ll b/llvm/test/Transforms/InstSimplify/vscale-inseltpoison.ll
index 5b34f417e409..bd9650bd2d08 100644
--- a/llvm/test/Transforms/InstSimplify/vscale-inseltpoison.ll
+++ b/llvm/test/Transforms/InstSimplify/vscale-inseltpoison.ll
@@ -134,7 +134,7 @@ define <vscale x 2 x i1> @cmp_le_smax_always_true(<vscale x 2 x i64> %x) {
; CHECK-LABEL: @cmp_le_smax_always_true(
; CHECK-NEXT: ret <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
;
- %cmp = icmp sle <vscale x 2 x i64> %x, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 9223372036854775807, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %cmp = icmp sle <vscale x 2 x i64> %x, splat (i64 9223372036854775807)
ret <vscale x 2 x i1> %cmp
}
diff --git a/llvm/test/Transforms/InstSimplify/vscale.ll b/llvm/test/Transforms/InstSimplify/vscale.ll
index 66f629aac3a5..768c5f4ba9ea 100644
--- a/llvm/test/Transforms/InstSimplify/vscale.ll
+++ b/llvm/test/Transforms/InstSimplify/vscale.ll
@@ -146,7 +146,7 @@ define <vscale x 2 x i1> @cmp_le_smax_always_true(<vscale x 2 x i64> %x) {
; CHECK-LABEL: @cmp_le_smax_always_true(
; CHECK-NEXT: ret <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
;
- %cmp = icmp sle <vscale x 2 x i64> %x, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 9223372036854775807, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %cmp = icmp sle <vscale x 2 x i64> %x, splat (i64 9223372036854775807)
ret <vscale x 2 x i1> %cmp
}
diff --git a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
index 2b9f777e9f3b..97ea2c6708da 100644
--- a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
+++ b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
@@ -1,3 +1,4 @@
+; REQUIRES: x86-registered-target
; RUN: opt -aa-pipeline=basic-aa -passes=loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S \
; RUN: < %s | FileCheck %s
@@ -18,6 +19,7 @@
; }
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
@B = common global ptr null, align 8
@A = common global ptr null, align 8
@@ -78,6 +80,7 @@ entry:
; VECTORIZE: mul <4 x i32>
+; VECTORIZE: mul <4 x i32>
; VECTORIZE-NOT: mul <4 x i32>
for.body: ; preds = %for.body, %entry
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
index 94f24fea3609..13fc0eaafb80 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
@@ -117,10 +117,10 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
; DATA_NO_LANEMASK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; DATA_NO_LANEMASK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; DATA_NO_LANEMASK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX]], 1
-; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; DATA_NO_LANEMASK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; DATA_NO_LANEMASK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT4]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; DATA_NO_LANEMASK-NEXT: br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll
index 7121c85dd59b..c12b3b122ba7 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -passes=loop-vectorize -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
-; RUN: opt < %s -passes='loop-vectorize' -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -S -mcpu=pwr9 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='loop-vectorize' -S -mcpu=pwr9 2>&1 | FileCheck %s
; CHECK-LABEL: vector.body
; CHECK: load double, ptr
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 1bcd7a2e009e..da6dc34e4096 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -120,7 +120,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in <stdin>
; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1
-; CHECK-NEXT: LV: Interleaving disabled by the pass manager
+; CHECK: LV: Interleaving disabled by the pass manager
; CHECK-NEXT: LV: Vectorizing: innermost loop.
;
entry:
@@ -260,7 +260,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in <stdin>
; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1
-; CHECK-NEXT: LV: Interleaving disabled by the pass manager
+; CHECK: LV: Interleaving disabled by the pass manager
; CHECK-NEXT: LV: Vectorizing: innermost loop.
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
index be83329d30fe..51d264820503 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -142,7 +142,7 @@ define void @example2(i32 %n, i32 %x) optsize {
; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_116]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT17]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY19:%.*]]
-; CHECK: vector.body19:
+; CHECK: vector.body17:
; CHECK-NEXT: [[INDEX20:%.*]] = phi i64 [ 0, [[VECTOR_PH9]] ], [ [[INDEX_NEXT31:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[I_0_LCSSA]], [[INDEX20]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX20]], i64 0
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
index 15e77f3a4847..c04178a1c13e 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
@@ -91,8 +91,6 @@ define void @test_chained_first_order_recurrences_3(ptr %ptr) {
; CHECK-NEXT: middle.block:
; CHECK-NEXT: No successors
; CHECK-NEXT: }
-
-; CHECK-NOT: vector.body:
;
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll b/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll
new file mode 100644
index 000000000000..9968c933494a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll
@@ -0,0 +1,159 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s
+
+; FIXME: The vectorizer should refuse to fold the tail by masking because
+; %conv is used outside of the loop. Test for this by checking that
+; %n.vec, the vector trip count, is rounded down to the next multiple of
+; 4. If folding the tail, it would have been rounded up instead.
+; Test case for #76069 (https://github.com/llvm/llvm-project/issues/76069).
+define i32 @test(ptr %arr, i64 %n) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[N]], 1
+; CHECK-NEXT: br i1 [[CMP1]], label [[PREHEADER:%.*]], label [[DONE:%.*]]
+; CHECK: preheader:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[N]], -2
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i8
+; CHECK-NEXT: [[TMP3:%.*]] = add i8 1, [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i8 [[TMP3]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[TMP1]], 255
+; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP1]] to i8
+; CHECK-NEXT: [[TMP8:%.*]] = add i8 2, [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ult i8 [[TMP8]], 2
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[TMP1]], 255
+; CHECK-NEXT: [[TMP11:%.*]] = or i1 [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP6]], [[TMP11]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END:%.*]] = add i64 1, [[N_VEC]]
+; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i8
+; CHECK-NEXT: [[IND_END1:%.*]] = add i8 1, [[DOTCAST]]
+; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE10:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1, i64 2, i64 3, i64 4>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE10]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT3]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT4]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP18:%.*]] = add nsw <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP17]], i32 0
+; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK: pred.store.if:
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP18]], i32 0
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP20]]
+; CHECK-NEXT: store i32 65, ptr [[TMP21]], align 4
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
+; CHECK: pred.store.continue:
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[TMP17]], i32 1
+; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; CHECK: pred.store.if5:
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP18]], i32 1
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP23]]
+; CHECK-NEXT: store i32 65, ptr [[TMP24]], align 4
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
+; CHECK: pred.store.continue6:
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i1> [[TMP17]], i32 2
+; CHECK-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
+; CHECK: pred.store.if7:
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP18]], i32 2
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP26]]
+; CHECK-NEXT: store i32 65, ptr [[TMP27]], align 4
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]]
+; CHECK: pred.store.continue8:
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP17]], i32 3
+; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10]]
+; CHECK: pred.store.if9:
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP18]], i32 3
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP29]]
+; CHECK-NEXT: store i32 65, ptr [[TMP30]], align 4
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]]
+; CHECK: pred.store.continue10:
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMO:%.*]] = sub i64 [[N_VEC]], 1
+; CHECK-NEXT: [[IND_ESCAPE:%.*]] = add i64 1, [[CMO]]
+; CHECK-NEXT: br i1 true, label [[LOAD_VAL:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ], [ 1, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i8 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ], [ 1, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[CONV:%.*]] = phi i64 [ [[CONV2:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[I:%.*]] = phi i8 [ [[INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[SUB:%.*]] = add nsw i64 [[CONV]], -1
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[SUB]]
+; CHECK-NEXT: store i32 65, ptr [[PTR]], align 4
+; CHECK-NEXT: [[INC]] = add i8 [[I]], 1
+; CHECK-NEXT: [[CONV2]] = zext i8 [[INC]] to i64
+; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i64 [[CONV2]], [[N]]
+; CHECK-NEXT: br i1 [[CMP2]], label [[LOOP]], label [[LOAD_VAL]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: load_val:
+; CHECK-NEXT: [[FINAL:%.*]] = phi i64 [ [[CONV]], [[LOOP]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[FINAL]]
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[PTR2]], align 4
+; CHECK-NEXT: br label [[DONE]]
+; CHECK: done:
+; CHECK-NEXT: [[VALUE:%.*]] = phi i32 [ [[VAL]], [[LOAD_VAL]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: ret i32 [[VALUE]]
+;
+entry:
+ %cmp1 = icmp ugt i64 %n, 1
+ br i1 %cmp1, label %preheader, label %done
+
+preheader:
+ br label %loop
+
+loop:
+ %conv = phi i64 [ %conv2, %loop ], [ 1, %preheader ]
+ %i = phi i8 [ %inc, %loop ], [ 1, %preheader ]
+ %sub = add nsw i64 %conv, -1
+ %ptr = getelementptr inbounds i32, ptr %arr, i64 %sub
+ store i32 65, ptr %ptr, align 4
+ %inc = add i8 %i, 1
+ %conv2 = zext i8 %inc to i64
+ %cmp2 = icmp ult i64 %conv2, %n
+ br i1 %cmp2, label %loop, label %load_val, !llvm.loop !0
+
+load_val:
+ %final = phi i64 [ %conv, %loop ]
+ %ptr2 = getelementptr inbounds i32, ptr %arr, i64 %final
+ %val = load i32, ptr %ptr2, align 4
+ br label %done
+
+done:
+ %value = phi i32 [ %val, %load_val ], [ 0, %entry ]
+ ret i32 %value
+
+}
+
+!0 = distinct !{!0, !1, !2, !3}
+!1 = !{!"llvm.loop.unroll.disable"}
+!2 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.unroll.disable"}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
new file mode 100644
index 000000000000..1dddbfe20a2e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
@@ -0,0 +1,90 @@
+; RUN: opt -passes=loop-vectorize -force-vector-width=8 -force-vector-interleave=2 -disable-output -debug -S %s 2>&1 | FileCheck --check-prefixes=CHECK %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; REQUIRES: asserts
+
+; Check if the vector loop condition can be simplified to true for a given
+; VF/IC combination.
+define void @test_tc_less_than_16(ptr %A, i64 %N) {
+; CHECK: LV: Scalarizing: %cmp =
+; CHECK-NEXT: VPlan 'Initial VPlan for VF={8},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ph:
+; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i4 (trunc i64 %N to i4) to i64)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
+; CHECK-NEXT: EMIT ir<%p.src> = WIDEN-POINTER-INDUCTION ir<%A>, 1
+; CHECK-NEXT: vp<[[VPTR:%.]]> = vector-pointer ir<%p.src>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VPTR]]>
+; CHECK-NEXT: WIDEN ir<%add> = add nsw ir<%l>, ir<10>
+; CHECK-NEXT: vp<[[VPTR2:%.+]]> = vector-pointer ir<%p.src>
+; CHECK-NEXT: WIDEN store vp<[[VPTR2]]>, ir<%add>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV:%.+]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+; CHECK: Executing best plan with VF=8, UF=2
+; CHECK-NEXT: VPlan 'Final VPlan for VF={8},UF={2}' {
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ph:
+; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i4 (trunc i64 %N to i4) to i64)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
+; CHECK-NEXT: EMIT ir<%p.src> = WIDEN-POINTER-INDUCTION ir<%A>, 1
+; CHECK-NEXT: vp<[[VPTR:%.]]> = vector-pointer ir<%p.src>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VPTR]]>
+; CHECK-NEXT: WIDEN ir<%add> = add nsw ir<%l>, ir<10>
+; CHECK-NEXT: vp<[[VPTR2:%.+]]> = vector-pointer ir<%p.src>
+; CHECK-NEXT: WIDEN store vp<[[VPTR2]]>, ir<%add>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV:%.+]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-cond ir<true>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ %and = and i64 %N, 15
+ br label %loop
+
+loop:
+ %iv = phi i64 [ %and, %entry ], [ %iv.next, %loop ]
+ %p.src = phi ptr [ %A, %entry ], [ %p.src.next, %loop ]
+ %p.src.next = getelementptr inbounds i8, ptr %p.src, i64 1
+ %l = load i8, ptr %p.src, align 1
+ %add = add nsw i8 %l, 10
+ store i8 %add, ptr %p.src
+ %iv.next = add nsw i64 %iv, -1
+ %cmp = icmp eq i64 %iv.next, 0
+ br i1 %cmp, label %exit, label %loop
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll b/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll
index adab2238ab5b..dee0af0cc09e 100644
--- a/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll
+++ b/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll
@@ -71,7 +71,7 @@ define void @callslotoptzn(<vscale x 4 x float> %val, ptr %out) {
; CHECK-NEXT: [[ALLOC:%.*]] = alloca <vscale x 4 x float>, align 16
; CHECK-NEXT: [[IDX:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
; CHECK-NEXT: [[STRIDE:%.*]] = getelementptr inbounds float, ptr [[ALLOC]], <vscale x 4 x i32> [[IDX]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> [[VAL:%.*]], <vscale x 4 x ptr> [[STRIDE]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> [[VAL:%.*]], <vscale x 4 x ptr> [[STRIDE]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: [[LI:%.*]] = load <vscale x 4 x float>, ptr [[ALLOC]], align 4
; CHECK-NEXT: store <vscale x 4 x float> [[LI]], ptr [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
@@ -79,7 +79,7 @@ define void @callslotoptzn(<vscale x 4 x float> %val, ptr %out) {
%alloc = alloca <vscale x 4 x float>, align 16
%idx = tail call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
%stride = getelementptr inbounds float, ptr %alloc, <vscale x 4 x i32> %idx
- call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> %val, <vscale x 4 x ptr> %stride, i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+ call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> %val, <vscale x 4 x ptr> %stride, i32 4, <vscale x 4 x i1> splat (i1 true))
%li = load <vscale x 4 x float>, ptr %alloc, align 4
store <vscale x 4 x float> %li, ptr %out, align 4
ret void
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/inlined3.ll b/llvm/test/Transforms/MemProfContextDisambiguation/inlined3.ll
new file mode 100644
index 000000000000..39a595897c37
--- /dev/null
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/inlined3.ll
@@ -0,0 +1,193 @@
+;; This test ensures that the logic which assigns calls to stack nodes
+;; correctly handles an inlined callsite with stack ids that partially
+;; overlap with a trimmed context. In particular when it also partially
+;; overlaps with a longer non-trimmed context that doesn't match all of
+;; the inlined callsite stack ids.
+
+;; The profile data and call stacks were all manually added, but the code
+;; would be structured something like the following (fairly contrived to
+;; result in the type of control flow needed to test):
+
+;; void A(bool b) {
+;; if (b)
+;; // cold: stack ids 6, 2, 8 (trimmed ids 10)
+;; // not cold: stack ids 6, 7 (trimmed ids 9, 11)
+;; new char[10]; // stack id 6
+;; else
+;; // not cold: stack ids 1, 2, 8, 3, 4
+;; // cold: stack ids 1, 2, 8, 3, 5
+;; new char[10]; // stack id 1
+;; }
+;;
+;; void XZ() {
+;; A(false); // stack ids 2, 8 (e.g. X inlined into Z)
+;; }
+;;
+;; void XZN() {
+;; // This is the tricky one to get right. We want to ensure it gets
+;; // correctly correlated with a stack node for the trimmed 6, 2, 8
+;; // context shown in A. It should *not* be correlated with the longer
+;; // untrimmed 1, 2, 8, 3, 4|5 contexts.
+;; A(true); // stack ids 2, 8, 9 (e.g. X inlined into Z inlined into N)
+;; }
+;;
+;; void Y() {
+;; A(true); // stack id 7
+;; }
+;;
+;; void M() {
+;; XZ(); // stack id 3
+;; }
+;;
+;; int main() {
+;; M(); // stack id 4 (leads to not cold allocation)
+;; M(); // stack id 5 (leads to cold allocation)
+;; XZN(); // stack id 11 (leads to cold allocation)
+;; Y(); // stack id 10 (leads to not cold allocation)
+;; }
+
+;; -stats requires asserts
+; REQUIRES: asserts
+
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes \
+; RUN: -stats -pass-remarks=memprof-context-disambiguation \
+; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=IR \
+; RUN: --check-prefix=STATS --check-prefix=REMARKS
+
+; REMARKS: created clone _Z1Ab.memprof.1
+; REMARKS: created clone _Z2XZv.memprof.1
+; REMARKS: created clone _Z1Mv.memprof.1
+;; Make sure the inlined context in _Z3XZNv, which partially overlaps
+;; trimmed cold context, and also partially overlaps completely
+;; unrelated contexts, correctly calls a cloned version of _Z1Ab,
+;; which will call the cold annotated allocation.
+; REMARKS: call in clone _Z3XZNv assigned to call function clone _Z1Ab.memprof.1
+; REMARKS: call in clone main assigned to call function clone _Z1Mv.memprof.1
+; REMARKS: call in clone _Z1Mv.memprof.1 assigned to call function clone _Z2XZv.memprof.1
+; REMARKS: call in clone _Z2XZv.memprof.1 assigned to call function clone _Z1Ab
+; REMARKS: call in clone main assigned to call function clone _Z1Mv
+; REMARKS: call in clone _Z1Mv assigned to call function clone _Z2XZv
+; REMARKS: call in clone _Z2XZv assigned to call function clone _Z1Ab.memprof.1
+; REMARKS: call in clone _Z1Ab.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone _Z1Yv assigned to call function clone _Z1Ab
+; REMARKS: call in clone _Z1Ab marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1Ab marked with memprof allocation attribute cold
+; REMARKS: call in clone _Z1Ab.memprof.1 marked with memprof allocation attribute notcold
+
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define dso_local void @_Z1Ab(i1 noundef zeroext %b) {
+entry:
+ br i1 %b, label %if.then, label %if.else
+
+if.then:
+ %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !5, !callsite !11
+ br label %if.end
+
+if.else:
+ %call2 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !0, !callsite !10
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+; Function Attrs: nobuiltin
+declare ptr @_Znam(i64) #0
+
+define dso_local void @_Z2XZv() local_unnamed_addr #0 {
+entry:
+ tail call void @_Z1Ab(i1 noundef zeroext false), !callsite !12
+ ret void
+}
+
+define dso_local void @_Z1Mv() local_unnamed_addr #0 {
+entry:
+ tail call void @_Z2XZv(), !callsite !19
+ ret void
+}
+
+define dso_local void @_Z3XZNv() local_unnamed_addr {
+entry:
+ tail call void @_Z1Ab(i1 noundef zeroext true), !callsite !15
+ ret void
+}
+
+define dso_local void @_Z1Yv() local_unnamed_addr {
+entry:
+ tail call void @_Z1Ab(i1 noundef zeroext true), !callsite !17
+ ret void
+}
+
+define dso_local noundef i32 @main() local_unnamed_addr {
+entry:
+ tail call void @_Z1Mv(), !callsite !13 ;; Not cold context
+ tail call void @_Z1Mv(), !callsite !14 ;; Cold context
+ tail call void @_Z3XZNv(), !callsite !16 ;; Cold context
+ tail call void @_Z1Yv(), !callsite !18 ;; Not cold context
+ ret i32 0
+}
+
+attributes #0 = { nobuiltin }
+attributes #7 = { builtin }
+
+!0 = !{!1, !3}
+;; Not cold context via first call to _Z1Mv in main
+!1 = !{!2, !"notcold"}
+!2 = !{i64 1, i64 2, i64 8, i64 3, i64 4}
+;; Cold context via second call to _Z1Mv in main
+!3 = !{!4, !"cold"}
+!4 = !{i64 1, i64 2, i64 8, i64 3, i64 5}
+!5 = !{!6, !8}
+;; Cold (trimmed) context via call to _Z3XZNv in main
+!6 = !{!7, !"cold"}
+!7 = !{i64 6, i64 2, i64 8}
+;; Not cold (trimmed) context via call to _Z1Yv in main
+!8 = !{!9, !"notcold"}
+!9 = !{i64 6, i64 7}
+!10 = !{i64 1}
+!11 = !{i64 6}
+!12 = !{i64 2, i64 8}
+!13 = !{i64 4}
+!14 = !{i64 5}
+;; Inlined context in _Z3XZNv, which includes part of trimmed cold context
+!15 = !{i64 2, i64 8, i64 9}
+!16 = !{i64 11}
+!17 = !{i64 7}
+!18 = !{i64 10}
+!19 = !{i64 3}
+
+; IR: define {{.*}} @_Z1Ab(i1 noundef zeroext %b)
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR: define {{.*}} @_Z2XZv()
+; IR: call {{.*}} @_Z1Ab.memprof.1(i1 noundef zeroext false)
+; IR: define {{.*}} @_Z1Mv()
+; IR: call {{.*}} @_Z2XZv()
+;; Make sure the inlined context in _Z3XZNv, which partially overlaps
+;; trimmed cold context, and also partially overlaps completely
+;; unrelated contexts, correctly calls the cloned version of _Z1Ab
+;; that will call the cold annotated allocation.
+; IR: define {{.*}} @_Z3XZNv()
+; IR: call {{.*}} @_Z1Ab.memprof.1(i1 noundef zeroext true)
+; IR: define {{.*}} @_Z1Yv()
+; IR: call {{.*}} @_Z1Ab(i1 noundef zeroext true)
+; IR: define {{.*}} @main()
+; IR: call {{.*}} @_Z1Mv()
+; IR: call {{.*}} @_Z1Mv.memprof.1()
+; IR: call {{.*}} @_Z3XZNv()
+; IR: call {{.*}} @_Z1Yv()
+; IR: define {{.*}} @_Z1Ab.memprof.1(i1 noundef zeroext %b)
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD]]
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
+; IR: define {{.*}} @_Z2XZv.memprof.1()
+; IR: call {{.*}} @_Z1Ab(i1 noundef zeroext false)
+; IR: define {{.*}} @_Z1Mv.memprof.1()
+; IR: call {{.*}} @_Z2XZv.memprof.1()
+
+; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
diff --git a/llvm/test/Transforms/PGOProfile/Inputs/thinlto_indirect_call_promotion.profraw b/llvm/test/Transforms/PGOProfile/Inputs/thinlto_indirect_call_promotion.profraw
index 5efda10bb98a..3daa98f937b6 100644
--- a/llvm/test/Transforms/PGOProfile/Inputs/thinlto_indirect_call_promotion.profraw
+++ b/llvm/test/Transforms/PGOProfile/Inputs/thinlto_indirect_call_promotion.profraw
Binary files differ
diff --git a/llvm/test/Transforms/PGOProfile/comdat_internal.ll b/llvm/test/Transforms/PGOProfile/comdat_internal.ll
index 8c6942c0f527..1bad0db1b476 100644
--- a/llvm/test/Transforms/PGOProfile/comdat_internal.ll
+++ b/llvm/test/Transforms/PGOProfile/comdat_internal.ll
@@ -13,9 +13,9 @@ $foo = comdat any
; CHECK: @__llvm_profile_raw_version = hidden constant i64 {{[0-9]+}}, comdat
; CHECK-NOT: __profn__stdin__foo
; CHECK: @__profc__stdin__foo.[[#FOO_HASH]] = private global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat, align 8
-; CHECK: @__profd__stdin__foo.[[#FOO_HASH]] = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 [[#FOO_HASH]], i64 sub (i64 ptrtoint (ptr @__profc__stdin__foo.742261418966908927 to i64), i64 ptrtoint (ptr @__profd__stdin__foo.742261418966908927 to i64)), i64 0, ptr null
+; CHECK: @__profd__stdin__foo.[[#FOO_HASH]] = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 [[#FOO_HASH]], i64 sub (i64 ptrtoint (ptr @__profc__stdin__foo.742261418966908927 to i64), i64 ptrtoint (ptr @__profd__stdin__foo.742261418966908927 to i64)), i64 0, ptr null
; CHECK-NOT: @foo
-; CHECK-SAME: , ptr null, i32 1, [2 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc__stdin__foo.[[#FOO_HASH]]), align 8
+; CHECK-SAME: , ptr null, i32 1, [3 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc__stdin__foo.[[#FOO_HASH]]), align 8
; CHECK: @__llvm_prf_nm
; CHECK: @llvm.compiler.used
diff --git a/llvm/test/Transforms/SROA/phi-and-select.ll b/llvm/test/Transforms/SROA/phi-and-select.ll
index 54cfb10793a1..7c8b27c9de9c 100644
--- a/llvm/test/Transforms/SROA/phi-and-select.ll
+++ b/llvm/test/Transforms/SROA/phi-and-select.ll
@@ -114,13 +114,13 @@ define i32 @test3(i32 %x) {
; CHECK-LABEL: @test3(
; CHECK-NEXT: entry:
; CHECK-NEXT: switch i32 [[X:%.*]], label [[BB0:%.*]] [
-; CHECK-NEXT: i32 1, label [[BB1:%.*]]
-; CHECK-NEXT: i32 2, label [[BB2:%.*]]
-; CHECK-NEXT: i32 3, label [[BB3:%.*]]
-; CHECK-NEXT: i32 4, label [[BB4:%.*]]
-; CHECK-NEXT: i32 5, label [[BB5:%.*]]
-; CHECK-NEXT: i32 6, label [[BB6:%.*]]
-; CHECK-NEXT: i32 7, label [[BB7:%.*]]
+; CHECK-NEXT: i32 1, label [[BB1:%.*]]
+; CHECK-NEXT: i32 2, label [[BB2:%.*]]
+; CHECK-NEXT: i32 3, label [[BB3:%.*]]
+; CHECK-NEXT: i32 4, label [[BB4:%.*]]
+; CHECK-NEXT: i32 5, label [[BB5:%.*]]
+; CHECK-NEXT: i32 6, label [[BB6:%.*]]
+; CHECK-NEXT: i32 7, label [[BB7:%.*]]
; CHECK-NEXT: ]
; CHECK: bb0:
; CHECK-NEXT: br label [[EXIT:%.*]]
@@ -733,6 +733,7 @@ define void @PR20822(i1 %c1, i1 %c2, ptr %ptr) {
; CHECK-LABEL: @PR20822(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[F_SROA_0:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[F1_SROA_GEP:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[PTR:%.*]], i32 0, i32 0
; CHECK-NEXT: br i1 [[C1:%.*]], label [[IF_END:%.*]], label [[FOR_COND:%.*]]
; CHECK: for.cond:
; CHECK-NEXT: br label [[IF_END]]
@@ -742,9 +743,8 @@ define void @PR20822(i1 %c1, i1 %c2, ptr %ptr) {
; CHECK: if.then2:
; CHECK-NEXT: br label [[IF_THEN5]]
; CHECK: if.then5:
-; CHECK-NEXT: [[F1:%.*]] = phi ptr [ [[PTR:%.*]], [[IF_THEN2]] ], [ [[F_SROA_0]], [[IF_END]] ]
-; CHECK-NEXT: [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[F1]], i32 0, i32 0
-; CHECK-NEXT: store i32 0, ptr [[DOTFCA_0_GEP]], align 4
+; CHECK-NEXT: [[F1_SROA_PHI:%.*]] = phi ptr [ [[F1_SROA_GEP]], [[IF_THEN2]] ], [ [[F_SROA_0]], [[IF_END]] ]
+; CHECK-NEXT: store i32 0, ptr [[F1_SROA_PHI]], align 4
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SROA/phi-gep.ll b/llvm/test/Transforms/SROA/phi-gep.ll
index c5aa1cdd9cf6..78071dcdafb4 100644
--- a/llvm/test/Transforms/SROA/phi-gep.ll
+++ b/llvm/test/Transforms/SROA/phi-gep.ll
@@ -65,15 +65,13 @@ end:
define i32 @test_sroa_phi_gep_poison(i1 %cond) {
; CHECK-LABEL: @test_sroa_phi_gep_poison(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A:%.*]] = alloca [[PAIR:%.*]], align 4
; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[END:%.*]]
; CHECK: if.then:
+; CHECK-NEXT: [[PHI_SROA_PHI_SROA_SPECULATE_LOAD_IF_THEN:%.*]] = load i32, ptr poison, align 4
; CHECK-NEXT: br label [[END]]
; CHECK: end:
-; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ poison, [[IF_THEN]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [[PAIR]], ptr [[PHI]], i32 0, i32 1
-; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4
-; CHECK-NEXT: ret i32 [[LOAD]]
+; CHECK-NEXT: [[PHI_SROA_PHI_SROA_SPECULATED:%.*]] = phi i32 [ undef, [[ENTRY:%.*]] ], [ [[PHI_SROA_PHI_SROA_SPECULATE_LOAD_IF_THEN]], [[IF_THEN]] ]
+; CHECK-NEXT: ret i32 [[PHI_SROA_PHI_SROA_SPECULATED]]
;
entry:
%a = alloca %pair, align 4
@@ -94,17 +92,13 @@ end:
define i32 @test_sroa_phi_gep_global(i1 %cond) {
; CHECK-LABEL: @test_sroa_phi_gep_global(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A:%.*]] = alloca [[PAIR:%.*]], align 4
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds [[PAIR]], ptr [[A]], i32 0, i32 1
-; CHECK-NEXT: store i32 1, ptr [[GEP_A]], align 4
; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[END:%.*]]
; CHECK: if.then:
+; CHECK-NEXT: [[PHI_SROA_PHI_SROA_SPECULATE_LOAD_IF_THEN:%.*]] = load i32, ptr getelementptr inbounds ([[PAIR:%.*]], ptr @g, i32 0, i32 1), align 4
; CHECK-NEXT: br label [[END]]
; CHECK: end:
-; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ @g, [[IF_THEN]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [[PAIR]], ptr [[PHI]], i32 0, i32 1
-; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4
-; CHECK-NEXT: ret i32 [[LOAD]]
+; CHECK-NEXT: [[PHI_SROA_PHI_SROA_SPECULATED:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[PHI_SROA_PHI_SROA_SPECULATE_LOAD_IF_THEN]], [[IF_THEN]] ]
+; CHECK-NEXT: ret i32 [[PHI_SROA_PHI_SROA_SPECULATED]]
;
entry:
%a = alloca %pair, align 4
@@ -245,7 +239,7 @@ define i32 @test_sroa_invoke_phi_gep(i1 %cond) personality ptr @__gxx_personalit
; CHECK-NEXT: br i1 [[COND:%.*]], label [[CALL:%.*]], label [[END:%.*]]
; CHECK: call:
; CHECK-NEXT: [[B:%.*]] = invoke ptr @foo()
-; CHECK-NEXT: to label [[END]] unwind label [[INVOKE_CATCH:%.*]]
+; CHECK-NEXT: to label [[END]] unwind label [[INVOKE_CATCH:%.*]]
; CHECK: end:
; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[B]], [[CALL]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [[PAIR]], ptr [[PHI]], i32 0, i32 1
@@ -253,7 +247,7 @@ define i32 @test_sroa_invoke_phi_gep(i1 %cond) personality ptr @__gxx_personalit
; CHECK-NEXT: ret i32 [[LOAD]]
; CHECK: invoke_catch:
; CHECK-NEXT: [[RES:%.*]] = landingpad { ptr, i32 }
-; CHECK-NEXT: catch ptr null
+; CHECK-NEXT: catch ptr null
; CHECK-NEXT: ret i32 0
;
entry:
@@ -468,10 +462,10 @@ define i32 @test_sroa_phi_gep_multiple_values_from_same_block(i32 %arg) {
; CHECK-LABEL: @test_sroa_phi_gep_multiple_values_from_same_block(
; CHECK-NEXT: bb.1:
; CHECK-NEXT: switch i32 [[ARG:%.*]], label [[BB_3:%.*]] [
-; CHECK-NEXT: i32 1, label [[BB_2:%.*]]
-; CHECK-NEXT: i32 2, label [[BB_2]]
-; CHECK-NEXT: i32 3, label [[BB_4:%.*]]
-; CHECK-NEXT: i32 4, label [[BB_4]]
+; CHECK-NEXT: i32 1, label [[BB_2:%.*]]
+; CHECK-NEXT: i32 2, label [[BB_2]]
+; CHECK-NEXT: i32 3, label [[BB_4:%.*]]
+; CHECK-NEXT: i32 4, label [[BB_4]]
; CHECK-NEXT: ]
; CHECK: bb.2:
; CHECK-NEXT: br label [[BB_4]]
@@ -504,6 +498,117 @@ bb.4: ; preds = %bb.1, %bb.1, %bb
ret i32 %load
}
+define i64 @test_phi_idx_mem2reg_const(i1 %arg) {
+; CHECK-LABEL: @test_phi_idx_mem2reg_const(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: br i1 [[ARG:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: br label [[END:%.*]]
+; CHECK: bb2:
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[PHI_SROA_PHI_SROA_SPECULATED:%.*]] = phi i64 [ 2, [[BB1]] ], [ 3, [[BB2]] ]
+; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ 0, [[BB1]] ], [ 1, [[BB2]] ]
+; CHECK-NEXT: ret i64 [[PHI_SROA_PHI_SROA_SPECULATED]]
+;
+bb:
+ %alloca = alloca [2 x i64], align 8
+ %gep1 = getelementptr inbounds i64, ptr %alloca, i64 1
+ store i64 2, ptr %alloca
+ store i64 3, ptr %gep1
+ br i1 %arg, label %bb1, label %bb2
+
+bb1:
+ br label %end
+
+bb2:
+ br label %end
+
+end:
+ %phi = phi i64 [ 0, %bb1 ], [ 1, %bb2 ]
+ %getelementptr = getelementptr inbounds i64, ptr %alloca, i64 %phi
+ %load = load i64, ptr %getelementptr
+ ret i64 %load
+}
+
+define i64 @test_phi_idx_mem2reg_not_const(i1 %arg, i64 %idx) {
+; CHECK-LABEL: @test_phi_idx_mem2reg_not_const(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x i64], align 8
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[ALLOCA]], i64 1
+; CHECK-NEXT: store i64 2, ptr [[ALLOCA]], align 4
+; CHECK-NEXT: store i64 3, ptr [[GEP1]], align 4
+; CHECK-NEXT: br i1 [[ARG:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: br label [[END:%.*]]
+; CHECK: bb2:
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ 0, [[BB1]] ], [ [[IDX:%.*]], [[BB2]] ]
+; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr inbounds i64, ptr [[ALLOCA]], i64 [[PHI]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GETELEMENTPTR]], align 4
+; CHECK-NEXT: ret i64 [[LOAD]]
+;
+bb:
+ %alloca = alloca [2 x i64], align 8
+ %gep1 = getelementptr inbounds i64, ptr %alloca, i64 1
+ store i64 2, ptr %alloca
+ store i64 3, ptr %gep1
+ br i1 %arg, label %bb1, label %bb2
+
+bb1:
+ br label %end
+
+bb2:
+ br label %end
+
+end:
+ %phi = phi i64 [ 0, %bb1 ], [ %idx, %bb2 ]
+ %getelementptr = getelementptr inbounds i64, ptr %alloca, i64 %phi
+ %load = load i64, ptr %getelementptr
+ ret i64 %load
+}
+
+define i64 @test_phi_mem2reg_pointer_op_is_non_const_gep(i1 %arg, i64 %idx) {
+; CHECK-LABEL: @test_phi_mem2reg_pointer_op_is_non_const_gep(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x i64], align 8
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[ALLOCA]], i64 1
+; CHECK-NEXT: store i64 2, ptr [[ALLOCA]], align 4
+; CHECK-NEXT: store i64 3, ptr [[GEP1]], align 4
+; CHECK-NEXT: br i1 [[ARG:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: br label [[END:%.*]]
+; CHECK: bb2:
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ 0, [[BB1]] ], [ 1, [[BB2]] ]
+; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr inbounds i64, ptr [[ALLOCA]], i64 [[IDX:%.*]]
+; CHECK-NEXT: [[GETELEMENTPTR2:%.*]] = getelementptr inbounds i64, ptr [[GETELEMENTPTR]], i64 [[PHI]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GETELEMENTPTR]], align 4
+; CHECK-NEXT: ret i64 [[LOAD]]
+;
+bb:
+ %alloca = alloca [2 x i64], align 8
+ %gep1 = getelementptr inbounds i64, ptr %alloca, i64 1
+ store i64 2, ptr %alloca
+ store i64 3, ptr %gep1
+ br i1 %arg, label %bb1, label %bb2
+
+bb1:
+ br label %end
+
+bb2:
+ br label %end
+
+end:
+ %phi = phi i64 [ 0, %bb1 ], [ 1, %bb2 ]
+ %getelementptr = getelementptr inbounds i64, ptr %alloca, i64 %idx
+ %getelementptr2 = getelementptr inbounds i64, ptr %getelementptr, i64 %phi
+ %load = load i64, ptr %getelementptr
+ ret i64 %load
+}
+
declare ptr @foo()
declare i32 @__gxx_personality_v0(...)
diff --git a/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll b/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll
index e95aea4eb487..7dc0ba50c1f8 100644
--- a/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll
+++ b/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll
@@ -51,7 +51,7 @@ define <vscale x 1 x i64> @add_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.add.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.add.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
@@ -59,7 +59,7 @@ define <vscale x 1 x i64> @add_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.add.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.add.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -68,13 +68,13 @@ define <vscale x 1 x i64> @add_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <v
; ALL-LABEL: @add_nxv1i64_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.add.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.add.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.add.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.add.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -94,7 +94,7 @@ define <vscale x 1 x i64> @sub_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.sub.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.sub.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
@@ -102,7 +102,7 @@ define <vscale x 1 x i64> @sub_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.sub.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.sub.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -111,13 +111,13 @@ define <vscale x 1 x i64> @sub_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <v
; ALL-LABEL: @sub_nxv1i64_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.sub.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.sub.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.sub.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.sub.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -137,7 +137,7 @@ define <vscale x 1 x i64> @mul_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
@@ -145,7 +145,7 @@ define <vscale x 1 x i64> @mul_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -154,13 +154,13 @@ define <vscale x 1 x i64> @mul_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <v
; ALL-LABEL: @mul_nxv1i64_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -180,7 +180,7 @@ define <vscale x 1 x i64> @sdiv_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
@@ -188,7 +188,7 @@ define <vscale x 1 x i64> @sdiv_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -197,13 +197,13 @@ define <vscale x 1 x i64> @sdiv_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <
; ALL-LABEL: @sdiv_nxv1i64_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -244,7 +244,7 @@ define <vscale x 1 x i64> @udiv_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
@@ -252,7 +252,7 @@ define <vscale x 1 x i64> @udiv_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -261,13 +261,13 @@ define <vscale x 1 x i64> @udiv_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <
; ALL-LABEL: @udiv_nxv1i64_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -308,7 +308,7 @@ define <vscale x 1 x i64> @srem_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.srem.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.srem.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
@@ -316,7 +316,7 @@ define <vscale x 1 x i64> @srem_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.srem.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.srem.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -325,13 +325,13 @@ define <vscale x 1 x i64> @srem_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <
; ALL-LABEL: @srem_nxv1i64_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.srem.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.srem.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.srem.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.srem.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -372,7 +372,7 @@ define <vscale x 1 x i64> @urem_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.urem.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.urem.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
@@ -380,7 +380,7 @@ define <vscale x 1 x i64> @urem_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.urem.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.urem.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -389,13 +389,13 @@ define <vscale x 1 x i64> @urem_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <
; ALL-LABEL: @urem_nxv1i64_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.urem.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.urem.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.urem.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.urem.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -612,7 +612,7 @@ define <vscale x 1 x i64> @ashr_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.ashr.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.ashr.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
@@ -620,7 +620,7 @@ define <vscale x 1 x i64> @ashr_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.ashr.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.ashr.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -629,13 +629,13 @@ define <vscale x 1 x i64> @ashr_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <
; ALL-LABEL: @ashr_nxv1i64_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.ashr.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.ashr.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.ashr.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.ashr.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -655,7 +655,7 @@ define <vscale x 1 x i64> @lshr_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.lshr.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.lshr.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
@@ -663,7 +663,7 @@ define <vscale x 1 x i64> @lshr_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.lshr.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.lshr.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -672,13 +672,13 @@ define <vscale x 1 x i64> @lshr_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <
; ALL-LABEL: @lshr_nxv1i64_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.lshr.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.lshr.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.lshr.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.lshr.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -698,7 +698,7 @@ define <vscale x 1 x i64> @shl_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.shl.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.shl.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
@@ -706,7 +706,7 @@ define <vscale x 1 x i64> @shl_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.shl.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.shl.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -715,13 +715,13 @@ define <vscale x 1 x i64> @shl_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <v
; ALL-LABEL: @shl_nxv1i64_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.shl.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.shl.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.shl.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.shl.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -741,7 +741,7 @@ define <vscale x 1 x i64> @or_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y,
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.or.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.or.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
@@ -749,7 +749,7 @@ define <vscale x 1 x i64> @or_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y,
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.or.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.or.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -758,13 +758,13 @@ define <vscale x 1 x i64> @or_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <vs
; ALL-LABEL: @or_nxv1i64_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.or.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.or.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.or.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.or.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -784,7 +784,7 @@ define <vscale x 1 x i64> @and_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.and.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.and.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
@@ -792,7 +792,7 @@ define <vscale x 1 x i64> @and_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.and.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.and.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -801,13 +801,13 @@ define <vscale x 1 x i64> @and_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <v
; ALL-LABEL: @and_nxv1i64_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.and.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.and.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.and.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.and.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -827,7 +827,7 @@ define <vscale x 1 x i64> @xor_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.xor.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.xor.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
@@ -835,7 +835,7 @@ define <vscale x 1 x i64> @xor_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %y
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.xor.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.xor.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -844,13 +844,13 @@ define <vscale x 1 x i64> @xor_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <v
; ALL-LABEL: @xor_nxv1i64_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.xor.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.xor.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.xor.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.xor.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -870,7 +870,7 @@ define <vscale x 1 x i64> @smin_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.smin.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.smin.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
@@ -878,7 +878,7 @@ define <vscale x 1 x i64> @smin_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.smin.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.smin.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -887,13 +887,13 @@ define <vscale x 1 x i64> @smin_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <
; ALL-LABEL: @smin_nxv1i64_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.smin.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.smin.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.smin.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.smin.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -913,7 +913,7 @@ define <vscale x 1 x i64> @smax_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.smax.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.smax.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
@@ -921,7 +921,7 @@ define <vscale x 1 x i64> @smax_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.smax.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.smax.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -930,13 +930,13 @@ define <vscale x 1 x i64> @smax_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <
; ALL-LABEL: @smax_nxv1i64_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.smax.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.smax.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.smax.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.smax.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -956,7 +956,7 @@ define <vscale x 1 x i64> @umin_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.umin.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.umin.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
@@ -964,7 +964,7 @@ define <vscale x 1 x i64> @umin_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.umin.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.umin.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -984,7 +984,7 @@ define <vscale x 1 x i64> @umax_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.umax.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.umax.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
@@ -992,7 +992,7 @@ define <vscale x 1 x i64> @umax_nxv1i64_allonesmask(<vscale x 1 x i64> %x, i64 %
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.umax.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.umax.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -1001,13 +1001,13 @@ define <vscale x 1 x i64> @umax_nxv1i64_anymask(<vscale x 1 x i64> %x, i64 %y, <
; ALL-LABEL: @umax_nxv1i64_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x i64> [[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.umax.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.vp.umax.nxv1i64(<vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 42, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 1 x i64> [[TMP4]]
;
%1 = insertelement <vscale x 1 x i64> poison, i64 %y, i64 0
%2 = shufflevector <vscale x 1 x i64> %1, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x i64> @llvm.vp.umax.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> shufflevector(<vscale x 1 x i64> insertelement(<vscale x 1 x i64> poison, i64 42, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x i64> @llvm.vp.umax.nxv1i64(<vscale x 1 x i64> %2, <vscale x 1 x i64> splat (i64 42), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x i64> @llvm.vp.mul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x i64> %4
}
@@ -1027,7 +1027,7 @@ define <vscale x 1 x float> @fadd_nxv1f32_allonesmask(<vscale x 1 x float> %x, f
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x float> [[TMP4]]
;
@@ -1035,7 +1035,7 @@ define <vscale x 1 x float> @fadd_nxv1f32_allonesmask(<vscale x 1 x float> %x, f
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x float> poison, float %y, i64 0
%2 = shufflevector <vscale x 1 x float > %1, <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> shufflevector(<vscale x 1 x float> insertelement(<vscale x 1 x float> poison, float 42.0, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> splat (float 42.0), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> %x, <vscale x 1 x float> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x float> %4
}
@@ -1044,13 +1044,13 @@ define <vscale x 1 x float> @fadd_nxv1f32_anymask(<vscale x 1 x float> %x, float
; ALL-LABEL: @fadd_nxv1f32_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 1 x float> [[TMP4]]
;
%1 = insertelement <vscale x 1 x float> poison, float %y, i64 0
%2 = shufflevector <vscale x 1 x float> %1, <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> shufflevector(<vscale x 1 x float> insertelement(<vscale x 1 x float> poison, float 42.0, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> splat (float 42.0), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> %x, <vscale x 1 x float> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x float> %4
}
@@ -1070,7 +1070,7 @@ define <vscale x 1 x float> @fsub_nxv1f32_allonesmask(<vscale x 1 x float> %x, f
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fsub.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fsub.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x float> [[TMP4]]
;
@@ -1078,7 +1078,7 @@ define <vscale x 1 x float> @fsub_nxv1f32_allonesmask(<vscale x 1 x float> %x, f
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x float> poison, float %y, i64 0
%2 = shufflevector <vscale x 1 x float > %1, <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x float> @llvm.vp.fsub.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> shufflevector(<vscale x 1 x float> insertelement(<vscale x 1 x float> poison, float 42.0, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x float> @llvm.vp.fsub.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> splat (float 42.0), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> %x, <vscale x 1 x float> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x float> %4
}
@@ -1087,13 +1087,13 @@ define <vscale x 1 x float> @fsub_nxv1f32_anymask(<vscale x 1 x float> %x, float
; ALL-LABEL: @fsub_nxv1f32_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fsub.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fsub.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 1 x float> [[TMP4]]
;
%1 = insertelement <vscale x 1 x float> poison, float %y, i64 0
%2 = shufflevector <vscale x 1 x float> %1, <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x float> @llvm.vp.fsub.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> shufflevector(<vscale x 1 x float> insertelement(<vscale x 1 x float> poison, float 42.0, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x float> @llvm.vp.fsub.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> splat (float 42.0), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> %x, <vscale x 1 x float> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x float> %4
}
@@ -1113,7 +1113,7 @@ define <vscale x 1 x float> @fdiv_nxv1f32_allonesmask(<vscale x 1 x float> %x, f
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x float> [[TMP4]]
;
@@ -1121,7 +1121,7 @@ define <vscale x 1 x float> @fdiv_nxv1f32_allonesmask(<vscale x 1 x float> %x, f
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x float> poison, float %y, i64 0
%2 = shufflevector <vscale x 1 x float > %1, <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> shufflevector(<vscale x 1 x float> insertelement(<vscale x 1 x float> poison, float 42.0, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> splat (float 42.0), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> %x, <vscale x 1 x float> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x float> %4
}
@@ -1130,13 +1130,13 @@ define <vscale x 1 x float> @fdiv_nxv1f32_anymask(<vscale x 1 x float> %x, float
; ALL-LABEL: @fdiv_nxv1f32_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 1 x float> [[TMP4]]
;
%1 = insertelement <vscale x 1 x float> poison, float %y, i64 0
%2 = shufflevector <vscale x 1 x float> %1, <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> shufflevector(<vscale x 1 x float> insertelement(<vscale x 1 x float> poison, float 42.0, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> splat (float 42.0), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> %x, <vscale x 1 x float> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x float> %4
}
@@ -1156,7 +1156,7 @@ define <vscale x 1 x float> @frem_nxv1f32_allonesmask(<vscale x 1 x float> %x, f
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.frem.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.frem.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x float> [[TMP4]]
;
@@ -1164,7 +1164,7 @@ define <vscale x 1 x float> @frem_nxv1f32_allonesmask(<vscale x 1 x float> %x, f
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x float> poison, float %y, i64 0
%2 = shufflevector <vscale x 1 x float > %1, <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x float> @llvm.vp.frem.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> shufflevector(<vscale x 1 x float> insertelement(<vscale x 1 x float> poison, float 42.0, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x float> @llvm.vp.frem.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> splat (float 42.0), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> %x, <vscale x 1 x float> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x float> %4
}
@@ -1173,13 +1173,13 @@ define <vscale x 1 x float> @frem_nxv1f32_allonesmask(<vscale x 1 x float> %x, f
; ALL-LABEL: @frem_nxv1f32_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.frem.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.frem.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 1 x float> [[TMP4]]
;
%1 = insertelement <vscale x 1 x float> poison, float %y, i64 0
%2 = shufflevector <vscale x 1 x float> %1, <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x float> @llvm.vp.frem.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> shufflevector(<vscale x 1 x float> insertelement(<vscale x 1 x float> poison, float 42.0, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x float> @llvm.vp.frem.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> splat (float 42.0), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> %x, <vscale x 1 x float> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x float> %4
}
@@ -1199,7 +1199,7 @@ define <vscale x 1 x float> @fdiv_nxv1f32_allonesmask_knownvl(<vscale x 1 x floa
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 4)
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 4)
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 4)
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x float> [[TMP4]]
;
@@ -1207,7 +1207,7 @@ define <vscale x 1 x float> @fdiv_nxv1f32_allonesmask_knownvl(<vscale x 1 x floa
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x float> poison, float %y, i64 0
%2 = shufflevector <vscale x 1 x float > %1, <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> shufflevector(<vscale x 1 x float> insertelement(<vscale x 1 x float> poison, float 42.0, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 4)
+ %3 = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> splat (float 42.0), <vscale x 1 x i1> %mask, i32 4)
%4 = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> %x, <vscale x 1 x float> %3, <vscale x 1 x i1> %mask, i32 4)
ret <vscale x 1 x float> %4
}
@@ -1216,13 +1216,13 @@ define <vscale x 1 x float> @fdiv_nxv1f32_anymask_knownvl(<vscale x 1 x float> %
; ALL-LABEL: @fdiv_nxv1f32_anymask_knownvl(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 4)
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 4)
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 4)
; ALL-NEXT: ret <vscale x 1 x float> [[TMP4]]
;
%1 = insertelement <vscale x 1 x float> poison, float %y, i64 0
%2 = shufflevector <vscale x 1 x float> %1, <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> shufflevector(<vscale x 1 x float> insertelement(<vscale x 1 x float> poison, float 42.0, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 4)
+ %3 = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> splat (float 42.0), <vscale x 1 x i1> %mask, i32 4)
%4 = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> %x, <vscale x 1 x float> %3, <vscale x 1 x i1> %mask, i32 4)
ret <vscale x 1 x float> %4
}
@@ -1242,7 +1242,7 @@ define <vscale x 1 x float> @frem_nxv1f32_allonesmask_knownvl(<vscale x 1 x floa
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.frem.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 4)
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.frem.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 4)
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 4)
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x float> [[TMP4]]
;
@@ -1250,7 +1250,7 @@ define <vscale x 1 x float> @frem_nxv1f32_allonesmask_knownvl(<vscale x 1 x floa
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x float> poison, float %y, i64 0
%2 = shufflevector <vscale x 1 x float > %1, <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x float> @llvm.vp.frem.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> shufflevector(<vscale x 1 x float> insertelement(<vscale x 1 x float> poison, float 42.0, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 4)
+ %3 = call <vscale x 1 x float> @llvm.vp.frem.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> splat (float 42.0), <vscale x 1 x i1> %mask, i32 4)
%4 = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> %x, <vscale x 1 x float> %3, <vscale x 1 x i1> %mask, i32 4)
ret <vscale x 1 x float> %4
}
@@ -1259,13 +1259,13 @@ define <vscale x 1 x float> @frem_nxv1f32_allonesmask_knownvl(<vscale x 1 x floa
; ALL-LABEL: @frem_nxv1f32_anymask_knownvl(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.frem.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 4)
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.frem.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 4)
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 4)
; ALL-NEXT: ret <vscale x 1 x float> [[TMP4]]
;
%1 = insertelement <vscale x 1 x float> poison, float %y, i64 0
%2 = shufflevector <vscale x 1 x float> %1, <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x float> @llvm.vp.frem.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> shufflevector(<vscale x 1 x float> insertelement(<vscale x 1 x float> poison, float 42.0, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 4)
+ %3 = call <vscale x 1 x float> @llvm.vp.frem.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> splat (float 42.0), <vscale x 1 x i1> %mask, i32 4)
%4 = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> %x, <vscale x 1 x float> %3, <vscale x 1 x i1> %mask, i32 4)
ret <vscale x 1 x float> %4
}
@@ -1285,7 +1285,7 @@ define <vscale x 1 x float> @copysign_nxv1f32_allonesmask(<vscale x 1 x float> %
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.copysign.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.copysign.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x float> [[TMP4]]
;
@@ -1293,7 +1293,7 @@ define <vscale x 1 x float> @copysign_nxv1f32_allonesmask(<vscale x 1 x float> %
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x float> poison, float %y, i64 0
%2 = shufflevector <vscale x 1 x float > %1, <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x float> @llvm.vp.copysign.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> shufflevector(<vscale x 1 x float> insertelement(<vscale x 1 x float> poison, float 42.0, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x float> @llvm.vp.copysign.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> splat (float 42.0), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> %x, <vscale x 1 x float> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x float> %4
}
@@ -1302,13 +1302,13 @@ define <vscale x 1 x float> @copysign_nxv1f32_anymask(<vscale x 1 x float> %x, f
; ALL-LABEL: @copysign_nxv1f32_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.copysign.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.copysign.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 1 x float> [[TMP4]]
;
%1 = insertelement <vscale x 1 x float> poison, float %y, i64 0
%2 = shufflevector <vscale x 1 x float> %1, <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x float> @llvm.vp.copysign.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> shufflevector(<vscale x 1 x float> insertelement(<vscale x 1 x float> poison, float 42.0, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x float> @llvm.vp.copysign.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> splat (float 42.0), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> %x, <vscale x 1 x float> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x float> %4
}
@@ -1328,7 +1328,7 @@ define <vscale x 1 x float> @minnum_nxv1f32_allonesmask(<vscale x 1 x float> %x,
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.minnum.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.minnum.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x float> [[TMP4]]
;
@@ -1336,7 +1336,7 @@ define <vscale x 1 x float> @minnum_nxv1f32_allonesmask(<vscale x 1 x float> %x,
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x float> poison, float %y, i64 0
%2 = shufflevector <vscale x 1 x float > %1, <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x float> @llvm.vp.minnum.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> shufflevector(<vscale x 1 x float> insertelement(<vscale x 1 x float> poison, float 42.0, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x float> @llvm.vp.minnum.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> splat (float 42.0), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> %x, <vscale x 1 x float> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x float> %4
}
@@ -1345,13 +1345,13 @@ define <vscale x 1 x float> @minnum_nxv1f32_anymask(<vscale x 1 x float> %x, flo
; ALL-LABEL: @minnum_nxv1f32_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.minnum.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.minnum.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 1 x float> [[TMP4]]
;
%1 = insertelement <vscale x 1 x float> poison, float %y, i64 0
%2 = shufflevector <vscale x 1 x float> %1, <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x float> @llvm.vp.minnum.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> shufflevector(<vscale x 1 x float> insertelement(<vscale x 1 x float> poison, float 42.0, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x float> @llvm.vp.minnum.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> splat (float 42.0), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> %x, <vscale x 1 x float> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x float> %4
}
@@ -1371,7 +1371,7 @@ define <vscale x 1 x float> @maxnum_nxv1f32_allonesmask(<vscale x 1 x float> %x,
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 1 x i1> [[SPLAT]], <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.maxnum.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.maxnum.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 1 x float> [[TMP4]]
;
@@ -1379,7 +1379,7 @@ define <vscale x 1 x float> @maxnum_nxv1f32_allonesmask(<vscale x 1 x float> %x,
%mask = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%1 = insertelement <vscale x 1 x float> poison, float %y, i64 0
%2 = shufflevector <vscale x 1 x float > %1, <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x float> @llvm.vp.maxnum.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> shufflevector(<vscale x 1 x float> insertelement(<vscale x 1 x float> poison, float 42.0, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x float> @llvm.vp.maxnum.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> splat (float 42.0), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> %x, <vscale x 1 x float> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x float> %4
}
@@ -1388,13 +1388,13 @@ define <vscale x 1 x float> @maxnum_nxv1f32_anymask(<vscale x 1 x float> %x, flo
; ALL-LABEL: @maxnum_nxv1f32_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 1 x float> poison, float [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 1 x float> [[TMP1]], <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.maxnum.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x float> @llvm.vp.maxnum.nxv1f32(<vscale x 1 x float> [[TMP2]], <vscale x 1 x float> shufflevector (<vscale x 1 x float> insertelement (<vscale x 1 x float> poison, float 4.200000e+01, i64 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> [[X:%.*]], <vscale x 1 x float> [[TMP3]], <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 1 x float> [[TMP4]]
;
%1 = insertelement <vscale x 1 x float> poison, float %y, i64 0
%2 = shufflevector <vscale x 1 x float> %1, <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer
- %3 = call <vscale x 1 x float> @llvm.vp.maxnum.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> shufflevector(<vscale x 1 x float> insertelement(<vscale x 1 x float> poison, float 42.0, i32 0), <vscale x 1 x float> poison, <vscale x 1 x i32> zeroinitializer), <vscale x 1 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 1 x float> @llvm.vp.maxnum.nxv1f32(<vscale x 1 x float> %2, <vscale x 1 x float> splat (float 42.0), <vscale x 1 x i1> %mask, i32 %evl)
%4 = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> %x, <vscale x 1 x float> %3, <vscale x 1 x i1> %mask, i32 %evl)
ret <vscale x 1 x float> %4
}
@@ -1414,7 +1414,7 @@ define <vscale x 8 x i8> @add_nxv8i8_allonesmask(<vscale x 8 x i8> %x, i8 %y, i3
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 8 x i1> [[SPLAT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 8 x i8> poison, i8 [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 8 x i8> [[TMP1]], <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8> [[TMP2]], <vscale x 8 x i8> shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 42, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8> [[TMP2]], <vscale x 8 x i8> shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 42, i64 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 8 x i8> @llvm.vp.mul.nxv8i8(<vscale x 8 x i8> [[X:%.*]], <vscale x 8 x i8> [[TMP3]], <vscale x 8 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 8 x i8> [[TMP4]]
;
@@ -1422,7 +1422,7 @@ define <vscale x 8 x i8> @add_nxv8i8_allonesmask(<vscale x 8 x i8> %x, i8 %y, i3
%mask = shufflevector <vscale x 8 x i1> %splat, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
%1 = insertelement <vscale x 8 x i8> poison, i8 %y, i64 0
%2 = shufflevector <vscale x 8 x i8> %1, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
- %3 = call <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8> %2, <vscale x 8 x i8> shufflevector(<vscale x 8 x i8> insertelement(<vscale x 8 x i8> poison, i8 42, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8> %2, <vscale x 8 x i8> splat (i8 42), <vscale x 8 x i1> %mask, i32 %evl)
%4 = call <vscale x 8 x i8> @llvm.vp.mul.nxv8i8(<vscale x 8 x i8> %x, <vscale x 8 x i8> %3, <vscale x 8 x i1> %mask, i32 %evl)
ret <vscale x 8 x i8> %4
}
@@ -1431,13 +1431,13 @@ define <vscale x 8 x i8> @add_nxv8i8_anymask(<vscale x 8 x i8> %x, i8 %y, <vscal
; ALL-LABEL: @add_nxv8i8_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 8 x i8> poison, i8 [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 8 x i8> [[TMP1]], <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8> [[TMP2]], <vscale x 8 x i8> shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 42, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8> [[TMP2]], <vscale x 8 x i8> shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 42, i64 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 8 x i8> @llvm.vp.mul.nxv8i8(<vscale x 8 x i8> [[X:%.*]], <vscale x 8 x i8> [[TMP3]], <vscale x 8 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 8 x i8> [[TMP4]]
;
%1 = insertelement <vscale x 8 x i8> poison, i8 %y, i64 0
%2 = shufflevector <vscale x 8 x i8> %1, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
- %3 = call <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8> %2, <vscale x 8 x i8> shufflevector(<vscale x 8 x i8> insertelement(<vscale x 8 x i8> poison, i8 42, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8> %2, <vscale x 8 x i8> splat (i8 42), <vscale x 8 x i1> %mask, i32 %evl)
%4 = call <vscale x 8 x i8> @llvm.vp.mul.nxv8i8(<vscale x 8 x i8> %x, <vscale x 8 x i8> %3, <vscale x 8 x i1> %mask, i32 %evl)
ret <vscale x 8 x i8> %4
}
@@ -1457,7 +1457,7 @@ define <vscale x 8 x half> @fadd_nxv1f16_allonesmask(<vscale x 8 x half> %x, hal
; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector <vscale x 8 x i1> [[SPLAT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement <vscale x 8 x half> poison, half [[Y:%.*]], i64 0
; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> [[TMP2]], <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH5140, i32 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> [[MASK]], i32 [[EVL:%.*]])
+; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> [[TMP2]], <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH5140, i64 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> [[MASK]], i32 [[EVL:%.*]])
; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> [[X:%.*]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x i1> [[MASK]], i32 [[EVL]])
; NO-VEC-COMBINE-NEXT: ret <vscale x 8 x half> [[TMP4]]
;
@@ -1465,7 +1465,7 @@ define <vscale x 8 x half> @fadd_nxv1f16_allonesmask(<vscale x 8 x half> %x, hal
%mask = shufflevector <vscale x 8 x i1> %splat, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
%1 = insertelement <vscale x 8 x half> poison, half %y, i64 0
%2 = shufflevector <vscale x 8 x half> %1, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
- %3 = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> %2, <vscale x 8 x half> shufflevector(<vscale x 8 x half> insertelement(<vscale x 8 x half> poison, half 42.0, i32 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> %2, <vscale x 8 x half> splat (half 42.0), <vscale x 8 x i1> %mask, i32 %evl)
%4 = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> %x, <vscale x 8 x half> %3, <vscale x 8 x i1> %mask, i32 %evl)
ret <vscale x 8 x half> %4
}
@@ -1474,13 +1474,13 @@ define <vscale x 8 x half> @fadd_nxv8f16_anymask(<vscale x 8 x half> %x, half %y
; ALL-LABEL: @fadd_nxv8f16_anymask(
; ALL-NEXT: [[TMP1:%.*]] = insertelement <vscale x 8 x half> poison, half [[Y:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = shufflevector <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
-; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> [[TMP2]], <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH5140, i32 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
+; ALL-NEXT: [[TMP3:%.*]] = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> [[TMP2]], <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH5140, i64 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> [[MASK:%.*]], i32 [[EVL:%.*]])
; ALL-NEXT: [[TMP4:%.*]] = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> [[X:%.*]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x i1> [[MASK]], i32 [[EVL]])
; ALL-NEXT: ret <vscale x 8 x half> [[TMP4]]
;
%1 = insertelement <vscale x 8 x half> poison, half %y, i64 0
%2 = shufflevector <vscale x 8 x half> %1, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
- %3 = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> %2, <vscale x 8 x half> shufflevector(<vscale x 8 x half> insertelement(<vscale x 8 x half> poison, half 42.0, i32 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i1> %mask, i32 %evl)
+ %3 = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> %2, <vscale x 8 x half> splat (half 42.0), <vscale x 8 x i1> %mask, i32 %evl)
%4 = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> %x, <vscale x 8 x half> %3, <vscale x 8 x i1> %mask, i32 %evl)
ret <vscale x 8 x half> %4
}
diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/loop-register.s b/llvm/test/tools/llvm-exegesis/X86/latency/loop-register.s
new file mode 100644
index 000000000000..81ca75251381
--- /dev/null
+++ b/llvm/test/tools/llvm-exegesis/X86/latency/loop-register.s
@@ -0,0 +1,12 @@
+# REQUIRES: exegesis-can-measure-latency, x86_64-linux
+
+# Test that specifying the loop register to use works as expected.
+
+# RUN: llvm-exegesis -mtriple=x86_64-unknown-unknown -mode=latency -snippets-file=%s | FileCheck %s
+
+# CHECK: measurements:
+
+# LLVM-EXEGESIS-DEFREG R11 ff
+# LLVM-EXEGESIS-LOOP-REGISTER R12
+
+addq $0xff, %r11
diff --git a/llvm/test/tools/llvm-mca/AArch64/Exynos/float-divide-multiply.s b/llvm/test/tools/llvm-mca/AArch64/Exynos/float-divide-multiply.s
index a24d8a279606..ecfd019452af 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Exynos/float-divide-multiply.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Exynos/float-divide-multiply.s
@@ -26,11 +26,11 @@ fsqrt d11, d12
# EM3-NEXT: Total uOps: 800
# EM4-NEXT: Instructions: 1200
-# EM4-NEXT: Total Cycles: 575
+# EM4-NEXT: Total Cycles: 572
# EM4-NEXT: Total uOps: 1200
# EM5-NEXT: Instructions: 1200
-# EM5-NEXT: Total Cycles: 433
+# EM5-NEXT: Total Cycles: 434
# EM5-NEXT: Total uOps: 1200
# ALL: Dispatch Width: 6
@@ -39,12 +39,12 @@ fsqrt d11, d12
# EM3-NEXT: IPC: 0.18
# EM3-NEXT: Block RThroughput: 45.0
-# EM4-NEXT: uOps Per Cycle: 2.09
-# EM4-NEXT: IPC: 2.09
+# EM4-NEXT: uOps Per Cycle: 2.10
+# EM4-NEXT: IPC: 2.10
# EM4-NEXT: Block RThroughput: 4.0
-# EM5-NEXT: uOps Per Cycle: 2.77
-# EM5-NEXT: IPC: 2.77
+# EM5-NEXT: uOps Per Cycle: 2.76
+# EM5-NEXT: IPC: 2.76
# EM5-NEXT: Block RThroughput: 4.0
# ALL: Instruction Info:
diff --git a/llvm/test/tools/llvm-objcopy/ELF/set-symbol-visibility.test b/llvm/test/tools/llvm-objcopy/ELF/set-symbol-visibility.test
new file mode 100644
index 000000000000..de30ee09bfda
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/ELF/set-symbol-visibility.test
@@ -0,0 +1,311 @@
+# RUN: yaml2obj --docnum=1 %s -o %t.o
+# RUN: echo '.*' > %t.symbols.regex
+
+## Check that the visibility of all symbols is properly set to DEFAULT.
+# RUN: llvm-objcopy %t.o %t0.o --set-symbols-visibility=%t.symbols.regex=default --regex
+# RUN: llvm-readelf -s %t0.o | FileCheck %s --check-prefix=DEF
+
+# DEF-DAG: DEFAULT 1 default_local
+# DEF-DAG: DEFAULT 1 internal_local
+# DEF-DAG: DEFAULT 1 hidden_local
+# DEF-DAG: DEFAULT 1 protected_local
+# DEF-DAG: DEFAULT 1 default_global
+# DEF-DAG: DEFAULT 1 default_weak
+# DEF-DAG: DEFAULT 1 internal_global
+# DEF-DAG: DEFAULT 1 internal_weak
+# DEF-DAG: DEFAULT 1 hidden_global
+# DEF-DAG: DEFAULT 1 hidden_weak
+# DEF-DAG: DEFAULT 1 protected_global
+# DEF-DAG: DEFAULT 1 protected_weak
+
+## Check that the visibility of all symbols is properly set to HIDDEN.
+# RUN: llvm-objcopy %t.o %t1.o --set-symbols-visibility=%t.symbols.regex=hidden --regex
+# RUN: llvm-readelf -s %t1.o | FileCheck %s --check-prefix=HID
+
+# HID-DAG: HIDDEN 1 default_local
+# HID-DAG: HIDDEN 1 internal_local
+# HID-DAG: HIDDEN 1 hidden_local
+# HID-DAG: HIDDEN 1 protected_local
+# HID-DAG: HIDDEN 1 default_global
+# HID-DAG: HIDDEN 1 default_weak
+# HID-DAG: HIDDEN 1 internal_global
+# HID-DAG: HIDDEN 1 internal_weak
+# HID-DAG: HIDDEN 1 hidden_global
+# HID-DAG: HIDDEN 1 hidden_weak
+# HID-DAG: HIDDEN 1 protected_global
+# HID-DAG: HIDDEN 1 protected_weak
+
+## Check that the visibility of all symbols is properly set to PROTECTED.
+# RUN: llvm-objcopy %t.o %t2.o --set-symbols-visibility=%t.symbols.regex=protected --regex
+# RUN: llvm-readelf -s %t2.o | FileCheck %s --check-prefix=PRO
+
+# PRO-DAG: PROTECTED 1 default_local
+# PRO-DAG: PROTECTED 1 internal_local
+# PRO-DAG: PROTECTED 1 hidden_local
+# PRO-DAG: PROTECTED 1 protected_local
+# PRO-DAG: PROTECTED 1 default_global
+# PRO-DAG: PROTECTED 1 default_weak
+# PRO-DAG: PROTECTED 1 internal_global
+# PRO-DAG: PROTECTED 1 internal_weak
+# PRO-DAG: PROTECTED 1 hidden_global
+# PRO-DAG: PROTECTED 1 hidden_weak
+# PRO-DAG: PROTECTED 1 protected_global
+# PRO-DAG: PROTECTED 1 protected_weak
+
+## Check that the visibility of all symbols is properly set to INTERNAL.
+# RUN: llvm-objcopy %t.o %t3.o --set-symbols-visibility=%t.symbols.regex=internal --regex
+# RUN: llvm-readelf -s %t3.o | FileCheck %s --check-prefix=INT
+
+# INT-DAG: INTERNAL 1 default_local
+# INT-DAG: INTERNAL 1 internal_local
+# INT-DAG: INTERNAL 1 hidden_local
+# INT-DAG: INTERNAL 1 protected_local
+# INT-DAG: INTERNAL 1 default_global
+# INT-DAG: INTERNAL 1 default_weak
+# INT-DAG: INTERNAL 1 internal_global
+# INT-DAG: INTERNAL 1 internal_weak
+# INT-DAG: INTERNAL 1 hidden_global
+# INT-DAG: INTERNAL 1 hidden_weak
+# INT-DAG: INTERNAL 1 protected_global
+# INT-DAG: INTERNAL 1 protected_weak
+
+## Check that setting the visibility of certain symbols that were read from
+## a file does not affect other symbols.
+# RUN: echo -e "default_local\ninternal_local" > %t.symbol.list
+# RUN: llvm-objcopy %t.o %t4.o --set-symbols-visibility=%t.symbol.list=hidden
+# RUN: llvm-readelf -s %t4.o | FileCheck %s --check-prefix=FILE
+
+# FILE-DAG: HIDDEN 1 default_local
+# FILE-DAG: HIDDEN 1 internal_local
+## Unaffected symbols:
+# FILE-DAG: HIDDEN 1 hidden_local
+# FILE-DAG: PROTECTED 1 protected_local
+# FILE-DAG: DEFAULT 1 default_global
+# FILE-DAG: DEFAULT 1 default_weak
+# FILE-DAG: INTERNAL 1 internal_global
+# FILE-DAG: INTERNAL 1 internal_weak
+# FILE-DAG: HIDDEN 1 hidden_global
+# FILE-DAG: HIDDEN 1 hidden_weak
+# FILE-DAG: PROTECTED 1 protected_global
+# FILE-DAG: PROTECTED 1 protected_weak
+
+## Check that the visibility of a single symbol is set correctly,
+## and that no other symbols are affected.
+# RUN: llvm-objcopy %t.o %t5.o --set-symbol-visibility=default_local=hidden \
+# RUN: --set-symbol-visibility=internal_local=protected \
+# RUN: --set-symbol-visibility=hidden_local=internal \
+# RUN: --set-symbol-visibility=protected_local=default
+# RUN: llvm-readelf -s %t5.o | FileCheck %s --check-prefix=SINGLE
+
+# SINGLE-DAG: HIDDEN 1 default_local
+# SINGLE-DAG: PROTECTED 1 internal_local
+# SINGLE-DAG: INTERNAL 1 hidden_local
+# SINGLE-DAG: DEFAULT 1 protected_local
+## Unaffected symbols:
+# SINGLE-DAG: DEFAULT 1 default_global
+# SINGLE-DAG: DEFAULT 1 default_weak
+# SINGLE-DAG: INTERNAL 1 internal_global
+# SINGLE-DAG: INTERNAL 1 internal_weak
+# SINGLE-DAG: HIDDEN 1 hidden_global
+# SINGLE-DAG: HIDDEN 1 hidden_weak
+# SINGLE-DAG: PROTECTED 1 protected_global
+# SINGLE-DAG: PROTECTED 1 protected_weak
+
+## Check that the visibility of symbols specified by a regex are set correctly,
+## and that no other symbols are affected.
+# RUN: llvm-objcopy %t.o %t6.o --set-symbol-visibility='.*'_local=hidden --regex
+# RUN: llvm-readelf -s %t6.o | FileCheck %s --check-prefix=REGEX
+
+# REGEX-DAG: HIDDEN 1 default_local
+# REGEX-DAG: HIDDEN 1 internal_local
+# REGEX-DAG: HIDDEN 1 hidden_local
+# REGEX-DAG: HIDDEN 1 protected_local
+## Unaffected symbols:
+# REGEX-DAG: DEFAULT 1 default_global
+# REGEX-DAG: DEFAULT 1 default_weak
+# REGEX-DAG: INTERNAL 1 internal_global
+# REGEX-DAG: INTERNAL 1 internal_weak
+# REGEX-DAG: HIDDEN 1 hidden_global
+# REGEX-DAG: HIDDEN 1 hidden_weak
+# REGEX-DAG: PROTECTED 1 protected_global
+# REGEX-DAG: PROTECTED 1 protected_weak
+
+## Check that the visibility of symbols specified by a wildcard are set correctly,
+## and that no other symbols are affected.
+# RUN: llvm-objcopy %t.o %t7.o --set-symbol-visibility='*_local'=hidden --wildcard
+# RUN: llvm-readelf -s %t7.o | FileCheck %s --check-prefix=WILDCARD
+
+# WILDCARD-DAG: HIDDEN 1 default_local
+# WILDCARD-DAG: HIDDEN 1 internal_local
+# WILDCARD-DAG: HIDDEN 1 hidden_local
+# WILDCARD-DAG: HIDDEN 1 protected_local
+## Unaffected symbols:
+# WILDCARD-DAG: DEFAULT 1 default_global
+# WILDCARD-DAG: DEFAULT 1 default_weak
+# WILDCARD-DAG: INTERNAL 1 internal_global
+# WILDCARD-DAG: INTERNAL 1 internal_weak
+# WILDCARD-DAG: HIDDEN 1 hidden_global
+# WILDCARD-DAG: HIDDEN 1 hidden_weak
+# WILDCARD-DAG: PROTECTED 1 protected_global
+# WILDCARD-DAG: PROTECTED 1 protected_weak
+
+## Check that the latest option that matches the same symbols as any of the previous
+## options overwrites the visibility of these symbols.
+# RUN: echo -e '*_weak\n*_local' > %t.symbols.pattern
+# RUN: llvm-objcopy %t.o %t8.o --set-symbol-visibility='default_*'=hidden \
+# RUN: --set-symbol-visibility='internal_*'=hidden \
+# RUN: --set-symbols-visibility=%t.symbols.pattern=protected \
+# RUN: --wildcard
+# RUN: llvm-readelf -s %t8.o | FileCheck %s --check-prefix=REWRITE
+
+# REWRITE-DAG: PROTECTED 1 default_local
+# REWRITE-DAG: HIDDEN 1 default_global
+# REWRITE-DAG: PROTECTED 1 default_weak
+# REWRITE-DAG: PROTECTED 1 internal_local
+# REWRITE-DAG: HIDDEN 1 internal_global
+# REWRITE-DAG: PROTECTED 1 internal_weak
+# REWRITE-DAG: PROTECTED 1 hidden_local
+# REWRITE-DAG: PROTECTED 1 hidden_weak
+# REWRITE-DAG: PROTECTED 1 protected_local
+# REWRITE-DAG: PROTECTED 1 protected_weak
+## Unaffected symbols:
+# REWRITE-DAG: HIDDEN 1 hidden_global
+# REWRITE-DAG: PROTECTED 1 protected_global
+
+## Check that a symbol name with a special charater is treated as a plain name
+## when pattern matching options are not enabled.
+# RUN: yaml2obj --docnum=2 %s -o %t9.o
+# RUN: llvm-objcopy %t9.o --set-symbol-visibility='f*o'=hidden
+# RUN: llvm-readelf -s %t9.o | FileCheck %s --check-prefix=SPECIAL
+
+# SPECIAL-DAG: HIDDEN 1 f*o
+## Unaffected symbol:
+# SPECIAL-DAG: DEFAULT 1 foo
+
+# RUN: yaml2obj --docnum=3 %s -o %t10.o
+
+## Check that the visibility of undefined symbols can be changed as well.
+# RUN: llvm-objcopy %t10.o --set-symbol-visibility=foo=hidden
+# RUN: llvm-readelf -s %t10.o | FileCheck %s --check-prefix=UNDEF
+# UNDEF: HIDDEN UND foo
+
+## Check that passing an invalid visibility type generates an error message.
+# RUN: echo 'foo' > %t.symbols
+# RUN: not llvm-objcopy %t10.o --set-symbols-visibility=%t.symbols=invalid-type 2>&1 | \
+# RUN: FileCheck %s --check-prefix=TYPE
+# RUN: not llvm-objcopy %t10.o --set-symbol-visibility=foo=invalid-type 2>&1 | \
+# RUN: FileCheck %s --check-prefix=TYPE
+# TYPE: error: 'invalid-type' is not a valid symbol visibility
+
+## Check that omitting the '=' character generates an error.
+# RUN: not llvm-objcopy %t10.o --set-symbols-visibility=%t.symbols,hidden 2>&1 | \
+# RUN: FileCheck %s --check-prefix=FORMAT -DOPTION=--set-symbols-visibility
+# RUN: not llvm-objcopy %t10.o --set-symbol-visibility=foo default 2>&1 | \
+# RUN: FileCheck %s --check-prefix=FORMAT -DOPTION=--set-symbol-visibility
+# FORMAT: error: bad format for [[OPTION]]
+
+## Check that using an invalid symbol pattern generates an error.
+# RUN: echo '*.' > %t.symbols.regex
+# RUN: not llvm-objcopy %t10.o --set-symbols-visibility=%t.symbols.regex=hidden --regex 2>&1 | \
+# RUN: FileCheck %s --check-prefix=SYMBOL
+# RUN: not llvm-objcopy %t10.o --set-symbol-visibility='*.'=default --regex 2>&1 | \
+# RUN: FileCheck %s --check-prefix=SYMBOL
+# SYMBOL: error: cannot compile regular expression '*.': repetition-operator operand invalid
+
+## Check passing an invalid filename generates an error.
+# RUN: not llvm-objcopy %t10.o --set-symbols-visibility=no_file=hidden 2>&1 | \
+# RUN: FileCheck %s --check-prefix=NO_FILE -DMSG=%errc_ENOENT
+# NO_FILE: error: 'no_file': [[MSG]]
+
+---
+!ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_REL
+ Machine: EM_X86_64
+Sections:
+ - Name: .text
+ Type: SHT_PROGBITS
+Symbols:
+ - Name: default_local
+ Section: .text
+ Binding: STB_LOCAL
+ - Name: protected_local
+ Section: .text
+ Binding: STB_LOCAL
+ Other: [ STV_PROTECTED ]
+ - Name: internal_local
+ Section: .text
+ Binding: STB_LOCAL
+ Other: [ STV_INTERNAL ]
+ - Name: hidden_local
+ Section: .text
+ Binding: STB_LOCAL
+ Other: [ STV_HIDDEN ]
+ - Name: default_weak
+ Section: .text
+ Binding: STB_WEAK
+ - Name: internal_weak
+ Section: .text
+ Binding: STB_WEAK
+ Other: [ STV_INTERNAL ]
+ - Name: hidden_weak
+ Section: .text
+ Binding: STB_WEAK
+ Other: [ STV_HIDDEN ]
+ - Name: protected_weak
+ Section: .text
+ Binding: STB_WEAK
+ Other: [ STV_PROTECTED ]
+ - Name: default_global
+ Section: .text
+ Binding: STB_GLOBAL
+ - Name: internal_global
+ Section: .text
+ Binding: STB_GLOBAL
+ Other: [ STV_INTERNAL ]
+ - Name: hidden_global
+ Section: .text
+ Binding: STB_GLOBAL
+ Other: [ STV_HIDDEN ]
+ - Name: protected_global
+ Section: .text
+ Binding: STB_GLOBAL
+ Other: [ STV_PROTECTED ]
+ - Name: ignored_name
+ Section: .text
+ Binding: STB_GLOBAL
+ Other: [ STV_INTERNAL ]
+...
+
+---
+!ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_REL
+ Machine: EM_X86_64
+Sections:
+ - Name: .text
+ Type: SHT_PROGBITS
+Symbols:
+ - Name: f*o
+ Section: .text
+ Binding: STB_LOCAL
+ - Name: foo
+ Section: .text
+ Binding: STB_LOCAL
+...
+
+---
+!ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_REL
+ Machine: EM_X86_64
+Symbols:
+ - Name: foo
+ Binding: STB_LOCAL
+...
diff --git a/llvm/test/tools/llvm-objdump/X86/elf-pgoanalysismap.yaml b/llvm/test/tools/llvm-objdump/X86/elf-pgoanalysismap.yaml
index 732fab3e2a37..4d1e5408d86d 100644
--- a/llvm/test/tools/llvm-objdump/X86/elf-pgoanalysismap.yaml
+++ b/llvm/test/tools/llvm-objdump/X86/elf-pgoanalysismap.yaml
@@ -47,7 +47,9 @@ Symbols:
# RUN: yaml2obj %s --docnum=2 -o %t2
# RUN: llvm-objdump %t2 -d --symbolize-operands --no-show-raw-insn --no-leading-addr | \
-# RUN: FileCheck %s --check-prefix=ENTRYCOUNT-BLOCKFREQ
+# RUN: FileCheck --match-full-lines --strict-whitespace %s --check-prefix=ENTRYCOUNT-BLOCKFREQ
+# RUN: llvm-objdump %t2 -d --symbolize-operands --pretty-pgo-analysis-map --no-show-raw-insn --no-leading-addr | \
+# RUN: FileCheck --match-full-lines --strict-whitespace %s --check-prefix=ENTRYCOUNT-BLOCKFREQ-PRETTY
--- !ELF
FileHeader:
@@ -98,18 +100,28 @@ Symbols:
Section: .text.foo
Value: 0x0
-# ENTRYCOUNT-BLOCKFREQ: <foo>:
-# ENTRYCOUNT-BLOCKFREQ: <BB3> (Entry count: 1000, Frequency: 1000):
-# ENTRYCOUNT-BLOCKFREQ: <BB1> (Frequency: 133):
-# ENTRYCOUNT-BLOCKFREQ: <BB2> (Frequency: 18):
-# ENTRYCOUNT-BLOCKFREQ: <BB5> (Frequency: 1000):
+# ENTRYCOUNT-BLOCKFREQ:<foo>:
+# ENTRYCOUNT-BLOCKFREQ:<BB3> (Entry count: 1000, Frequency: 1000):
+# ENTRYCOUNT-BLOCKFREQ:<BB1> (Frequency: 133):
+# ENTRYCOUNT-BLOCKFREQ:<BB2> (Frequency: 18):
+# ENTRYCOUNT-BLOCKFREQ:<BB5> (Frequency: 1000):
+
+# ENTRYCOUNT-BLOCKFREQ-PRETTY:<foo>:
+# ENTRYCOUNT-BLOCKFREQ-PRETTY:<BB3> (Entry count: 1000, Frequency: 1.0):
+# ENTRYCOUNT-BLOCKFREQ-PRETTY:<BB1> (Frequency: 0.133):
+# ENTRYCOUNT-BLOCKFREQ-PRETTY:<BB2> (Frequency: 0.018):
+# ENTRYCOUNT-BLOCKFREQ-PRETTY:<BB5> (Frequency: 1.0):
## Check the case where we have entry points, block frequency, and branch
## proabability information.
# RUN: yaml2obj %s --docnum=3 -o %t3
# RUN: llvm-objdump %t3 -d --symbolize-operands --no-show-raw-insn --no-leading-addr | \
-# RUN: FileCheck %s --check-prefix=ENTRY-FREQ-PROB
+# RUN: FileCheck --match-full-lines --strict-whitespace %s --check-prefix=ENTRY-FREQ-PROB
+# RUN: llvm-objdump %t3 -d --symbolize-operands --pretty-pgo-analysis-map --no-show-raw-insn --no-leading-addr | \
+# RUN: FileCheck --match-full-lines --strict-whitespace %s --check-prefix=ENTRY-FREQ-PROB-PRETTY
+# RUN: llvm-objdump %t3 -d --pretty-pgo-analysis-map --no-show-raw-insn --no-leading-addr 2>&1 | \
+# RUN: FileCheck %s --check-prefix=MISSING-SYMBOLIZE-OPERANDS
--- !ELF
FileHeader:
@@ -154,21 +166,21 @@ Sections:
- BBFreq: 1000
Successors:
- ID: 1
- BrProb: 0x22222222
+ BrProb: 0x10000000
- ID: 2
- BrProb: 0x33333333
+ BrProb: 0x15000000
- ID: 3
- BrProb: 0xaaaaaaaa
+ BrProb: 0x50000000
- BBFreq: 133
Successors:
- ID: 2
- BrProb: 0x11111111
+ BrProb: 0x10000000
- ID: 3
- BrProb: 0xeeeeeeee
+ BrProb: 0x70000000
- BBFreq: 18
Successors:
- ID: 3
- BrProb: 0xffffffff
+ BrProb: 0x80000000
- BBFreq: 1000
Successors: []
Symbols:
@@ -176,8 +188,16 @@ Symbols:
Section: .text.foo
Value: 0x0
-# ENTRY-FREQ-PROB: <foo>:
-# ENTRY-FREQ-PROB: <BB3> (Entry count: 1000, Frequency: 1000, Successors: BB1:22222222, BB2:33333333, BB3:aaaaaaaa):
-# ENTRY-FREQ-PROB: <BB1> (Frequency: 133, Successors: BB2:11111111, BB3:eeeeeeee):
-# ENTRY-FREQ-PROB: <BB2> (Frequency: 18, Successors: BB3:ffffffff):
-# ENTRY-FREQ-PROB: <BB5> (Frequency: 1000):
+# ENTRY-FREQ-PROB:<foo>:
+# ENTRY-FREQ-PROB:<BB3> (Entry count: 1000, Frequency: 1000, Successors: BB1:10000000, BB2:15000000, BB3:50000000):
+# ENTRY-FREQ-PROB:<BB1> (Frequency: 133, Successors: BB2:10000000, BB3:70000000):
+# ENTRY-FREQ-PROB:<BB2> (Frequency: 18, Successors: BB3:80000000):
+# ENTRY-FREQ-PROB:<BB5> (Frequency: 1000):
+
+# ENTRY-FREQ-PROB-PRETTY:<foo>:
+# ENTRY-FREQ-PROB-PRETTY:<BB3> (Entry count: 1000, Frequency: 1.0, Successors: BB1:[0x10000000 / 0x80000000 = 12.50%], BB2:[0x15000000 / 0x80000000 = 16.41%], BB3:[0x50000000 / 0x80000000 = 62.50%]):
+# ENTRY-FREQ-PROB-PRETTY:<BB1> (Frequency: 0.133, Successors: BB2:[0x10000000 / 0x80000000 = 12.50%], BB3:[0x70000000 / 0x80000000 = 87.50%]):
+# ENTRY-FREQ-PROB-PRETTY:<BB2> (Frequency: 0.018, Successors: BB3:[0x80000000 / 0x80000000 = 100.00%]):
+# ENTRY-FREQ-PROB-PRETTY:<BB5> (Frequency: 1.0):
+
+# MISSING-SYMBOLIZE-OPERANDS: warning: --symbolize-operands must be enabled for --pretty-pgo-analysis-map to have an effect
diff --git a/llvm/test/tools/llvm-profdata/Inputs/c-general.profraw b/llvm/test/tools/llvm-profdata/Inputs/c-general.profraw
index 9cd225587c92..a3e884343942 100644
--- a/llvm/test/tools/llvm-profdata/Inputs/c-general.profraw
+++ b/llvm/test/tools/llvm-profdata/Inputs/c-general.profraw
Binary files differ
diff --git a/llvm/test/tools/llvm-profdata/Inputs/compressed.profraw b/llvm/test/tools/llvm-profdata/Inputs/compressed.profraw
index 9966729d92dd..e3f77e870d4d 100644
--- a/llvm/test/tools/llvm-profdata/Inputs/compressed.profraw
+++ b/llvm/test/tools/llvm-profdata/Inputs/compressed.profraw
Binary files differ
diff --git a/llvm/test/tools/llvm-profdata/Inputs/thinlto_indirect_call_promotion.profraw b/llvm/test/tools/llvm-profdata/Inputs/thinlto_indirect_call_promotion.profraw
new file mode 100644
index 000000000000..84707ba2070a
--- /dev/null
+++ b/llvm/test/tools/llvm-profdata/Inputs/thinlto_indirect_call_promotion.profraw
Binary files differ
diff --git a/llvm/test/tools/llvm-profdata/binary-ids-padding.test b/llvm/test/tools/llvm-profdata/binary-ids-padding.test
index eda63203a304..61881b69cfd5 100644
--- a/llvm/test/tools/llvm-profdata/binary-ids-padding.test
+++ b/llvm/test/tools/llvm-profdata/binary-ids-padding.test
@@ -10,10 +10,12 @@
// INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin)
// INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin)
// INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin)
+// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize)
+// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables)
// INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last)
RUN: printf '\201rforpl\377' > %t.profraw
-RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw
// There will be 2 20-byte binary IDs, so the total Binary IDs size will be 64 bytes.
// 2 * 8 binary ID sizes
// + 2 * 20 binary IDs (of size 20)
@@ -32,6 +34,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
// Binary IDs - There are only two in this case that are 20 bytes.
RUN: printf '\24\0\0\0\0\0\0\0' >> %t.profraw
diff --git a/llvm/test/tools/llvm-profdata/large-binary-id-size.test b/llvm/test/tools/llvm-profdata/large-binary-id-size.test
index 38b838e0d100..316a9a4c9df4 100644
--- a/llvm/test/tools/llvm-profdata/large-binary-id-size.test
+++ b/llvm/test/tools/llvm-profdata/large-binary-id-size.test
@@ -1,5 +1,5 @@
RUN: printf '\201rforpl\377' > %t.profraw
-RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw
RUN: printf '\40\0\0\0\0\0\0\0' >> %t.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
@@ -12,6 +12,8 @@ RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
// Check for a corrupted size being too large past the end of the file.
RUN: printf '\7\7\7\7\7\7\7\7' >> %t.profraw
diff --git a/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test b/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test
index c967e850dbe3..8b686d5c50cb 100644
--- a/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test
+++ b/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test
@@ -10,10 +10,12 @@
// INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin)
// INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin)
// INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin)
+// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize)
+// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables)
// INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last)
RUN: printf '\201rforpl\377' > %t.profraw
-RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
@@ -26,6 +28,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
// Data Section
//
diff --git a/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test b/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test
index 2e747f81a6bf..089afad42062 100644
--- a/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test
+++ b/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test
@@ -10,10 +10,12 @@
// INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin)
// INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin)
// INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin)
+// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize)
+// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables)
// INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last)
RUN: printf '\201rforpl\377' > %t.profraw
-RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
@@ -26,6 +28,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
// Data Section
//
diff --git a/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test b/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test
index 3c23bc7dd0f7..e404ba4210cc 100644
--- a/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test
+++ b/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test
@@ -10,10 +10,12 @@
// INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin)
// INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin)
// INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin)
+// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize)
+// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables)
// INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last)
RUN: printf '\201rforpl\377' > %t.profraw
-RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
@@ -26,6 +28,8 @@ RUN: printf '\0\0\6\0\1\0\0\0' >> %t.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
RUN: printf '\0\0\6\0\2\0\0\0' >> %t.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
// Data Section
//
diff --git a/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test b/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test
index 4a5c42843ff4..ee54bfb97856 100644
--- a/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test
+++ b/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test
@@ -1,5 +1,5 @@
RUN: printf '\201rforpl\377' > %t.profraw
-RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw
// We should fail on this because the binary IDs is not a multiple of 8 bytes.
RUN: printf '\77\0\0\0\0\0\0\0' >> %t.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
@@ -10,6 +10,8 @@ RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw
// Binary IDs - There are only two in this case that are 20 bytes.
RUN: printf '\24\0\0\0\0\0\0\0' >> %t.profraw
diff --git a/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test b/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test
index 2a92575ee340..dfa163f1f343 100644
--- a/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test
+++ b/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test
@@ -15,6 +15,8 @@ RUN: printf '\0\0\0\0\0\0\0\20' >> %t
RUN: printf '\0\0\0\1\0\4\0\0' >> %t
RUN: printf '\0\0\0\2\0\4\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: not llvm-profdata show %t -o /dev/null 2>&1 | FileCheck %s
diff --git a/llvm/test/tools/llvm-profdata/raw-32-bits-be.test b/llvm/test/tools/llvm-profdata/raw-32-bits-be.test
index 8220361df6cf..63782c8b94d4 100644
--- a/llvm/test/tools/llvm-profdata/raw-32-bits-be.test
+++ b/llvm/test/tools/llvm-profdata/raw-32-bits-be.test
@@ -1,5 +1,6 @@
+// Header
RUN: printf '\377lprofR\201' > %t
-RUN: printf '\0\0\0\0\0\0\0\11' >> %t
+RUN: printf '\0\0\0\0\0\0\0\12' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\2' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
@@ -12,6 +13,8 @@ RUN: printf '\0\0\0\0\1\0\0\0' >> %t
RUN: printf '\0\0\0\0\3\0\0\0' >> %t
RUN: printf '\0\0\0\0\2\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\134\370\302\114\333\030\275\254' >> %t
RUN: printf '\0\0\0\0\0\0\0\1' >> %t
@@ -20,9 +23,8 @@ RUN: printf '\3\0\0\0' >> %t
RUN: printf '\0\0\0\0' >> %t
RUN: printf '\0\0\0\0' >> %t
RUN: printf '\0\0\0\1' >> %t
-RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\0\0\0\3' >> %t
-RUN: printf '\0\0\0\0' >> %t
RUN: printf '\344\023\165\112\031\035\265\067' >> %t
RUN: printf '\0\0\0\0\0\0\0\2' >> %t
@@ -31,9 +33,8 @@ RUN: printf '\2\xff\xff\xd3' >> %t
RUN: printf '\0\0\0\0' >> %t
RUN: printf '\0\0\0\0' >> %t
RUN: printf '\0\0\0\2' >> %t
-RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\0\0\0\1' >> %t
-RUN: printf '\0\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\023' >> %t
RUN: printf '\0\0\0\0\0\0\0\067' >> %t
diff --git a/llvm/test/tools/llvm-profdata/raw-32-bits-le.test b/llvm/test/tools/llvm-profdata/raw-32-bits-le.test
index 9352ae132380..e9569bec1178 100644
--- a/llvm/test/tools/llvm-profdata/raw-32-bits-le.test
+++ b/llvm/test/tools/llvm-profdata/raw-32-bits-le.test
@@ -1,5 +1,5 @@
RUN: printf '\201Rforpl\377' > %t
-RUN: printf '\11\0\0\0\0\0\0\0' >> %t
+RUN: printf '\12\0\0\0\0\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\2\0\0\0\0\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
@@ -12,6 +12,8 @@ RUN: printf '\0\0\0\1\0\0\0\0' >> %t
RUN: printf '\0\0\0\3\0\0\0\0' >> %t
RUN: printf '\0\0\0\2\0\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\254\275\030\333\114\302\370\134' >> %t
RUN: printf '\1\0\0\0\0\0\0\0' >> %t
@@ -20,9 +22,8 @@ RUN: printf '\0\0\0\3' >> %t
RUN: printf '\0\0\0\0' >> %t
RUN: printf '\0\0\0\0' >> %t
RUN: printf '\1\0\0\0' >> %t
-RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\3\0\0\0' >> %t
-RUN: printf '\0\0\0\0' >> %t
RUN: printf '\067\265\035\031\112\165\023\344' >> %t
RUN: printf '\02\0\0\0\0\0\0\0' >> %t
@@ -31,9 +32,8 @@ RUN: printf '\xd3\xff\xff\2' >> %t
RUN: printf '\0\0\0\0' >> %t
RUN: printf '\0\0\0\0' >> %t
RUN: printf '\2\0\0\0' >> %t
-RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\1\0\0\0' >> %t
-RUN: printf '\0\0\0\0' >> %t
RUN: printf '\023\0\0\0\0\0\0\0' >> %t
RUN: printf '\067\0\0\0\0\0\0\0' >> %t
diff --git a/llvm/test/tools/llvm-profdata/raw-64-bits-be.test b/llvm/test/tools/llvm-profdata/raw-64-bits-be.test
index c3e995add6ff..0bc579eec58a 100644
--- a/llvm/test/tools/llvm-profdata/raw-64-bits-be.test
+++ b/llvm/test/tools/llvm-profdata/raw-64-bits-be.test
@@ -1,5 +1,5 @@
RUN: printf '\377lprofr\201' > %t
-RUN: printf '\0\0\0\0\0\0\0\11' >> %t
+RUN: printf '\0\0\0\0\0\0\0\12' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\2' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
@@ -12,6 +12,8 @@ RUN: printf '\0\0\0\1\0\4\0\0' >> %t
RUN: printf '\0\0\0\3\0\4\0\0' >> %t
RUN: printf '\0\0\0\2\0\4\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\134\370\302\114\333\030\275\254' >> %t
RUN: printf '\0\0\0\0\0\0\0\1' >> %t
@@ -20,9 +22,8 @@ RUN: printf '\0\0\0\3\0\4\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\0\0\0\1' >> %t
-RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\0\0\0\3' >> %t
-RUN: printf '\0\0\0\0' >> %t
RUN: printf '\344\023\165\112\031\035\265\067' >> %t
RUN: printf '\0\0\0\0\0\0\0\02' >> %t
@@ -31,9 +32,8 @@ RUN: printf '\0\0\0\3\0\3\xff\xc3' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\0\0\0\02' >> %t
-RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\0\0\0\1' >> %t
-RUN: printf '\0\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\023' >> %t
RUN: printf '\0\0\0\0\0\0\0\067' >> %t
diff --git a/llvm/test/tools/llvm-profdata/raw-64-bits-le.test b/llvm/test/tools/llvm-profdata/raw-64-bits-le.test
index 0b3ef2a89abe..ca9ea54c3f01 100644
--- a/llvm/test/tools/llvm-profdata/raw-64-bits-le.test
+++ b/llvm/test/tools/llvm-profdata/raw-64-bits-le.test
@@ -1,5 +1,5 @@
RUN: printf '\201rforpl\377' > %t
-RUN: printf '\11\0\0\0\0\0\0\0' >> %t
+RUN: printf '\12\0\0\0\0\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\2\0\0\0\0\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
@@ -12,6 +12,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t
RUN: printf '\0\0\4\0\3\0\0\0' >> %t
RUN: printf '\0\0\4\0\2\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\254\275\030\333\114\302\370\134' >> %t
RUN: printf '\1\0\0\0\0\0\0\0' >> %t
@@ -20,9 +22,8 @@ RUN: printf '\0\0\4\0\3\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\1\0\0\0' >> %t
-RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\3\0\0\0' >> %t
-RUN: printf '\0\0\0\0' >> %t
RUN: printf '\067\265\035\031\112\165\023\344' >> %t
RUN: printf '\02\0\0\0\0\0\0\0' >> %t
@@ -31,9 +32,8 @@ RUN: printf '\xc3\xff\3\0\3\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\02\0\0\0' >> %t
-RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\1\0\0\0' >> %t
-RUN: printf '\0\0\0\0' >> %t
RUN: printf '\023\0\0\0\0\0\0\0' >> %t
RUN: printf '\067\0\0\0\0\0\0\0' >> %t
diff --git a/llvm/test/tools/llvm-profdata/raw-two-profiles.test b/llvm/test/tools/llvm-profdata/raw-two-profiles.test
index f4a9aa8e1bbc..70a4210dea9f 100644
--- a/llvm/test/tools/llvm-profdata/raw-two-profiles.test
+++ b/llvm/test/tools/llvm-profdata/raw-two-profiles.test
@@ -1,5 +1,5 @@
RUN: printf '\201rforpl\377' > %t-foo.profraw
-RUN: printf '\11\0\0\0\0\0\0\0' >> %t-foo.profraw
+RUN: printf '\12\0\0\0\0\0\0\0' >> %t-foo.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
RUN: printf '\1\0\0\0\0\0\0\0' >> %t-foo.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
@@ -12,6 +12,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t-foo.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
RUN: printf '\0\0\4\0\2\0\0\0' >> %t-foo.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw
RUN: printf '\254\275\030\333\114\302\370\134' >> %t-foo.profraw
RUN: printf '\1\0\0\0\0\0\0\0' >> %t-foo.profraw
@@ -26,7 +28,7 @@ RUN: printf '\023\0\0\0\0\0\0\0' >> %t-foo.profraw
RUN: printf '\3\0foo\0\0\0' >> %t-foo.profraw
RUN: printf '\201rforpl\377' > %t-bar.profraw
-RUN: printf '\11\0\0\0\0\0\0\0' >> %t-bar.profraw
+RUN: printf '\12\0\0\0\0\0\0\0' >> %t-bar.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
RUN: printf '\1\0\0\0\0\0\0\0' >> %t-bar.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
@@ -39,6 +41,8 @@ RUN: printf '\0\0\6\0\1\0\0\0' >> %t-bar.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
RUN: printf '\0\0\6\0\2\0\0\0' >> %t-bar.profraw
RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
+RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw
RUN: printf '\067\265\035\031\112\165\023\344' >> %t-bar.profraw
RUN: printf '\02\0\0\0\0\0\0\0' >> %t-bar.profraw
diff --git a/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test
index e5a9400c670c..5faafd4d83b2 100644
--- a/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test
+++ b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test
@@ -3,17 +3,19 @@
## Check 64-bit:
# RUN: yaml2obj %s -DBITS=64 -DADDR=0x999999999 -o %t1.x64.o
-# RUN: llvm-readobj %t1.x64.o --bb-addr-map 2>&1 | FileCheck %s -DADDR=0x999999999 -DFILE=%t1.x64.o --check-prefix=CHECK
+# RUN: llvm-readobj %t1.x64.o --bb-addr-map 2>&1 | FileCheck --match-full-lines %s -DADDR=0x999999999 -DFILE=%t1.x64.o --check-prefixes=CHECK,RAW
+# RUN: llvm-readobj %t1.x64.o --bb-addr-map --pretty-pgo-analysis-map 2>&1 | FileCheck --match-full-lines %s -DADDR=0x999999999 -DFILE=%t1.x64.o --check-prefixes=CHECK,PRETTY
# RUN: llvm-readelf %t1.x64.o --bb-addr-map | FileCheck %s --check-prefix=GNU
+# RUN: llvm-readobj %t1.x64.o --pretty-pgo-analysis-map 2>&1 | FileCheck %s --check-prefix=PRETTY-NO-BAM
## Check 32-bit:
# RUN: yaml2obj %s -DBITS=32 -o %t1.x32.o
-# RUN: llvm-readobj %t1.x32.o --bb-addr-map 2>&1 | FileCheck -DADDR=0x11111 %s -DFILE=%t1.x32.o --check-prefix=CHECK
+# RUN: llvm-readobj %t1.x32.o --bb-addr-map 2>&1 | FileCheck --match-full-lines -DADDR=0x11111 %s -DFILE=%t1.x32.o --check-prefixes=CHECK,RAW
# RUN: llvm-readelf %t1.x32.o --bb-addr-map | FileCheck %s --check-prefix=GNU
## Check that a malformed section can be handled.
# RUN: yaml2obj %s -DBITS=32 -DSIZE=24 -o %t2.o
-# RUN: llvm-readobj %t2.o --bb-addr-map 2>&1 | FileCheck %s -DOFFSET=0x00000018 -DFILE=%t2.o --check-prefix=TRUNCATED
+# RUN: llvm-readobj %t2.o --bb-addr-map 2>&1 | FileCheck --match-full-lines %s -DOFFSET=0x00000018 -DFILE=%t2.o --check-prefix=TRUNCATED
## Check that missing features can be handled.
# RUN: yaml2obj %s -DBITS=32 -DFEATURE=0x2 -o %t3.o
@@ -22,7 +24,7 @@
# CHECK: BBAddrMap [
# CHECK-NEXT: Function {
# CHECK-NEXT: At: [[ADDR]]
-# CHECK-NEXT: warning: '[[FILE]]': could not identify function symbol for address ([[ADDR]]) in SHT_LLVM_BB_ADDR_MAP section with index 3
+# CHECK-NEXT: {{.*}}: warning: '[[FILE]]': could not identify function symbol for address ([[ADDR]]) in SHT_LLVM_BB_ADDR_MAP section with index 3
# CHECK-NEXT: Name: <?>
# CHECK-NEXT: BB Ranges [
# CHECK-NEXT: {
@@ -55,16 +57,19 @@
# CHECK-NEXT: FuncEntryCount: 100
# CHECK-NEXT: PGO BB entries [
# CHECK-NEXT: {
-# CHECK-NEXT: Frequency: 100
+# RAW-NEXT: Frequency: 100
+# PRETTY-NEXT: Frequency: 1.0
# CHECK-NEXT: Successors [
# CHECK-NEXT: {
# CHECK-NEXT: ID: 2
-# CHECK-NEXT: Probability: 0xFFFFFFFF
+# RAW-NEXT: Probability: 0x80000000
+# PRETTY-NEXT: Probability: 0x80000000 / 0x80000000 = 100.00%
# CHECK-NEXT: }
# CHECK-NEXT: ]
# CHECK-NEXT: }
# CHECK-NEXT: {
-# CHECK-NEXT: Frequency: 100
+# RAW-NEXT: Frequency: 100
+# PRETTY-NEXT: Frequency: 1.0
# CHECK-NEXT: Successors [
# CHECK-NEXT: ]
# CHECK-NEXT: }
@@ -95,7 +100,8 @@
# CHECK-NEXT: FuncEntryCount: 8888
# CHECK-NEXT: PGO BB entries [
# CHECK-NEXT: {
-# CHECK-NEXT: Frequency: 9000
+# RAW-NEXT: Frequency: 9000
+# PRETTY-NEXT: Frequency: 1.0
# CHECK-NEXT: }
# CHECK-NEXT: ]
# CHECK-NEXT: }
@@ -104,8 +110,10 @@
# GNU: GNUStyle::printBBAddrMaps not implemented
+# PRETTY-NO-BAM: warning: --bb-addr-map must be enabled for --pretty-pgo-analysis-map to have an effect
+
# TRUNCATED: BBAddrMap [
-# TRUNCATED-NEXT: warning: '[[FILE]]': unable to dump SHT_LLVM_BB_ADDR_MAP section with index 3: unable to decode LEB128 at offset [[OFFSET]]: malformed uleb128, extends past end
+# TRUNCATED-NEXT: {{.*}}: warning: '[[FILE]]': unable to dump SHT_LLVM_BB_ADDR_MAP section with index 3: unable to decode LEB128 at offset [[OFFSET]]: malformed uleb128, extends past end
# TRUNCATED-NEXT: ]
## Check that the other valid section is properly dumped.
# TRUNCATED-NEXT: BBAddrMap [
@@ -192,7 +200,7 @@ Sections:
- BBFreq: 100
Successors:
- ID: 2
- BrProb: 0xFFFFFFFF
+ BrProb: 0x80000000
- BBFreq: 100
Successors: []
- FuncEntryCount: 8888
diff --git a/llvm/tools/llvm-ar/llvm-ar.cpp b/llvm/tools/llvm-ar/llvm-ar.cpp
index c8800303bc1e..81cb2a21daf1 100644
--- a/llvm/tools/llvm-ar/llvm-ar.cpp
+++ b/llvm/tools/llvm-ar/llvm-ar.cpp
@@ -670,7 +670,7 @@ Expected<std::unique_ptr<Binary>> getAsBinary(const Archive::Child &C,
}
template <class A> static bool isValidInBitMode(const A &Member) {
- if (object::Archive::getDefaultKindForHost() != object::Archive::K_AIXBIG)
+ if (object::Archive::getDefaultKind() != object::Archive::K_AIXBIG)
return true;
LLVMContext Context;
Expected<std::unique_ptr<Binary>> BinOrErr = getAsBinary(Member, &Context);
@@ -1036,10 +1036,10 @@ static void performWriteOperation(ArchiveOperation Operation,
}
} else if (NewMembersP)
Kind = !NewMembersP->empty() ? NewMembersP->front().detectKindFromObject()
- : object::Archive::getDefaultKindForHost();
+ : object::Archive::getDefaultKind();
else
Kind = !NewMembers.empty() ? NewMembers.front().detectKindFromObject()
- : object::Archive::getDefaultKindForHost();
+ : object::Archive::getDefaultKind();
break;
case GNU:
Kind = object::Archive::K_GNU;
@@ -1331,7 +1331,7 @@ static int ar_main(int argc, char **argv) {
// Get BitMode from enviorment variable "OBJECT_MODE" for AIX OS, if
// specified.
- if (object::Archive::getDefaultKindForHost() == object::Archive::K_AIXBIG) {
+ if (object::Archive::getDefaultKind() == object::Archive::K_AIXBIG) {
BitMode = getBitMode(getenv("OBJECT_MODE"));
if (BitMode == BitModeTy::Unknown)
BitMode = BitModeTy::Bit32;
@@ -1392,8 +1392,7 @@ static int ar_main(int argc, char **argv) {
continue;
if (strncmp(*ArgIt, "-X", 2) == 0) {
- if (object::Archive::getDefaultKindForHost() ==
- object::Archive::K_AIXBIG) {
+ if (object::Archive::getDefaultKind() == object::Archive::K_AIXBIG) {
Match = *(*ArgIt + 2) != '\0' ? *ArgIt + 2 : *(++ArgIt);
BitMode = getBitMode(Match);
if (BitMode == BitModeTy::Unknown)
@@ -1432,8 +1431,7 @@ static int ranlib_main(int argc, char **argv) {
cl::PrintVersionMessage();
return 0;
} else if (arg.front() == 'X') {
- if (object::Archive::getDefaultKindForHost() ==
- object::Archive::K_AIXBIG) {
+ if (object::Archive::getDefaultKind() == object::Archive::K_AIXBIG) {
HasAIXXOption = true;
arg.consume_front("X");
const char *Xarg = arg.data();
@@ -1464,7 +1462,7 @@ static int ranlib_main(int argc, char **argv) {
}
}
- if (object::Archive::getDefaultKindForHost() == object::Archive::K_AIXBIG) {
+ if (object::Archive::getDefaultKind() == object::Archive::K_AIXBIG) {
// If not specify -X option, get BitMode from enviorment variable
// "OBJECT_MODE" for AIX OS if specify.
if (!HasAIXXOption) {
diff --git a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
index 2b438a8b1346..2bfc9705368e 100644
--- a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
+++ b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
@@ -286,6 +286,8 @@ static opt<bool> Verify("verify", desc("Verify the DWARF debug info."),
cat(DwarfDumpCategory));
static opt<ErrorDetailLevel> ErrorDetails(
"error-display", init(Unspecified),
+ desc("Set the level of detail and summary to display when verifying "
+ "(implies --verify)"),
values(clEnumValN(NoDetailsOrSummary, "quiet",
"Only display whether errors occurred."),
clEnumValN(NoDetailsOnlySummary, "summary",
@@ -295,6 +297,11 @@ static opt<ErrorDetailLevel> ErrorDetails(
clEnumValN(BothDetailsAndSummary, "full",
"Display each error as well as a summary. [default]")),
cat(DwarfDumpCategory));
+static opt<std::string> JsonErrSummaryFile(
+ "verify-json", init(""),
+ desc("Output JSON-formatted error summary to the specified file. "
+ "(Implies --verify)"),
+ value_desc("filename.json"), cat(DwarfDumpCategory));
static opt<bool> Quiet("quiet", desc("Use with -verify to not emit to STDOUT."),
cat(DwarfDumpCategory));
static opt<bool> DumpUUID("uuid", desc("Show the UUID for each architecture."),
@@ -349,6 +356,7 @@ static DIDumpOptions getDumpOpts(DWARFContext &C) {
ErrorDetails != NoDetailsOrSummary;
DumpOpts.ShowAggregateErrors = ErrorDetails != OnlyDetailsNoSummary &&
ErrorDetails != NoDetailsOnlySummary;
+ DumpOpts.JsonErrSummaryFile = JsonErrSummaryFile;
return DumpOpts.noImplicitRecursion();
}
return DumpOpts;
@@ -834,8 +842,10 @@ int main(int argc, char **argv) {
"-verbose is currently not supported";
return 1;
}
- if (!Verify && ErrorDetails != Unspecified)
- WithColor::warning() << "-error-detail has no affect without -verify";
+ // -error-detail and -json-summary-file both imply -verify
+ if (ErrorDetails != Unspecified || !JsonErrSummaryFile.empty()) {
+ Verify = true;
+ }
std::error_code EC;
ToolOutputFile OutputFile(OutputFilename, EC, sys::fs::OF_TextWithCRLF);
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
index 0aecaaeea4b2..4ae6bc2a54cd 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
@@ -74,6 +74,8 @@ struct BenchmarkKey {
// The address that the snippet should be loaded in at if the execution mode
// being used supports it.
intptr_t SnippetAddress = 0;
+ // The register that should be used to hold the loop counter.
+ unsigned LoopRegister;
};
struct BenchmarkMeasure {
diff --git a/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp b/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp
index 7258fcb4279c..431d99c72b80 100644
--- a/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp
+++ b/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp
@@ -9,6 +9,7 @@
#include "SnippetFile.h"
#include "BenchmarkRunner.h"
#include "Error.h"
+#include "Target.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInstPrinter.h"
#include "llvm/MC/MCObjectFileInfo.h"
@@ -175,6 +176,20 @@ public:
return;
}
+ if (CommentText.consume_front("LOOP-REGISTER")) {
+ // LLVM-EXEGESIS-LOOP-REGISTER <loop register>
+ unsigned LoopRegister;
+
+ if (!(LoopRegister = findRegisterByName(CommentText.trim()))) {
+ errs() << "unknown register '" << CommentText
+ << "' in 'LLVM-EXEGESIS-LOOP-REGISTER " << CommentText << "'\n";
+ ++InvalidComments;
+ return;
+ }
+
+ Result->Key.LoopRegister = LoopRegister;
+ return;
+ }
}
unsigned numInvalidComments() const { return InvalidComments; }
@@ -221,6 +236,11 @@ Expected<std::vector<BenchmarkCode>> readSnippets(const LLVMState &State,
BenchmarkCode Result;
+ // Ensure that there is a default loop register value specified.
+ Result.Key.LoopRegister =
+ State.getExegesisTarget().getDefaultLoopCounterRegister(
+ State.getTargetMachine().getTargetTriple());
+
const TargetMachine &TM = State.getTargetMachine();
MCContext Context(TM.getTargetTriple(), TM.getMCAsmInfo(),
TM.getMCRegisterInfo(), TM.getMCSubtargetInfo());
diff --git a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp
index 561687a62319..0bab30d15820 100644
--- a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp
+++ b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp
@@ -48,10 +48,8 @@ public:
class LoopSnippetRepetitor : public SnippetRepetitor {
public:
- explicit LoopSnippetRepetitor(const LLVMState &State)
- : SnippetRepetitor(State),
- LoopCounter(State.getExegesisTarget().getLoopCounterRegister(
- State.getTargetMachine().getTargetTriple())) {}
+ explicit LoopSnippetRepetitor(const LLVMState &State, unsigned LoopRegister)
+ : SnippetRepetitor(State), LoopCounter(LoopRegister) {}
// Loop over the snippet ceil(MinInstructions / Instructions.Size()) times.
FillFunction Repeat(ArrayRef<MCInst> Instructions, unsigned MinInstructions,
@@ -113,8 +111,8 @@ public:
(void)_;
Loop.addInstructions(Instructions);
}
- ET.decrementLoopCounterAndJump(*Loop.MBB, *Loop.MBB,
- State.getInstrInfo());
+ ET.decrementLoopCounterAndJump(*Loop.MBB, *Loop.MBB, State.getInstrInfo(),
+ LoopCounter);
// Set up the exit basic block.
Loop.MBB->addSuccessor(Exit.MBB, BranchProbability::getZero());
@@ -138,14 +136,14 @@ SnippetRepetitor::~SnippetRepetitor() {}
std::unique_ptr<const SnippetRepetitor>
SnippetRepetitor::Create(Benchmark::RepetitionModeE Mode,
- const LLVMState &State) {
+ const LLVMState &State, unsigned LoopRegister) {
switch (Mode) {
case Benchmark::Duplicate:
case Benchmark::MiddleHalfDuplicate:
return std::make_unique<DuplicateSnippetRepetitor>(State);
case Benchmark::Loop:
case Benchmark::MiddleHalfLoop:
- return std::make_unique<LoopSnippetRepetitor>(State);
+ return std::make_unique<LoopSnippetRepetitor>(State, LoopRegister);
case Benchmark::AggregateMin:
break;
}
diff --git a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h
index 2b3c416c9029..c62e80f161f1 100644
--- a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h
+++ b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h
@@ -29,7 +29,8 @@ namespace exegesis {
class SnippetRepetitor {
public:
static std::unique_ptr<const SnippetRepetitor>
- Create(Benchmark::RepetitionModeE Mode, const LLVMState &State);
+ Create(Benchmark::RepetitionModeE Mode, const LLVMState &State,
+ unsigned LoopRegister);
virtual ~SnippetRepetitor();
diff --git a/llvm/tools/llvm-exegesis/lib/Target.h b/llvm/tools/llvm-exegesis/lib/Target.h
index 7bbd946b0333..522c75d15703 100644
--- a/llvm/tools/llvm-exegesis/lib/Target.h
+++ b/llvm/tools/llvm-exegesis/lib/Target.h
@@ -202,12 +202,15 @@ public:
}
// Returns a counter usable as a loop counter.
- virtual unsigned getLoopCounterRegister(const Triple &) const { return 0; }
+ virtual unsigned getDefaultLoopCounterRegister(const Triple &) const {
+ return 0;
+ }
// Adds the code to decrement the loop counter and
virtual void decrementLoopCounterAndJump(MachineBasicBlock &MBB,
MachineBasicBlock &TargetMBB,
- const MCInstrInfo &MII) const {
+ const MCInstrInfo &MII,
+ unsigned LoopRegister) const {
llvm_unreachable("decrementLoopCounterAndBranch() requires "
"getLoopCounterRegister() > 0");
}
diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
index 6fc951a6e35d..a41a995f5560 100644
--- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -720,7 +720,7 @@ private:
unsigned getScratchMemoryRegister(const Triple &TT) const override;
- unsigned getLoopCounterRegister(const Triple &) const override;
+ unsigned getDefaultLoopCounterRegister(const Triple &) const override;
unsigned getMaxMemoryAccessSize() const override { return 64; }
@@ -733,7 +733,8 @@ private:
void decrementLoopCounterAndJump(MachineBasicBlock &MBB,
MachineBasicBlock &TargetMBB,
- const MCInstrInfo &MII) const override;
+ const MCInstrInfo &MII,
+ unsigned LoopRegister) const override;
std::vector<MCInst> setRegTo(const MCSubtargetInfo &STI, unsigned Reg,
const APInt &Value) const override;
@@ -852,7 +853,7 @@ const unsigned ExegesisX86Target::kUnavailableRegistersSSE[12] = {
// We're using one of R8-R15 because these registers are never hardcoded in
// instructions (e.g. MOVS writes to EDI, ESI, EDX), so they have less
// conflicts.
-constexpr const unsigned kLoopCounterReg = X86::R8;
+constexpr const unsigned kDefaultLoopCounterReg = X86::R8;
} // namespace
@@ -870,11 +871,12 @@ unsigned ExegesisX86Target::getScratchMemoryRegister(const Triple &TT) const {
return TT.isOSWindows() ? X86::RCX : X86::RDI;
}
-unsigned ExegesisX86Target::getLoopCounterRegister(const Triple &TT) const {
+unsigned
+ExegesisX86Target::getDefaultLoopCounterRegister(const Triple &TT) const {
if (!TT.isArch64Bit()) {
return 0;
}
- return kLoopCounterReg;
+ return kDefaultLoopCounterReg;
}
Error ExegesisX86Target::randomizeTargetMCOperand(
@@ -912,10 +914,10 @@ void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT,
void ExegesisX86Target::decrementLoopCounterAndJump(
MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB,
- const MCInstrInfo &MII) const {
+ const MCInstrInfo &MII, unsigned LoopRegister) const {
BuildMI(&MBB, DebugLoc(), MII.get(X86::ADD64ri8))
- .addDef(kLoopCounterReg)
- .addUse(kLoopCounterReg)
+ .addDef(LoopRegister)
+ .addUse(LoopRegister)
.addImm(-1);
BuildMI(&MBB, DebugLoc(), MII.get(X86::JCC_1))
.addMBB(&TargetMBB)
diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
index 782d44422791..1ae2565e894c 100644
--- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
@@ -497,22 +497,42 @@ void benchmarkMain() {
}
const auto Opcodes = getOpcodesOrDie(State);
+ std::vector<BenchmarkCode> Configurations;
+
+ unsigned LoopRegister =
+ State.getExegesisTarget().getDefaultLoopCounterRegister(
+ State.getTargetMachine().getTargetTriple());
+
+ if (Opcodes.empty()) {
+ Configurations = ExitOnErr(readSnippets(State, SnippetsFile));
+ for (const auto &Configuration : Configurations) {
+ if (ExecutionMode != BenchmarkRunner::ExecutionModeE::SubProcess &&
+ (Configuration.Key.MemoryMappings.size() != 0 ||
+ Configuration.Key.MemoryValues.size() != 0 ||
+ Configuration.Key.SnippetAddress != 0))
+ ExitWithError("Memory and snippet address annotations are only "
+ "supported in subprocess "
+ "execution mode");
+ }
+ LoopRegister = Configurations[0].Key.LoopRegister;
+ }
SmallVector<std::unique_ptr<const SnippetRepetitor>, 2> Repetitors;
if (RepetitionMode != Benchmark::RepetitionModeE::AggregateMin)
- Repetitors.emplace_back(SnippetRepetitor::Create(RepetitionMode, State));
+ Repetitors.emplace_back(
+ SnippetRepetitor::Create(RepetitionMode, State, LoopRegister));
else {
for (Benchmark::RepetitionModeE RepMode :
{Benchmark::RepetitionModeE::Duplicate,
Benchmark::RepetitionModeE::Loop})
- Repetitors.emplace_back(SnippetRepetitor::Create(RepMode, State));
+ Repetitors.emplace_back(
+ SnippetRepetitor::Create(RepMode, State, LoopRegister));
}
BitVector AllReservedRegs;
for (const std::unique_ptr<const SnippetRepetitor> &Repetitor : Repetitors)
AllReservedRegs |= Repetitor->getReservedRegs();
- std::vector<BenchmarkCode> Configurations;
if (!Opcodes.empty()) {
for (const unsigned Opcode : Opcodes) {
// Ignore instructions without a sched class if
@@ -534,17 +554,6 @@ void benchmarkMain() {
std::move(ConfigsForInstr->begin(), ConfigsForInstr->end(),
std::back_inserter(Configurations));
}
- } else {
- Configurations = ExitOnErr(readSnippets(State, SnippetsFile));
- for (const auto &Configuration : Configurations) {
- if (ExecutionMode != BenchmarkRunner::ExecutionModeE::SubProcess &&
- (Configuration.Key.MemoryMappings.size() != 0 ||
- Configuration.Key.MemoryValues.size() != 0 ||
- Configuration.Key.SnippetAddress != 0))
- ExitWithError("Memory and snippet address annotations are only "
- "supported in subprocess "
- "execution mode");
- }
}
if (MinInstructions == 0) {
diff --git a/llvm/tools/llvm-objcopy/CommonOpts.td b/llvm/tools/llvm-objcopy/CommonOpts.td
index 4222532a1a38..c247c93f6e0f 100644
--- a/llvm/tools/llvm-objcopy/CommonOpts.td
+++ b/llvm/tools/llvm-objcopy/CommonOpts.td
@@ -7,6 +7,9 @@ multiclass Eq<string name, string help> {
HelpText<help>;
}
+def grp_coff : OptionGroup<"kind">, HelpText<"OPTIONS (COFF specific)">;
+def grp_macho : OptionGroup<"kind">, HelpText<"OPTIONS (Mach-O specific)">;
+
def help : Flag<["--"], "help">;
def h : Flag<["-"], "h">, Alias<help>;
@@ -39,13 +42,13 @@ def p : Flag<["-"], "p">,
HelpText<"Alias for --preserve-dates">;
def strip_all : Flag<["--"], "strip-all">,
- HelpText<"Remove non-allocated sections outside segments. "
- ".gnu.warning* and .ARM.attribute sections are not "
- "removed">;
+ HelpText<"For ELF, remove all symbols and non-alloc sections not within "
+ "segments, except for .gnu.warning*, .ARM.attribute, and the section name table. "
+ "For COFF and Mach-O, remove all symbols, debug sections, and relocations">;
def strip_all_gnu
: Flag<["--"], "strip-all-gnu">,
- HelpText<"Compatible with GNU's --strip-all">;
+ HelpText<"Remove all symbols, debug sections and relocations. Compatible with GNU's --strip-all">;
def strip_debug : Flag<["--"], "strip-debug">,
HelpText<"Remove all debug sections">;
@@ -64,7 +67,7 @@ def R : JoinedOrSeparate<["-"], "R">,
def strip_sections
: Flag<["--"], "strip-sections">,
- HelpText<"Remove all section headers and all sections not in segments">;
+ HelpText<"Remove all section headers and all section data not within segments">;
defm strip_symbol : Eq<"strip-symbol", "Strip <symbol>">,
MetaVarName<"symbol">;
@@ -75,17 +78,17 @@ def N : JoinedOrSeparate<["-"], "N">,
defm keep_section : Eq<"keep-section", "Keep <section>">,
MetaVarName<"section">;
-defm keep_symbol : Eq<"keep-symbol", "Do not remove symbol <symbol>">,
+defm keep_symbol : Eq<"keep-symbol", "When removing symbols, do not remove <symbol>">,
MetaVarName<"symbol">;
def K : JoinedOrSeparate<["-"], "K">,
Alias<keep_symbol>,
HelpText<"Alias for --keep-symbol">;
def keep_file_symbols : Flag<["--"], "keep-file-symbols">,
- HelpText<"Do not remove file symbols">;
+ HelpText<"Keep symbols of type STT_FILE, even if they would otherwise be stripped">;
def keep_undefined : Flag<["--"], "keep-undefined">,
- HelpText<"Do not remove undefined symbols">;
+ HelpText<"Do not remove undefined symbols">, Group<grp_macho>;
def only_keep_debug
: Flag<["--"], "only-keep-debug">,
@@ -94,16 +97,16 @@ def only_keep_debug
"sections useful for debugging purposes">;
def discard_locals : Flag<["--"], "discard-locals">,
- HelpText<"Remove compiler-generated local symbols, (e.g. "
- "symbols starting with .L)">;
+ HelpText<"Remove local symbols starting with .L">;
def X : Flag<["-"], "X">,
Alias<discard_locals>,
HelpText<"Alias for --discard-locals">;
def discard_all
: Flag<["--"], "discard-all">,
- HelpText<"Remove all local symbols except file and section symbols. Also "
- "remove all debug sections">;
+ HelpText<"Remove most local symbols. Different file formats may limit this to a subset. "
+ "For ELF, file and section symbols are not discarded. "
+ "Additionally, remove all debug sections">;
def x : Flag<["-"], "x">,
Alias<discard_all>,
HelpText<"Alias for --discard-all">;
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
index ec9dc0a2a814..6318578b1100 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
+++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
@@ -254,6 +254,21 @@ parseSetSectionFlagValue(StringRef FlagValue) {
return SFU;
}
+static Expected<uint8_t> parseVisibilityType(StringRef VisType) {
+ const uint8_t Invalid = 0xff;
+ uint8_t type = StringSwitch<uint8_t>(VisType)
+ .Case("default", ELF::STV_DEFAULT)
+ .Case("hidden", ELF::STV_HIDDEN)
+ .Case("internal", ELF::STV_INTERNAL)
+ .Case("protected", ELF::STV_PROTECTED)
+ .Default(Invalid);
+ if (type == Invalid)
+ return createStringError(errc::invalid_argument,
+ "'%s' is not a valid symbol visibility",
+ VisType.str().c_str());
+ return type;
+}
+
namespace {
struct TargetInfo {
FileFormat Format;
@@ -969,6 +984,33 @@ objcopy::parseObjcopyOptions(ArrayRef<const char *> RawArgsArr,
Config.SymbolsToAdd.push_back(*SymInfo);
}
+ for (auto *Arg : InputArgs.filtered(OBJCOPY_set_symbol_visibility)) {
+ if (!StringRef(Arg->getValue()).contains('='))
+ return createStringError(errc::invalid_argument,
+ "bad format for --set-symbol-visibility");
+ auto [Sym, Visibility] = StringRef(Arg->getValue()).split('=');
+ Expected<uint8_t> Type = parseVisibilityType(Visibility);
+ if (!Type)
+ return Type.takeError();
+ ELFConfig.SymbolsToSetVisibility.emplace_back(NameMatcher(), *Type);
+ if (Error E = ELFConfig.SymbolsToSetVisibility.back().first.addMatcher(
+ NameOrPattern::create(Sym, SymbolMatchStyle, ErrorCallback)))
+ return std::move(E);
+ }
+ for (auto *Arg : InputArgs.filtered(OBJCOPY_set_symbols_visibility)) {
+ if (!StringRef(Arg->getValue()).contains('='))
+ return createStringError(errc::invalid_argument,
+ "bad format for --set-symbols-visibility");
+ auto [File, Visibility] = StringRef(Arg->getValue()).split('=');
+ Expected<uint8_t> Type = parseVisibilityType(Visibility);
+ if (!Type)
+ return Type.takeError();
+ ELFConfig.SymbolsToSetVisibility.emplace_back(NameMatcher(), *Type);
+ if (Error E =
+ addSymbolsFromFile(ELFConfig.SymbolsToSetVisibility.back().first,
+ DC.Alloc, File, SymbolMatchStyle, ErrorCallback))
+ return std::move(E);
+ }
ELFConfig.AllowBrokenLinks = InputArgs.hasArg(OBJCOPY_allow_broken_links);
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOpts.td b/llvm/tools/llvm-objcopy/ObjcopyOpts.td
index bd041fabbdd7..3c0e5cd475a3 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOpts.td
+++ b/llvm/tools/llvm-objcopy/ObjcopyOpts.td
@@ -6,28 +6,25 @@ def B : JoinedOrSeparate<["-"], "B">,
Alias<binary_architecture>,
HelpText<"Alias for --binary-architecture">;
-defm target : Eq<"target", "Format of the input and output file">,
+defm target : Eq<"target", "Equivalent to --input-target and --output-target for the specified format">,
Values<"binary">;
def F : JoinedOrSeparate<["-"], "F">,
Alias<target>,
HelpText<"Alias for --target">;
-defm input_target : Eq<"input-target", "Format of the input file">,
- Values<"binary">;
+defm input_target : Eq<"input-target", "Read the input as the specified format">, MetaVarName<"format">;
def I : JoinedOrSeparate<["-"], "I">,
Alias<input_target>,
HelpText<"Alias for --input-target">;
-defm output_target : Eq<"output-target", "Format of the output file">,
- Values<"binary">;
+defm output_target : Eq<"output-target", "Write the output as the specified format">, MetaVarName<"format">;
def O : JoinedOrSeparate<["-"], "O">,
Alias<output_target>,
HelpText<"Alias for --output-target">;
-defm new_symbol_visibility : Eq<"new-symbol-visibility", "Visibility of "
- "symbols generated for binary input or added"
- " with --add-symbol unless otherwise"
- " specified. The default value is 'default'">;
+defm new_symbol_visibility
+ : Eq<"new-symbol-visibility", "Specify the visibility of symbols automatically "
+ "created when using binary input or --add-symbol. The default is 'default'">;
def compress_debug_sections
: Joined<["--"], "compress-debug-sections=">,
@@ -39,8 +36,8 @@ def : Flag<["--"], "compress-debug-sections">, Alias<compress_debug_sections>,
def decompress_debug_sections : Flag<["--"], "decompress-debug-sections">,
HelpText<"Decompress DWARF debug sections">;
defm split_dwo
- : Eq<"split-dwo", "Equivalent to extract-dwo on the input file to "
- "<dwo-file>, then strip-dwo on the input file">,
+ : Eq<"split-dwo", "Equivalent to --extract-dwo and <dwo-file> as the output file and no other options, "
+ "and then --strip-dwo on the input file">,
MetaVarName<"dwo-file">;
defm add_gnu_debuglink
@@ -49,17 +46,15 @@ defm add_gnu_debuglink
defm rename_section
: Eq<"rename-section",
- "Renames a section from old to new, optionally with specified flags. "
- "Flags supported for GNU compatibility: alloc, load, noload, "
- "readonly, exclude, debug, code, data, rom, share, contents, merge, "
- "strings, large">,
+ "Rename sections called <old> to <new>, and apply any specified <flag> values. "
+ "See --set-section-flags for a list of supported flags">,
MetaVarName<"old=new[,flag1,...]">;
defm redefine_symbol
: Eq<"redefine-sym", "Change the name of a symbol old to new">,
MetaVarName<"old=new">;
defm redefine_symbols
: Eq<"redefine-syms",
- "Reads a list of symbol pairs from <filename> and runs as if "
+ "Read a list of symbol pairs from <filename> and run as if "
"--redefine-sym=<old>=<new> is set for each one. <filename> "
"contains two symbols per line separated with whitespace and may "
"contain comments beginning with '#'. Leading and trailing "
@@ -74,7 +69,7 @@ def j : JoinedOrSeparate<["-"], "j">,
HelpText<"Alias for --only-section">;
defm add_section
: Eq<"add-section",
- "Make a section named <section> with the contents of <file>">,
+ "Add a section named <section> with the contents of <file>">,
MetaVarName<"section=file">;
defm set_section_alignment
@@ -83,8 +78,8 @@ defm set_section_alignment
defm set_section_flags
: Eq<"set-section-flags",
- "Set section flags for a given section. Flags supported for GNU "
- "compatibility: alloc, load, noload, readonly, exclude, debug, code, "
+ "Set section properties based on the specified <flags>. Supported flag names are: "
+ "alloc, load, noload, readonly, exclude, debug, code, "
"data, rom, share, contents, merge, strings, large">,
MetaVarName<"section=flag1[,flag2,...]">;
@@ -93,28 +88,40 @@ defm set_section_type
"Set the type of section <section> to the integer <type>">,
MetaVarName<"section=type">;
+defm set_symbol_visibility
+ : Eq<"set-symbol-visibility",
+ "Change the visibility of a symbol to the specified value">,
+ MetaVarName<"symbol=visibility">;
+defm set_symbols_visibility
+ : Eq<"set-symbols-visibility",
+ "Read a list of symbols from <filename> and change their "
+ "visibility to the specified value. Visibility values: default, "
+ "internal, hidden, protected">,
+ MetaVarName<"filename=visibility">;
+
def S : Flag<["-"], "S">,
Alias<strip_all>,
HelpText<"Alias for --strip-all">;
def strip_dwo : Flag<["--"], "strip-dwo">,
- HelpText<"Remove all DWARF .dwo sections from file">;
+ HelpText<"Remove all DWARF .dwo sections">;
def strip_non_alloc
: Flag<["--"], "strip-non-alloc">,
- HelpText<"Remove all non-allocated sections outside segments">;
+ HelpText<"Remove all non-allocated sections that are not within segments">;
defm strip_unneeded_symbol
: Eq<"strip-unneeded-symbol",
- "Remove symbol <symbol> if it is not needed by relocations">,
+ "Remove all symbols named <symbol> that are local or undefined and "
+ "are not required by any relocation">,
MetaVarName<"symbol">;
defm strip_unneeded_symbols
: Eq<"strip-unneeded-symbols",
- "Reads a list of symbols from <filename> and removes them "
- "if they are not needed by relocations">,
+ "Remove all symbols whose names appear in the file <file>, if they "
+ "are local or undefined and are not required by any relocation">,
MetaVarName<"filename">;
defm subsystem
: Eq<"subsystem",
- "Set PE subsystem and version">,
- MetaVarName<"name[:version]">;
+ "Set the PE subsystem, and optionally subsystem version">,
+ MetaVarName<"name[:version]">, Group<grp_coff>;
def extract_dwo
: Flag<["--"], "extract-dwo">,
@@ -132,11 +139,13 @@ def localize_hidden
: Flag<["--"], "localize-hidden">,
HelpText<
"Mark all symbols that have hidden or internal visibility as local">;
-defm localize_symbol : Eq<"localize-symbol", "Mark <symbol> as local">,
- MetaVarName<"symbol">;
+defm localize_symbol
+ : Eq<"localize-symbol", "Mark any defined non-common symbol named <symbol> as local">,
+ MetaVarName<"symbol">;
defm localize_symbols
: Eq<"localize-symbols",
- "Reads a list of symbols from <filename> and marks them local">,
+ "Read a list of names from <filename> and mark any defined non-common "
+ "symbols with those names as local">,
MetaVarName<"filename">;
def L : JoinedOrSeparate<["-"], "L">,
@@ -148,13 +157,14 @@ defm globalize_symbol : Eq<"globalize-symbol", "Mark <symbol> as global">,
defm globalize_symbols
: Eq<"globalize-symbols",
- "Reads a list of symbols from <filename> and marks them global">,
+ "Read a list of symbols from <filename> and mark defined symbols"
+ " with those names as global">,
MetaVarName<"filename">;
defm keep_global_symbol
: Eq<"keep-global-symbol",
- "Convert all symbols except <symbol> to local. May be repeated to "
- "convert all except a set of symbols to local">,
+ "Mark all symbols local, except for symbols with the name <symbol>. "
+ "Can be specified multiple times to ignore multiple symbols">,
MetaVarName<"symbol">;
def G : JoinedOrSeparate<["-"], "G">,
Alias<keep_global_symbol>,
@@ -162,34 +172,34 @@ def G : JoinedOrSeparate<["-"], "G">,
defm keep_global_symbols
: Eq<"keep-global-symbols",
- "Reads a list of symbols from <filename> and runs as if "
+ "Read a list of symbols from <filename> and run as if "
"--keep-global-symbol=<symbol> is set for each one. <filename> "
"contains one symbol per line and may contain comments beginning with "
"'#'. Leading and trailing whitespace is stripped from each line. May "
"be repeated to read symbols from many files">,
MetaVarName<"filename">;
-defm weaken_symbol : Eq<"weaken-symbol", "Mark <symbol> as weak">,
+defm weaken_symbol : Eq<"weaken-symbol", "Mark global symbols named <symbol> as weak">,
MetaVarName<"symbol">;
defm weaken_symbols
: Eq<"weaken-symbols",
- "Reads a list of symbols from <filename> and marks them weak">,
+ "Read a list of symbols from <filename> and mark global symbols with those names as weak">,
MetaVarName<"filename">;
def W : JoinedOrSeparate<["-"], "W">,
Alias<weaken_symbol>,
HelpText<"Alias for --weaken-symbol">;
def weaken : Flag<["--"], "weaken">,
- HelpText<"Mark all global symbols as weak">;
+ HelpText<"Mark all defined global symbols as weak">;
defm strip_symbols
: Eq<"strip-symbols",
- "Reads a list of symbols from <filename> and removes them">,
+ "Remove all symbols whose names appear in the file <filename>">,
MetaVarName<"filename">;
defm keep_symbols
: Eq<"keep-symbols",
- "Reads a list of symbols from <filename> and runs as if "
+ "Read a list of symbols from <filename> and run as if "
"--keep-symbol=<symbol> is set for each one. <filename> "
"contains one symbol per line and may contain comments beginning with "
"'#'. Leading and trailing whitespace is stripped from each line. May "
diff --git a/llvm/tools/llvm-objdump/ObjdumpOpts.td b/llvm/tools/llvm-objdump/ObjdumpOpts.td
index c1dec5ced89d..c3764c6e9753 100644
--- a/llvm/tools/llvm-objdump/ObjdumpOpts.td
+++ b/llvm/tools/llvm-objdump/ObjdumpOpts.td
@@ -210,6 +210,10 @@ def : Flag<["-"], "t">, Alias<syms>, HelpText<"Alias for --syms">;
def symbolize_operands : Flag<["--"], "symbolize-operands">,
HelpText<"Symbolize instruction operands when disassembling">;
+def pretty_pgo_analysis_map : Flag<["--"], "pretty-pgo-analysis-map">,
+ HelpText<"Display PGO analysis values with "
+ "formatting rather than raw numbers">;
+
def dynamic_syms : Flag<["--"], "dynamic-syms">,
HelpText<"Display the contents of the dynamic symbol table">;
def : Flag<["-"], "T">, Alias<dynamic_syms>,
diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp
index 948a5d74e1ab..78cf67b1e630 100644
--- a/llvm/tools/llvm-objdump/llvm-objdump.cpp
+++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp
@@ -188,8 +188,10 @@ public:
const BBAddrMap &getAddrMap() const { return AddrMap; }
// Returns the PGO string associated with the entry of index `PGOBBEntryIndex`
- // in `PGOMap`.
- std::string constructPGOLabelString(size_t PGOBBEntryIndex) const {
+ // in `PGOMap`. If PrettyPGOAnalysis is true, prints BFI as relative frequency
+ // and BPI as percentage. Otherwise raw values are displayed.
+ std::string constructPGOLabelString(size_t PGOBBEntryIndex,
+ bool PrettyPGOAnalysis) const {
if (!PGOMap.FeatEnable.hasPGOAnalysis())
return "";
std::string PGOString;
@@ -211,7 +213,12 @@ public:
PGOMap.BBEntries[PGOBBEntryIndex];
if (PGOMap.FeatEnable.BBFreq) {
- PGOSS << "Frequency: " << Twine(PGOBBEntry.BlockFreq.getFrequency());
+ PGOSS << "Frequency: ";
+ if (PrettyPGOAnalysis)
+ printRelativeBlockFreq(PGOSS, PGOMap.BBEntries.front().BlockFreq,
+ PGOBBEntry.BlockFreq);
+ else
+ PGOSS << Twine(PGOBBEntry.BlockFreq.getFrequency());
if (PGOMap.FeatEnable.BrProb && PGOBBEntry.Successors.size() > 0) {
PGOSS << ", ";
}
@@ -220,9 +227,12 @@ public:
PGOSS << "Successors: ";
interleaveComma(
PGOBBEntry.Successors, PGOSS,
- [&PGOSS](const PGOAnalysisMap::PGOBBEntry::SuccessorEntry &SE) {
+ [&](const PGOAnalysisMap::PGOBBEntry::SuccessorEntry &SE) {
PGOSS << "BB" << SE.ID << ":";
- PGOSS.write_hex(SE.Prob.getNumerator());
+ if (PrettyPGOAnalysis)
+ PGOSS << "[" << SE.Prob << "]";
+ else
+ PGOSS.write_hex(SE.Prob.getNumerator());
});
}
}
@@ -331,6 +341,7 @@ static bool HasStopAddressFlag;
bool objdump::SymbolTable;
static bool SymbolizeOperands;
+static bool PrettyPGOAnalysisMap;
static bool DynamicSymbolTable;
std::string objdump::TripleName;
bool objdump::UnwindInfo;
@@ -1410,8 +1421,8 @@ static void collectBBAddrMapLabels(
std::string LabelString = ("BB" + Twine(BBEntry.ID)).str();
Labels[BBAddress].push_back(
- {LabelString,
- FunctionMap->constructPGOLabelString(NumBBEntriesBeforeRange + I)});
+ {LabelString, FunctionMap->constructPGOLabelString(
+ NumBBEntriesBeforeRange + I, PrettyPGOAnalysisMap)});
}
}
@@ -3473,6 +3484,10 @@ static void parseObjdumpOptions(const llvm::opt::InputArgList &InputArgs) {
HasStopAddressFlag = InputArgs.hasArg(OBJDUMP_stop_address_EQ);
SymbolTable = InputArgs.hasArg(OBJDUMP_syms);
SymbolizeOperands = InputArgs.hasArg(OBJDUMP_symbolize_operands);
+ PrettyPGOAnalysisMap = InputArgs.hasArg(OBJDUMP_pretty_pgo_analysis_map);
+ if (PrettyPGOAnalysisMap && !SymbolizeOperands)
+ reportCmdLineWarning("--symbolize-operands must be enabled for "
+ "--pretty-pgo-analysis-map to have an effect");
DynamicSymbolTable = InputArgs.hasArg(OBJDUMP_dynamic_syms);
TripleName = InputArgs.getLastArgValue(OBJDUMP_triple_EQ).str();
UnwindInfo = InputArgs.hasArg(OBJDUMP_unwind_info);
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index 4be678df4412..e78732353cc8 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -593,7 +593,7 @@ public:
void printVersionDefinitionSection(const Elf_Shdr *Sec) override;
void printVersionDependencySection(const Elf_Shdr *Sec) override;
void printCGProfile() override;
- void printBBAddrMaps() override;
+ void printBBAddrMaps(bool PrettyPGOAnalysis) override;
void printAddrsig() override;
void printNotes() override;
void printELFLinkerOptions() override;
@@ -704,7 +704,7 @@ public:
void printVersionDefinitionSection(const Elf_Shdr *Sec) override;
void printVersionDependencySection(const Elf_Shdr *Sec) override;
void printCGProfile() override;
- void printBBAddrMaps() override;
+ void printBBAddrMaps(bool PrettyPGOAnalysis) override;
void printAddrsig() override;
void printNotes() override;
void printELFLinkerOptions() override;
@@ -5036,7 +5036,8 @@ template <class ELFT> void GNUELFDumper<ELFT>::printCGProfile() {
OS << "GNUStyle::printCGProfile not implemented\n";
}
-template <class ELFT> void GNUELFDumper<ELFT>::printBBAddrMaps() {
+template <class ELFT>
+void GNUELFDumper<ELFT>::printBBAddrMaps(bool /*PrettyPGOAnalysis*/) {
OS << "GNUStyle::printBBAddrMaps not implemented\n";
}
@@ -7526,7 +7527,8 @@ template <class ELFT> void LLVMELFDumper<ELFT>::printCGProfile() {
}
}
-template <class ELFT> void LLVMELFDumper<ELFT>::printBBAddrMaps() {
+template <class ELFT>
+void LLVMELFDumper<ELFT>::printBBAddrMaps(bool PrettyPGOAnalysis) {
bool IsRelocatable = this->Obj.getHeader().e_type == ELF::ET_REL;
using Elf_Shdr = typename ELFT::Shdr;
auto IsMatch = [](const Elf_Shdr &Sec) -> bool {
@@ -7605,21 +7607,28 @@ template <class ELFT> void LLVMELFDumper<ELFT>::printBBAddrMaps() {
for (const PGOAnalysisMap::PGOBBEntry &PBBE : PAM.BBEntries) {
DictScope L(W);
- /// FIXME: currently we just emit the raw frequency, it may be
- /// better to provide an option to scale it by the first entry
- /// frequence using BlockFrequency::Scaled64 number
- if (PAM.FeatEnable.BBFreq)
- W.printNumber("Frequency", PBBE.BlockFreq.getFrequency());
+ if (PAM.FeatEnable.BBFreq) {
+ if (PrettyPGOAnalysis) {
+ std::string BlockFreqStr;
+ raw_string_ostream SS(BlockFreqStr);
+ printRelativeBlockFreq(SS, PAM.BBEntries.front().BlockFreq,
+ PBBE.BlockFreq);
+ W.printString("Frequency", BlockFreqStr);
+ } else {
+ W.printNumber("Frequency", PBBE.BlockFreq.getFrequency());
+ }
+ }
if (PAM.FeatEnable.BrProb) {
ListScope L(W, "Successors");
for (const auto &Succ : PBBE.Successors) {
DictScope L(W);
W.printNumber("ID", Succ.ID);
- /// FIXME: currently we just emit the raw numerator of the
- /// probably, it may be better to provide an option to emit it
- /// as a percentage or other prettied representation
- W.printHex("Probability", Succ.Prob.getNumerator());
+ if (PrettyPGOAnalysis) {
+ W.printObject("Probability", Succ.Prob);
+ } else {
+ W.printHex("Probability", Succ.Prob.getNumerator());
+ }
}
}
}
diff --git a/llvm/tools/llvm-readobj/ObjDumper.h b/llvm/tools/llvm-readobj/ObjDumper.h
index 3958dd3a3333..cd744e3bbfb7 100644
--- a/llvm/tools/llvm-readobj/ObjDumper.h
+++ b/llvm/tools/llvm-readobj/ObjDumper.h
@@ -129,7 +129,9 @@ public:
virtual void printGroupSections() {}
virtual void printHashHistograms() {}
virtual void printCGProfile() {}
- virtual void printBBAddrMaps() {}
+ // If PrettyPGOAnalysis is true, prints BFI as relative frequency and BPI as
+ // percentage. Otherwise raw values are displayed.
+ virtual void printBBAddrMaps(bool PrettyPGOAnalysis) {}
virtual void printAddrsig() {}
virtual void printNotes() {}
virtual void printELFLinkerOptions() {}
diff --git a/llvm/tools/llvm-readobj/Opts.td b/llvm/tools/llvm-readobj/Opts.td
index 018facc278e8..1e9cde6b2e87 100644
--- a/llvm/tools/llvm-readobj/Opts.td
+++ b/llvm/tools/llvm-readobj/Opts.td
@@ -19,6 +19,7 @@ def all : FF<"all", "Equivalent to setting: --file-header, --program-headers, --
"--section-groups and --histogram">;
def arch_specific : FF<"arch-specific", "Display architecture-specific information">;
def bb_addr_map : FF<"bb-addr-map", "Display the BB address map section">;
+def pretty_pgo_analysis_map : FF<"pretty-pgo-analysis-map", "Display PGO analysis values with formatting rather than raw numbers">;
def cg_profile : FF<"cg-profile", "Display call graph profile section">;
def decompress : FF<"decompress", "Dump decompressed section content when used with -x or -p">;
defm demangle : BB<"demangle", "Demangle symbol names", "Do not demangle symbol names (default)">;
diff --git a/llvm/tools/llvm-readobj/llvm-readobj.cpp b/llvm/tools/llvm-readobj/llvm-readobj.cpp
index 979433d69011..a0b576566016 100644
--- a/llvm/tools/llvm-readobj/llvm-readobj.cpp
+++ b/llvm/tools/llvm-readobj/llvm-readobj.cpp
@@ -95,6 +95,7 @@ static bool Addrsig;
static bool All;
static bool ArchSpecificInfo;
static bool BBAddrMap;
+static bool PrettyPGOAnalysisMap;
bool ExpandRelocs;
static bool CGProfile;
static bool Decompress;
@@ -212,6 +213,11 @@ static void parseOptions(const opt::InputArgList &Args) {
opts::All = Args.hasArg(OPT_all);
opts::ArchSpecificInfo = Args.hasArg(OPT_arch_specific);
opts::BBAddrMap = Args.hasArg(OPT_bb_addr_map);
+ opts::PrettyPGOAnalysisMap = Args.hasArg(OPT_pretty_pgo_analysis_map);
+ if (opts::PrettyPGOAnalysisMap && !opts::BBAddrMap)
+ WithColor::warning(errs(), ToolName)
+ << "--bb-addr-map must be enabled for --pretty-pgo-analysis-map to "
+ "have an effect\n";
opts::CGProfile = Args.hasArg(OPT_cg_profile);
opts::Decompress = Args.hasArg(OPT_decompress);
opts::Demangle = Args.hasFlag(OPT_demangle, OPT_no_demangle, false);
@@ -466,7 +472,7 @@ static void dumpObject(ObjectFile &Obj, ScopedPrinter &Writer,
if (opts::CGProfile)
Dumper->printCGProfile();
if (opts::BBAddrMap)
- Dumper->printBBAddrMaps();
+ Dumper->printBBAddrMaps(opts::PrettyPGOAnalysisMap);
if (opts::Addrsig)
Dumper->printAddrsig();
if (opts::Notes)
diff --git a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp
index b58d7cd952af..69d9b9a2f784 100644
--- a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp
+++ b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp
@@ -71,10 +71,10 @@ dumpDXContainer(MemoryBufferRef Source) {
break;
}
case dxbc::PartType::SFI0: {
- std::optional<uint64_t> Flags = Container.getShaderFlags();
+ std::optional<uint64_t> Flags = Container.getShaderFeatureFlags();
// Omit the flags in the YAML if they are missing or zero.
if (Flags && *Flags > 0)
- NewPart.Flags = DXContainerYAML::ShaderFlags(*Flags);
+ NewPart.Flags = DXContainerYAML::ShaderFeatureFlags(*Flags);
break;
}
case dxbc::PartType::HASH: {
diff --git a/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp b/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
index 7426884217a0..1f2b8c1754f6 100644
--- a/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
@@ -110,12 +110,12 @@ TEST_F(SelectionDAGAddressAnalysisTest, sameFrameObject) {
SDValue Index = DAG->getMemBasePlusOffset(FIPtr, Offset, Loc);
SDValue Store = DAG->getStore(DAG->getEntryNode(), Loc, Value, Index,
PtrInfo.getWithOffset(Offset));
- std::optional<int64_t> NumBytes = MemoryLocation::getSizeOrUnknown(
- cast<StoreSDNode>(Store)->getMemoryVT().getStoreSize());
+ TypeSize NumBytes = cast<StoreSDNode>(Store)->getMemoryVT().getStoreSize();
bool IsAlias;
bool IsValid = BaseIndexOffset::computeAliasing(
- Store.getNode(), NumBytes, Store.getNode(), NumBytes, *DAG, IsAlias);
+ Store.getNode(), LocationSize::precise(NumBytes), Store.getNode(),
+ LocationSize::precise(NumBytes), *DAG, IsAlias);
EXPECT_TRUE(IsValid);
EXPECT_TRUE(IsAlias);
@@ -134,14 +134,10 @@ TEST_F(SelectionDAGAddressAnalysisTest, sameFrameObjectUnknownSize) {
SDValue Store = DAG->getStore(DAG->getEntryNode(), Loc, Value, Index,
PtrInfo.getWithOffset(Offset));
- // Maybe unlikely that BaseIndexOffset::computeAliasing is used with the
- // optional NumBytes being unset like in this test, but it would be confusing
- // if that function determined IsAlias=false here.
- std::optional<int64_t> NumBytes;
-
bool IsAlias;
bool IsValid = BaseIndexOffset::computeAliasing(
- Store.getNode(), NumBytes, Store.getNode(), NumBytes, *DAG, IsAlias);
+ Store.getNode(), LocationSize::beforeOrAfterPointer(), Store.getNode(),
+ LocationSize::beforeOrAfterPointer(), *DAG, IsAlias);
EXPECT_FALSE(IsValid);
}
@@ -165,14 +161,13 @@ TEST_F(SelectionDAGAddressAnalysisTest, noAliasingFrameObjects) {
PtrInfo.getWithOffset(Offset0));
SDValue Store1 = DAG->getStore(DAG->getEntryNode(), Loc, Value, Index1,
PtrInfo.getWithOffset(Offset1));
- std::optional<int64_t> NumBytes0 = MemoryLocation::getSizeOrUnknown(
- cast<StoreSDNode>(Store0)->getMemoryVT().getStoreSize());
- std::optional<int64_t> NumBytes1 = MemoryLocation::getSizeOrUnknown(
- cast<StoreSDNode>(Store1)->getMemoryVT().getStoreSize());
+ TypeSize NumBytes0 = cast<StoreSDNode>(Store0)->getMemoryVT().getStoreSize();
+ TypeSize NumBytes1 = cast<StoreSDNode>(Store1)->getMemoryVT().getStoreSize();
bool IsAlias;
bool IsValid = BaseIndexOffset::computeAliasing(
- Store0.getNode(), NumBytes0, Store1.getNode(), NumBytes1, *DAG, IsAlias);
+ Store0.getNode(), LocationSize::precise(NumBytes0), Store1.getNode(),
+ LocationSize::precise(NumBytes1), *DAG, IsAlias);
EXPECT_TRUE(IsValid);
EXPECT_FALSE(IsAlias);
@@ -195,14 +190,13 @@ TEST_F(SelectionDAGAddressAnalysisTest, unknownSizeFrameObjects) {
DAG->getStore(DAG->getEntryNode(), Loc, Value, FIPtr, PtrInfo);
SDValue Store1 = DAG->getStore(DAG->getEntryNode(), Loc, Value, Index1,
MachinePointerInfo(PtrInfo.getAddrSpace()));
- std::optional<int64_t> NumBytes0 = MemoryLocation::getSizeOrUnknown(
- cast<StoreSDNode>(Store0)->getMemoryVT().getStoreSize());
- std::optional<int64_t> NumBytes1 = MemoryLocation::getSizeOrUnknown(
- cast<StoreSDNode>(Store1)->getMemoryVT().getStoreSize());
+ TypeSize NumBytes0 = cast<StoreSDNode>(Store0)->getMemoryVT().getStoreSize();
+ TypeSize NumBytes1 = cast<StoreSDNode>(Store1)->getMemoryVT().getStoreSize();
bool IsAlias;
bool IsValid = BaseIndexOffset::computeAliasing(
- Store0.getNode(), NumBytes0, Store1.getNode(), NumBytes1, *DAG, IsAlias);
+ Store0.getNode(), LocationSize::precise(NumBytes0), Store1.getNode(),
+ LocationSize::precise(NumBytes1), *DAG, IsAlias);
EXPECT_FALSE(IsValid);
}
@@ -220,20 +214,19 @@ TEST_F(SelectionDAGAddressAnalysisTest, globalWithFrameObject) {
SDValue Index = DAG->getMemBasePlusOffset(FIPtr, Offset, Loc);
SDValue Store = DAG->getStore(DAG->getEntryNode(), Loc, Value, Index,
PtrInfo.getWithOffset(Offset));
- std::optional<int64_t> NumBytes = MemoryLocation::getSizeOrUnknown(
- cast<StoreSDNode>(Store)->getMemoryVT().getStoreSize());
+ TypeSize NumBytes = cast<StoreSDNode>(Store)->getMemoryVT().getStoreSize();
EVT GTy = DAG->getTargetLoweringInfo().getValueType(DAG->getDataLayout(),
G->getType());
SDValue GValue = DAG->getConstant(0, Loc, GTy);
SDValue GAddr = DAG->getGlobalAddress(G, Loc, GTy);
SDValue GStore = DAG->getStore(DAG->getEntryNode(), Loc, GValue, GAddr,
MachinePointerInfo(G, 0));
- std::optional<int64_t> GNumBytes = MemoryLocation::getSizeOrUnknown(
- cast<StoreSDNode>(GStore)->getMemoryVT().getStoreSize());
+ TypeSize GNumBytes = cast<StoreSDNode>(GStore)->getMemoryVT().getStoreSize();
bool IsAlias;
bool IsValid = BaseIndexOffset::computeAliasing(
- Store.getNode(), NumBytes, GStore.getNode(), GNumBytes, *DAG, IsAlias);
+ Store.getNode(), LocationSize::precise(NumBytes), GStore.getNode(),
+ LocationSize::precise(GNumBytes), *DAG, IsAlias);
EXPECT_TRUE(IsValid);
EXPECT_FALSE(IsAlias);
@@ -248,8 +241,7 @@ TEST_F(SelectionDAGAddressAnalysisTest, globalWithAliasedGlobal) {
SDValue GAddr = DAG->getGlobalAddress(G, Loc, GTy);
SDValue GStore = DAG->getStore(DAG->getEntryNode(), Loc, GValue, GAddr,
MachinePointerInfo(G, 0));
- std::optional<int64_t> GNumBytes = MemoryLocation::getSizeOrUnknown(
- cast<StoreSDNode>(GStore)->getMemoryVT().getStoreSize());
+ TypeSize GNumBytes = cast<StoreSDNode>(GStore)->getMemoryVT().getStoreSize();
SDValue AliasedGValue = DAG->getConstant(1, Loc, GTy);
SDValue AliasedGAddr = DAG->getGlobalAddress(AliasedG, Loc, GTy);
@@ -258,9 +250,9 @@ TEST_F(SelectionDAGAddressAnalysisTest, globalWithAliasedGlobal) {
MachinePointerInfo(AliasedG, 0));
bool IsAlias;
- bool IsValid = BaseIndexOffset::computeAliasing(GStore.getNode(), GNumBytes,
- AliasedGStore.getNode(),
- GNumBytes, *DAG, IsAlias);
+ bool IsValid = BaseIndexOffset::computeAliasing(
+ GStore.getNode(), LocationSize::precise(GNumBytes),
+ AliasedGStore.getNode(), LocationSize::precise(GNumBytes), *DAG, IsAlias);
// With some deeper analysis we could detect if G and AliasedG is aliasing or
// not. But computeAliasing is currently defensive and assumes that a
@@ -290,19 +282,19 @@ TEST_F(SelectionDAGAddressAnalysisTest, fixedSizeFrameObjectsWithinDiff) {
PtrInfo.getWithOffset(Offset0));
SDValue Store1 = DAG->getStore(DAG->getEntryNode(), Loc, Value1, Index1,
PtrInfo.getWithOffset(Offset1));
- std::optional<int64_t> NumBytes0 = MemoryLocation::getSizeOrUnknown(
- cast<StoreSDNode>(Store0)->getMemoryVT().getStoreSize());
- std::optional<int64_t> NumBytes1 = MemoryLocation::getSizeOrUnknown(
- cast<StoreSDNode>(Store1)->getMemoryVT().getStoreSize());
+ TypeSize NumBytes0 = cast<StoreSDNode>(Store0)->getMemoryVT().getStoreSize();
+ TypeSize NumBytes1 = cast<StoreSDNode>(Store1)->getMemoryVT().getStoreSize();
bool IsAlias;
bool IsValid = BaseIndexOffset::computeAliasing(
- Store0.getNode(), NumBytes0, Store1.getNode(), NumBytes1, *DAG, IsAlias);
+ Store0.getNode(), LocationSize::precise(NumBytes0), Store1.getNode(),
+ LocationSize::precise(NumBytes1), *DAG, IsAlias);
EXPECT_TRUE(IsValid);
EXPECT_FALSE(IsAlias);
IsValid = BaseIndexOffset::computeAliasing(
- Store1.getNode(), NumBytes1, Store0.getNode(), NumBytes0, *DAG, IsAlias);
+ Store1.getNode(), LocationSize::precise(NumBytes1), Store0.getNode(),
+ LocationSize::precise(NumBytes0), *DAG, IsAlias);
EXPECT_TRUE(IsValid);
EXPECT_FALSE(IsAlias);
}
@@ -331,14 +323,13 @@ TEST_F(SelectionDAGAddressAnalysisTest, fixedSizeFrameObjectsOutOfDiff) {
PtrInfo.getWithOffset(Offset0));
SDValue Store1 = DAG->getStore(DAG->getEntryNode(), Loc, Value1, Index1,
PtrInfo.getWithOffset(Offset1));
- std::optional<int64_t> NumBytes0 = MemoryLocation::getSizeOrUnknown(
- cast<StoreSDNode>(Store0)->getMemoryVT().getStoreSize());
- std::optional<int64_t> NumBytes1 = MemoryLocation::getSizeOrUnknown(
- cast<StoreSDNode>(Store1)->getMemoryVT().getStoreSize());
+ TypeSize NumBytes0 = cast<StoreSDNode>(Store0)->getMemoryVT().getStoreSize();
+ TypeSize NumBytes1 = cast<StoreSDNode>(Store1)->getMemoryVT().getStoreSize();
bool IsAlias;
bool IsValid = BaseIndexOffset::computeAliasing(
- Store0.getNode(), NumBytes0, Store1.getNode(), NumBytes1, *DAG, IsAlias);
+ Store0.getNode(), LocationSize::precise(NumBytes0), Store1.getNode(),
+ LocationSize::precise(NumBytes1), *DAG, IsAlias);
EXPECT_TRUE(IsValid);
EXPECT_TRUE(IsAlias);
}
@@ -365,14 +356,13 @@ TEST_F(SelectionDAGAddressAnalysisTest, twoFixedStackObjects) {
PtrInfo0.getWithOffset(Offset0));
SDValue Store1 = DAG->getStore(DAG->getEntryNode(), Loc, Value1, Index1,
PtrInfo1.getWithOffset(Offset0));
- std::optional<int64_t> NumBytes0 = MemoryLocation::getSizeOrUnknown(
- cast<StoreSDNode>(Store0)->getMemoryVT().getStoreSize());
- std::optional<int64_t> NumBytes1 = MemoryLocation::getSizeOrUnknown(
- cast<StoreSDNode>(Store1)->getMemoryVT().getStoreSize());
+ TypeSize NumBytes0 = cast<StoreSDNode>(Store0)->getMemoryVT().getStoreSize();
+ TypeSize NumBytes1 = cast<StoreSDNode>(Store1)->getMemoryVT().getStoreSize();
bool IsAlias;
bool IsValid = BaseIndexOffset::computeAliasing(
- Store0.getNode(), NumBytes0, Store1.getNode(), NumBytes1, *DAG, IsAlias);
+ Store0.getNode(), LocationSize::precise(NumBytes0), Store1.getNode(),
+ LocationSize::precise(NumBytes1), *DAG, IsAlias);
EXPECT_TRUE(IsValid);
EXPECT_FALSE(IsAlias);
}
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index d923b25fda9f..fdbe8df783b1 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -3327,8 +3327,8 @@ TEST_F(OpenMPIRBuilderTest, SingleDirective) {
EXPECT_NE(IPBB->end(), IP.getPoint());
};
- Builder.restoreIP(OMPBuilder.createSingle(
- Builder, BodyGenCB, FiniCB, /*IsNowait*/ false, /*DidIt*/ nullptr));
+ Builder.restoreIP(
+ OMPBuilder.createSingle(Builder, BodyGenCB, FiniCB, /*IsNowait*/ false));
Value *EntryBBTI = EntryBB->getTerminator();
EXPECT_NE(EntryBBTI, nullptr);
EXPECT_TRUE(isa<BranchInst>(EntryBBTI));
@@ -3417,8 +3417,8 @@ TEST_F(OpenMPIRBuilderTest, SingleDirectiveNowait) {
EXPECT_NE(IPBB->end(), IP.getPoint());
};
- Builder.restoreIP(OMPBuilder.createSingle(
- Builder, BodyGenCB, FiniCB, /*IsNowait*/ true, /*DidIt*/ nullptr));
+ Builder.restoreIP(
+ OMPBuilder.createSingle(Builder, BodyGenCB, FiniCB, /*IsNowait*/ true));
Value *EntryBBTI = EntryBB->getTerminator();
EXPECT_NE(EntryBBTI, nullptr);
EXPECT_TRUE(isa<BranchInst>(EntryBBTI));
@@ -3464,6 +3464,151 @@ TEST_F(OpenMPIRBuilderTest, SingleDirectiveNowait) {
EXPECT_EQ(ExitBarrier, nullptr);
}
+// Helper class to check each instruction of a BB.
+class BBInstIter {
+ BasicBlock *BB;
+ BasicBlock::iterator BBI;
+
+public:
+ BBInstIter(BasicBlock *BB) : BB(BB), BBI(BB->begin()) {}
+
+ bool hasNext() const { return BBI != BB->end(); }
+
+ template <typename InstTy> InstTy *next() {
+ if (!hasNext())
+ return nullptr;
+ Instruction *Cur = &*BBI++;
+ if (!isa<InstTy>(Cur))
+ return nullptr;
+ return cast<InstTy>(Cur);
+ }
+};
+
+TEST_F(OpenMPIRBuilderTest, SingleDirectiveCopyPrivate) {
+ using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+ OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.initialize();
+ F->setName("func");
+ IRBuilder<> Builder(BB);
+
+ OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
+
+ AllocaInst *PrivAI = nullptr;
+
+ BasicBlock *EntryBB = nullptr;
+ BasicBlock *ThenBB = nullptr;
+
+ Value *CPVar = Builder.CreateAlloca(F->arg_begin()->getType());
+ Builder.CreateStore(F->arg_begin(), CPVar);
+
+ FunctionType *CopyFuncTy = FunctionType::get(
+ Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getPtrTy()}, false);
+ Function *CopyFunc =
+ Function::Create(CopyFuncTy, Function::PrivateLinkage, "copy_var", *M);
+
+ auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
+ if (AllocaIP.isSet())
+ Builder.restoreIP(AllocaIP);
+ else
+ Builder.SetInsertPoint(&*(F->getEntryBlock().getFirstInsertionPt()));
+ PrivAI = Builder.CreateAlloca(F->arg_begin()->getType());
+ Builder.CreateStore(F->arg_begin(), PrivAI);
+
+ llvm::BasicBlock *CodeGenIPBB = CodeGenIP.getBlock();
+ llvm::Instruction *CodeGenIPInst = &*CodeGenIP.getPoint();
+ EXPECT_EQ(CodeGenIPBB->getTerminator(), CodeGenIPInst);
+
+ Builder.restoreIP(CodeGenIP);
+
+ // collect some info for checks later
+ ThenBB = Builder.GetInsertBlock();
+ EntryBB = ThenBB->getUniquePredecessor();
+
+ // simple instructions for body
+ Value *PrivLoad =
+ Builder.CreateLoad(PrivAI->getAllocatedType(), PrivAI, "local.use");
+ Builder.CreateICmpNE(F->arg_begin(), PrivLoad);
+ };
+
+ auto FiniCB = [&](InsertPointTy IP) {
+ BasicBlock *IPBB = IP.getBlock();
+ // IP must be before the unconditional branch to ExitBB
+ EXPECT_NE(IPBB->end(), IP.getPoint());
+ };
+
+ Builder.restoreIP(OMPBuilder.createSingle(Builder, BodyGenCB, FiniCB,
+ /*IsNowait*/ false, {CPVar},
+ {CopyFunc}));
+ Value *EntryBBTI = EntryBB->getTerminator();
+ EXPECT_NE(EntryBBTI, nullptr);
+ EXPECT_TRUE(isa<BranchInst>(EntryBBTI));
+ BranchInst *EntryBr = cast<BranchInst>(EntryBB->getTerminator());
+ EXPECT_TRUE(EntryBr->isConditional());
+ EXPECT_EQ(EntryBr->getSuccessor(0), ThenBB);
+ BasicBlock *ExitBB = ThenBB->getUniqueSuccessor();
+ EXPECT_EQ(EntryBr->getSuccessor(1), ExitBB);
+
+ CmpInst *CondInst = cast<CmpInst>(EntryBr->getCondition());
+ EXPECT_TRUE(isa<CallInst>(CondInst->getOperand(0)));
+
+ CallInst *SingleEntryCI = cast<CallInst>(CondInst->getOperand(0));
+ EXPECT_EQ(SingleEntryCI->arg_size(), 2U);
+ EXPECT_EQ(SingleEntryCI->getCalledFunction()->getName(), "__kmpc_single");
+ EXPECT_TRUE(isa<GlobalVariable>(SingleEntryCI->getArgOperand(0)));
+
+ // check ThenBB
+ BBInstIter ThenBBI(ThenBB);
+ // load PrivAI
+ auto *PrivLI = ThenBBI.next<LoadInst>();
+ EXPECT_NE(PrivLI, nullptr);
+ EXPECT_EQ(PrivLI->getPointerOperand(), PrivAI);
+ // icmp
+ EXPECT_TRUE(ThenBBI.next<ICmpInst>());
+ // store 1, DidIt
+ auto *DidItSI = ThenBBI.next<StoreInst>();
+ EXPECT_NE(DidItSI, nullptr);
+ EXPECT_EQ(DidItSI->getValueOperand(),
+ ConstantInt::get(Type::getInt32Ty(Ctx), 1));
+ Value *DidIt = DidItSI->getPointerOperand();
+ // call __kmpc_end_single
+ auto *SingleEndCI = ThenBBI.next<CallInst>();
+ EXPECT_NE(SingleEndCI, nullptr);
+ EXPECT_EQ(SingleEndCI->getCalledFunction()->getName(), "__kmpc_end_single");
+ EXPECT_EQ(SingleEndCI->arg_size(), 2U);
+ EXPECT_TRUE(isa<GlobalVariable>(SingleEndCI->getArgOperand(0)));
+ EXPECT_EQ(SingleEndCI->getArgOperand(1), SingleEntryCI->getArgOperand(1));
+ // br ExitBB
+ auto *ExitBBBI = ThenBBI.next<BranchInst>();
+ EXPECT_NE(ExitBBBI, nullptr);
+ EXPECT_TRUE(ExitBBBI->isUnconditional());
+ EXPECT_EQ(ExitBBBI->getOperand(0), ExitBB);
+ EXPECT_FALSE(ThenBBI.hasNext());
+
+ // check ExitBB
+ BBInstIter ExitBBI(ExitBB);
+ // call __kmpc_global_thread_num
+ auto *ThreadNumCI = ExitBBI.next<CallInst>();
+ EXPECT_NE(ThreadNumCI, nullptr);
+ EXPECT_EQ(ThreadNumCI->getCalledFunction()->getName(),
+ "__kmpc_global_thread_num");
+ // load DidIt
+ auto *DidItLI = ExitBBI.next<LoadInst>();
+ EXPECT_NE(DidItLI, nullptr);
+ EXPECT_EQ(DidItLI->getPointerOperand(), DidIt);
+ // call __kmpc_copyprivate
+ auto *CopyPrivateCI = ExitBBI.next<CallInst>();
+ EXPECT_NE(CopyPrivateCI, nullptr);
+ EXPECT_EQ(CopyPrivateCI->arg_size(), 6U);
+ EXPECT_TRUE(isa<AllocaInst>(CopyPrivateCI->getArgOperand(3)));
+ EXPECT_EQ(CopyPrivateCI->getArgOperand(3), CPVar);
+ EXPECT_TRUE(isa<Function>(CopyPrivateCI->getArgOperand(4)));
+ EXPECT_EQ(CopyPrivateCI->getArgOperand(4), CopyFunc);
+ EXPECT_TRUE(isa<LoadInst>(CopyPrivateCI->getArgOperand(5)));
+ DidItLI = cast<LoadInst>(CopyPrivateCI->getArgOperand(5));
+ EXPECT_EQ(DidItLI->getOperand(0), DidIt);
+ EXPECT_FALSE(ExitBBI.hasNext());
+}
+
TEST_F(OpenMPIRBuilderTest, OMPAtomicReadFlt) {
OpenMPIRBuilder OMPBuilder(*M);
OMPBuilder.initialize();
diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp
index 65db93e537f7..c99f928de8a9 100644
--- a/llvm/unittests/IR/DebugInfoTest.cpp
+++ b/llvm/unittests/IR/DebugInfoTest.cpp
@@ -1082,7 +1082,10 @@ TEST(MetadataTest, DPValueConversionRoutines) {
EXPECT_FALSE(BB1->IsNewDbgInfoFormat);
// Validating the block for DPValues / DPMarkers shouldn't fail -- there's
// no data stored right now.
- EXPECT_FALSE(BB1->validateDbgValues(false, false));
+ bool BrokenDebugInfo = false;
+ bool Error = verifyModule(*M, &errs(), &BrokenDebugInfo);
+ EXPECT_FALSE(Error);
+ EXPECT_FALSE(BrokenDebugInfo);
// Function and module should be marked as not having the new format too.
EXPECT_FALSE(F->IsNewDbgInfoFormat);
@@ -1135,13 +1138,17 @@ TEST(MetadataTest, DPValueConversionRoutines) {
EXPECT_TRUE(!Inst.DbgMarker || Inst.DbgMarker->StoredDPValues.empty());
// Validating the first block should continue to not be a problem,
- EXPECT_FALSE(BB1->validateDbgValues(false, false));
+ Error = verifyModule(*M, &errs(), &BrokenDebugInfo);
+ EXPECT_FALSE(Error);
+ EXPECT_FALSE(BrokenDebugInfo);
// But if we were to break something, it should be able to fire. Don't attempt
// to comprehensively test the validator, it's a smoke-test rather than a
// "proper" verification pass.
DPV1->setMarker(nullptr);
// A marker pointing the wrong way should be an error.
- EXPECT_TRUE(BB1->validateDbgValues(false, false));
+ Error = verifyModule(*M, &errs(), &BrokenDebugInfo);
+ EXPECT_FALSE(Error);
+ EXPECT_TRUE(BrokenDebugInfo);
DPV1->setMarker(FirstInst->DbgMarker);
DILocalVariable *DLV1 = DPV1->getVariable();
diff --git a/llvm/unittests/Support/TypeSizeTest.cpp b/llvm/unittests/Support/TypeSizeTest.cpp
index 34fe376989e7..b02b7e600953 100644
--- a/llvm/unittests/Support/TypeSizeTest.cpp
+++ b/llvm/unittests/Support/TypeSizeTest.cpp
@@ -81,7 +81,6 @@ static_assert(INT64_C(2) * TSFixed32 == TypeSize::getFixed(64));
static_assert(UINT64_C(2) * TSFixed32 == TypeSize::getFixed(64));
static_assert(alignTo(TypeSize::getFixed(7), 8) == TypeSize::getFixed(8));
-static_assert(TypeSize() == TypeSize::getFixed(0));
static_assert(TypeSize::getZero() == TypeSize::getFixed(0));
static_assert(TypeSize::getZero() != TypeSize::getScalable(0));
static_assert(TypeSize::getFixed(0) != TypeSize::getScalable(0));
diff --git a/llvm/unittests/Target/ARM/MachineInstrTest.cpp b/llvm/unittests/Target/ARM/MachineInstrTest.cpp
index aeb25bf012d0..3a76054ca4f3 100644
--- a/llvm/unittests/Target/ARM/MachineInstrTest.cpp
+++ b/llvm/unittests/Target/ARM/MachineInstrTest.cpp
@@ -1126,7 +1126,9 @@ TEST(MachineInstr, HasSideEffects) {
VLDR_VPR_post,
VLDR_VPR_pre,
VLLDM,
+ VLLDM_T2,
VLSTM,
+ VLSTM_T2,
VMRS,
VMRS_FPCXTNS,
VMRS_FPCXTS,
diff --git a/llvm/unittests/tools/llvm-exegesis/X86/SnippetFileTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/SnippetFileTest.cpp
index 505a030675f6..f1fa89117117 100644
--- a/llvm/unittests/tools/llvm-exegesis/X86/SnippetFileTest.cpp
+++ b/llvm/unittests/tools/llvm-exegesis/X86/SnippetFileTest.cpp
@@ -219,6 +219,25 @@ TEST_F(X86SnippetFileTest, SnippetAddress) {
EXPECT_EQ(Snippet.Key.SnippetAddress, 0x10000);
}
+TEST_F(X86SnippetFileTest, LoopRegister) {
+ auto Snippets = TestCommon(R"(
+ # LLVM-EXEGESIS-LOOP-REGISTER R11
+ )");
+ ASSERT_TRUE(static_cast<bool>(Snippets));
+ EXPECT_THAT(*Snippets, SizeIs(1));
+ const auto &Snippet = (*Snippets)[0];
+ EXPECT_EQ(Snippet.Key.LoopRegister, X86::R11);
+}
+
+TEST_F(X86SnippetFileTest, LoopRegisterInvalidRegister) {
+ auto Error = TestCommon(R"(
+ # LLVM-EXEGESIS-LOOP-REGISTER INVALID
+ )")
+ .takeError();
+ EXPECT_TRUE(static_cast<bool>(Error));
+ consumeError(std::move(Error));
+}
+
} // namespace
} // namespace exegesis
} // namespace llvm
diff --git a/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp
index 25e8836087c1..b55ca5057ae0 100644
--- a/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp
+++ b/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp
@@ -40,7 +40,10 @@ protected:
void TestCommon(Benchmark::RepetitionModeE RepetitionMode,
unsigned SnippetInstructions = 1) {
- const auto Repetitor = SnippetRepetitor::Create(RepetitionMode, State);
+ const auto Repetitor = SnippetRepetitor::Create(
+ RepetitionMode, State,
+ State.getExegesisTarget().getDefaultLoopCounterRegister(
+ State.getTargetMachine().getTargetTriple()));
const std::vector<MCInst> Instructions(SnippetInstructions,
MCInstBuilder(X86::NOOP));
FunctionFiller Sink(*MF, {X86::EAX});
@@ -98,11 +101,12 @@ TEST_F(X86SnippetRepetitorTest, Loop) {
HasOpcode(X86::NOOP), HasOpcode(X86::NOOP),
HasOpcode(X86::NOOP), HasOpcode(X86::ADD64ri8),
HasOpcode(X86::JCC_1)));
- EXPECT_THAT(LoopBlock.liveins(),
- UnorderedElementsAre(
- LiveReg(X86::EAX),
- LiveReg(State.getExegesisTarget().getLoopCounterRegister(
- State.getTargetMachine().getTargetTriple()))));
+ EXPECT_THAT(
+ LoopBlock.liveins(),
+ UnorderedElementsAre(
+ LiveReg(X86::EAX),
+ LiveReg(State.getExegesisTarget().getDefaultLoopCounterRegister(
+ State.getTargetMachine().getTargetTriple()))));
EXPECT_THAT(MF->getBlockNumbered(2)->instrs(),
ElementsAre(HasOpcode(X86::RET64)));
}
diff --git a/llvm/utils/TableGen/CodeEmitterGen.cpp b/llvm/utils/TableGen/CodeEmitterGen.cpp
index d80761d5fe35..1e80eb6b1ad5 100644
--- a/llvm/utils/TableGen/CodeEmitterGen.cpp
+++ b/llvm/utils/TableGen/CodeEmitterGen.cpp
@@ -365,8 +365,8 @@ void CodeEmitterGen::emitInstructionBaseValues(
if (HwMode == -1)
o << " static const uint64_t InstBits[] = {\n";
else
- o << " static const uint64_t InstBits_" << HWM.getMode(HwMode).Name
- << "[] = {\n";
+ o << " static const uint64_t InstBits_"
+ << HWM.getModeName(HwMode, /*IncludeDefault=*/true) << "[] = {\n";
for (const CodeGenInstruction *CGI : NumberedInstructions) {
Record *R = CGI->TheDef;
@@ -495,8 +495,8 @@ void CodeEmitterGen::run(raw_ostream &o) {
o << " switch (HwMode) {\n";
o << " default: llvm_unreachable(\"Unknown hardware mode!\"); break;\n";
for (unsigned I : HwModes) {
- o << " case " << I << ": InstBits = InstBits_" << HWM.getMode(I).Name
- << "; break;\n";
+ o << " case " << I << ": InstBits = InstBits_"
+ << HWM.getModeName(I, /*IncludeDefault=*/true) << "; break;\n";
}
o << " };\n";
}
diff --git a/llvm/utils/TableGen/CodeGenHwModes.h b/llvm/utils/TableGen/CodeGenHwModes.h
index 56639f741ede..23723b7bd4af 100644
--- a/llvm/utils/TableGen/CodeGenHwModes.h
+++ b/llvm/utils/TableGen/CodeGenHwModes.h
@@ -52,6 +52,11 @@ struct CodeGenHwModes {
assert(Id != 0 && "Mode id of 0 is reserved for the default mode");
return Modes[Id - 1];
}
+ StringRef getModeName(unsigned Id, bool IncludeDefault = false) const {
+ if (IncludeDefault && Id == CodeGenHwModes::DefaultMode)
+ return DefaultModeName;
+ return getMode(Id).Name;
+ }
const HwModeSelect &getHwModeSelect(Record *R) const;
const std::map<Record *, HwModeSelect> &getHwModeSelects() const {
return ModeSelects;
diff --git a/llvm/utils/TableGen/CodeGenInstruction.h b/llvm/utils/TableGen/CodeGenInstruction.h
index 11a3acd8e723..963c9f0b2592 100644
--- a/llvm/utils/TableGen/CodeGenInstruction.h
+++ b/llvm/utils/TableGen/CodeGenInstruction.h
@@ -301,7 +301,7 @@ public:
Record *InferredFrom;
// The enum value assigned by CodeGenTarget::computeInstrsByEnum.
- mutable unsigned EnumVal;
+ mutable unsigned EnumVal = 0;
CodeGenInstruction(Record *R);
diff --git a/llvm/utils/TableGen/CodeGenSchedule.cpp b/llvm/utils/TableGen/CodeGenSchedule.cpp
index b4c624703626..d819016f8b56 100644
--- a/llvm/utils/TableGen/CodeGenSchedule.cpp
+++ b/llvm/utils/TableGen/CodeGenSchedule.cpp
@@ -2190,6 +2190,15 @@ void CodeGenSchedModels::addWriteRes(Record *ProcWriteResDef, unsigned PIdx) {
// Add resources for a ReadAdvance to this processor if they don't exist.
void CodeGenSchedModels::addReadAdvance(Record *ProcReadAdvanceDef,
unsigned PIdx) {
+ for (const Record *ValidWrite :
+ ProcReadAdvanceDef->getValueAsListOfDefs("ValidWrites"))
+ if (getSchedRWIdx(ValidWrite, /*IsRead=*/false) == 0)
+ PrintFatalError(
+ ProcReadAdvanceDef->getLoc(),
+ "ReadAdvance referencing a ValidWrite that is not used by "
+ "any instruction (" +
+ ValidWrite->getName() + ")");
+
RecVec &RADefs = ProcModels[PIdx].ReadAdvanceDefs;
if (is_contained(RADefs, ProcReadAdvanceDef))
return;
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index 4ce5a73d7756..27ff84bce405 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -2461,8 +2461,9 @@ collectHwModesReferencedForEncodings(const CodeGenHwModes &HWM,
BV.set(P.first);
}
}
- transform(BV.set_bits(), std::back_inserter(Names),
- [&HWM](const int &M) { return HWM.getMode(M).Name; });
+ transform(BV.set_bits(), std::back_inserter(Names), [&HWM](const int &M) {
+ return HWM.getModeName(M, /*IncludeDefault=*/true);
+ });
}
// Emits disassembler code for instruction decoding.
@@ -2503,8 +2504,9 @@ void DecoderEmitter::run(raw_ostream &o) {
if (DefInit *DI = dyn_cast_or_null<DefInit>(RV->getValue())) {
EncodingInfoByHwMode EBM(DI->getDef(), HWM);
for (auto &KV : EBM)
- NumberedEncodings.emplace_back(KV.second, NumberedInstruction,
- HWM.getMode(KV.first).Name);
+ NumberedEncodings.emplace_back(
+ KV.second, NumberedInstruction,
+ HWM.getModeName(KV.first, /*IncludeDefault=*/true));
continue;
}
}
diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp
index 2707f54eed6e..d350d7de139f 100644
--- a/llvm/utils/TableGen/SubtargetEmitter.cpp
+++ b/llvm/utils/TableGen/SubtargetEmitter.cpp
@@ -1262,7 +1262,10 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
WriteIDs.push_back(0);
else {
for (Record *VW : ValidWrites) {
- WriteIDs.push_back(SchedModels.getSchedRWIdx(VW, /*IsRead=*/false));
+ unsigned WriteID = SchedModels.getSchedRWIdx(VW, /*IsRead=*/false);
+ assert(WriteID != 0 &&
+ "Expected a valid SchedRW in the list of ValidWrites");
+ WriteIDs.push_back(WriteID);
}
}
llvm::sort(WriteIDs);
diff --git a/llvm/utils/gn/secondary/bolt/lib/Core/BUILD.gn b/llvm/utils/gn/secondary/bolt/lib/Core/BUILD.gn
index eeabaffc7d00..210dd1278509 100644
--- a/llvm/utils/gn/secondary/bolt/lib/Core/BUILD.gn
+++ b/llvm/utils/gn/secondary/bolt/lib/Core/BUILD.gn
@@ -25,6 +25,7 @@ static_library("Core") {
"BinarySection.cpp",
"DIEBuilder.cpp",
"DebugData.cpp",
+ "DebugNames.cpp",
"DynoStats.cpp",
"Exceptions.cpp",
"FunctionLayout.cpp",
diff --git a/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn
index fbff113613d2..5e533bf23ec4 100644
--- a/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn
@@ -8,6 +8,8 @@ static_library("InstallAPI") {
]
sources = [
"FileList.cpp",
+ "Frontend.cpp",
"HeaderFile.cpp",
+ "Visitor.cpp",
]
}
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn
index 747ca8f9c91d..b966b7484267 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn
@@ -75,7 +75,6 @@ static_library("LLVMHexagonCodeGen") {
"HexagonOptAddrMode.cpp",
"HexagonOptimizeSZextends.cpp",
"HexagonPeephole.cpp",
- "HexagonPostIncOpt.cpp",
"HexagonRDFOpt.cpp",
"HexagonRegisterInfo.cpp",
"HexagonSelectionDAGInfo.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn
index 949b3b214740..a8d6290f1b99 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn
@@ -61,6 +61,7 @@ static_library("LLVMWebAssemblyCodeGen") {
"WebAssemblyOptimizeLiveIntervals.cpp",
"WebAssemblyOptimizeReturned.cpp",
"WebAssemblyPeephole.cpp",
+ "WebAssemblyRefTypeMem2Local.cpp",
"WebAssemblyRegColoring.cpp",
"WebAssemblyRegNumbering.cpp",
"WebAssemblyRegStackify.cpp",
diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
index 16c898bdeb6e..070609c94a3b 100644
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -111,8 +111,6 @@ if ("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
else()
set(MLIR_ENABLE_CUDA_CONVERSIONS 0)
endif()
-# TODO: we should use a config.h file like LLVM does
-add_definitions(-DMLIR_CUDA_CONVERSIONS_ENABLED=${MLIR_ENABLE_CUDA_CONVERSIONS})
# Build the ROCm conversions and run according tests if the AMDGPU backend
# is available.
diff --git a/mlir/docs/Dialects/GPU.md b/mlir/docs/Dialects/GPU.md
index 85255fdc5e64..8a3acc33600a 100644
--- a/mlir/docs/Dialects/GPU.md
+++ b/mlir/docs/Dialects/GPU.md
@@ -50,6 +50,7 @@ An example of how the compilation workflow look is:
```
mlir-opt example.mlir \
--pass-pipeline="builtin.module( \
+ gpu-kernel-outlining, \ # Outline gpu.launch body to a kernel.
nvvm-attach-target{chip=sm_90 O=3}, \ # Attach an NVVM target to a gpu.module op.
gpu.module(convert-gpu-to-nvvm), \ # Convert GPU to NVVM.
gpu-to-llvm, \ # Convert GPU to LLVM.
diff --git a/mlir/docs/PassManagement.md b/mlir/docs/PassManagement.md
index ff86bbfef7b0..c9d705f0506a 100644
--- a/mlir/docs/PassManagement.md
+++ b/mlir/docs/PassManagement.md
@@ -56,8 +56,7 @@ By default, an operation pass is `op-agnostic`, meaning that it operates on the
operation type of the pass manager that it is added to. This means a pass may operate
on many different types of operations. Agnostic passes should be written such that
they do not make assumptions on the operation they run on. Examples of this type of pass are
-[Canonicalization](Pass.md/-canonicalize-canonicalize-operations)
-[Common Sub-Expression Elimination](Passes.md/#-cse-eliminate-common-sub-expressions).
+[Canonicalization](Passes.md/#-canonicalize) and [Common Sub-Expression Elimination](Passes.md/#-cse).
To create an agnostic operation pass, a derived class must adhere to the following:
diff --git a/mlir/docs/PatternRewriter.md b/mlir/docs/PatternRewriter.md
index 011cd1417563..0ba76199874c 100644
--- a/mlir/docs/PatternRewriter.md
+++ b/mlir/docs/PatternRewriter.md
@@ -366,7 +366,7 @@ Note: This driver listens for IR changes via the callbacks provided by
rewriter and do not bypass the rewriter API by modifying ops directly.
Note: This driver is the one used by the [canonicalization](Canonicalization.md)
-[pass](Passes.md/#-canonicalize-canonicalize-operations) in MLIR.
+[pass](Passes.md/#-canonicalize) in MLIR.
### Debugging
diff --git a/mlir/examples/CMakeLists.txt b/mlir/examples/CMakeLists.txt
index d256bf1a5cbb..2a1cac34d8c2 100644
--- a/mlir/examples/CMakeLists.txt
+++ b/mlir/examples/CMakeLists.txt
@@ -1,3 +1,4 @@
add_subdirectory(toy)
add_subdirectory(transform)
+add_subdirectory(transform-opt)
add_subdirectory(minimal-opt)
diff --git a/mlir/examples/transform-opt/CMakeLists.txt b/mlir/examples/transform-opt/CMakeLists.txt
new file mode 100644
index 000000000000..8e23555d0b5d
--- /dev/null
+++ b/mlir/examples/transform-opt/CMakeLists.txt
@@ -0,0 +1,26 @@
+get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
+get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
+get_property(extension_libs GLOBAL PROPERTY MLIR_EXTENSION_LIBS)
+
+set(LIBS
+ MLIRAnalysis
+ MLIRIR
+ MLIRParser
+ MLIRSupport
+ MLIRTransformDialect
+ MLIRTransformDialectTransforms
+ MLIRTransforms
+ ${dialect_libs}
+ ${conversion_libs}
+ ${extension_libs}
+)
+
+add_mlir_tool(mlir-transform-opt
+ mlir-transform-opt.cpp
+
+ DEPENDS
+ ${LIBS}
+)
+target_link_libraries(mlir-transform-opt PRIVATE ${LIBS})
+llvm_update_compile_flags(mlir-transform-opt)
+mlir_check_all_link_libraries(mlir-transform-opt)
diff --git a/mlir/examples/transform-opt/README.md b/mlir/examples/transform-opt/README.md
new file mode 100644
index 000000000000..e9c8cc0173c7
--- /dev/null
+++ b/mlir/examples/transform-opt/README.md
@@ -0,0 +1,40 @@
+# Standalone Transform Dialect Interpreter
+
+This is an example of using the Transform dialect interpreter functionality standalone, that is, outside of the regular pass pipeline. The example is a
+binary capable of processing MLIR source files similar to `mlir-opt` and other
+optimizer drivers, with the entire transformation process driven by a Transform
+dialect script. This script can be embedded into the source file or provided in
+a separate MLIR source file.
+
+Either the input module or the transform module must contain a top-level symbol
+named `__transform_main`, which is used as the entry point to the transformation
+script.
+
+```sh
+mlir-transform-opt payload_with_embedded_transform.mlir
+mlir-transform-opt payload.mlir -transform=transform.mlir
+```
+
+The name of the entry point can be overridden using command-line options.
+
+```sh
+mlir-transform-opt payload-mlir -transform-entry-point=another_entry_point
+```
+
+Transform scripts can reference symbols defined in other source files, called
+libraries, which can be supplied to the binary through command-line options.
+Libraries will be embedded into the main transformation module by the tool and
+the interpreter will process everything as a single module. A debug option is
+available to see the contents of the transform module before it goes into the interpreter.
+
+```sh
+mlir-transform-opt payload.mlir -transform=transform.mlir \
+ -transform-library=external_definitions_1.mlir \
+ -transform-library=external_definitions_2.mlir \
+ -dump-library-module
+```
+
+Check out the [Transform dialect
+tutorial](https://mlir.llvm.org/docs/Tutorials/transform/) as well as
+[documentation](https://mlir.llvm.org/docs/Dialects/Transform/) to learn more
+about the dialect.
diff --git a/mlir/examples/transform-opt/mlir-transform-opt.cpp b/mlir/examples/transform-opt/mlir-transform-opt.cpp
new file mode 100644
index 000000000000..41a17f18726b
--- /dev/null
+++ b/mlir/examples/transform-opt/mlir-transform-opt.cpp
@@ -0,0 +1,389 @@
+//===- mlir-transform-opt.cpp -----------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Transform/IR/TransformDialect.h"
+#include "mlir/Dialect/Transform/IR/Utils.h"
+#include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h"
+#include "mlir/IR/AsmState.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Diagnostics.h"
+#include "mlir/IR/DialectRegistry.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/InitAllDialects.h"
+#include "mlir/InitAllExtensions.h"
+#include "mlir/InitAllPasses.h"
+#include "mlir/Parser/Parser.h"
+#include "mlir/Support/FileUtilities.h"
+#include "mlir/Tools/mlir-opt/MlirOptMain.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include <cstdlib>
+
+namespace {
+
+using namespace llvm;
+
+/// Structure containing command line options for the tool, these will get
+/// initialized when an instance is created.
+struct MlirTransformOptCLOptions {
+ cl::opt<bool> allowUnregisteredDialects{
+ "allow-unregistered-dialect",
+ cl::desc("Allow operations coming from an unregistered dialect"),
+ cl::init(false)};
+
+ cl::opt<bool> verifyDiagnostics{
+ "verify-diagnostics",
+ cl::desc("Check that emitted diagnostics match expected-* lines "
+ "on the corresponding line"),
+ cl::init(false)};
+
+ cl::opt<std::string> payloadFilename{cl::Positional, cl::desc("<input file>"),
+ cl::init("-")};
+
+ cl::opt<std::string> outputFilename{"o", cl::desc("Output filename"),
+ cl::value_desc("filename"),
+ cl::init("-")};
+
+ cl::opt<std::string> transformMainFilename{
+ "transform",
+ cl::desc("File containing entry point of the transform script, if "
+ "different from the input file"),
+ cl::value_desc("filename"), cl::init("")};
+
+ cl::list<std::string> transformLibraryFilenames{
+ "transform-library", cl::desc("File(s) containing definitions of "
+ "additional transform script symbols")};
+
+ cl::opt<std::string> transformEntryPoint{
+ "transform-entry-point",
+ cl::desc("Name of the entry point transform symbol"),
+ cl::init(mlir::transform::TransformDialect::kTransformEntryPointSymbolName
+ .str())};
+
+ cl::opt<bool> disableExpensiveChecks{
+ "disable-expensive-checks",
+ cl::desc("Disables potentially expensive checks in the transform "
+ "interpreter, providing more speed at the expense of "
+ "potential memory problems and silent corruptions"),
+ cl::init(false)};
+
+ cl::opt<bool> dumpLibraryModule{
+ "dump-library-module",
+ cl::desc("Prints the combined library module before the output"),
+ cl::init(false)};
+};
+} // namespace
+
+/// "Managed" static instance of the command-line options structure. This makes
+/// them locally-scoped and explicitly initialized/deinitialized. While this is
+/// not strictly necessary in the tool source file that is not being used as a
+/// library (where the options would pollute the global list of options), it is
+/// good practice to follow this.
+static llvm::ManagedStatic<MlirTransformOptCLOptions> clOptions;
+
+/// Explicitly registers command-line options.
+static void registerCLOptions() { *clOptions; }
+
+namespace {
+/// A wrapper class for source managers diagnostic. This provides both unique
+/// ownership and virtual function-like overload for a pair of
+/// inheritance-related classes that do not use virtual functions.
+class DiagnosticHandlerWrapper {
+public:
+ /// Kind of the diagnostic handler to use.
+ enum class Kind { EmitDiagnostics, VerifyDiagnostics };
+
+ /// Constructs the diagnostic handler of the specified kind of the given
+ /// source manager and context.
+ DiagnosticHandlerWrapper(Kind kind, llvm::SourceMgr &mgr,
+ mlir::MLIRContext *context) {
+ if (kind == Kind::EmitDiagnostics)
+ handler = new mlir::SourceMgrDiagnosticHandler(mgr, context);
+ else
+ handler = new mlir::SourceMgrDiagnosticVerifierHandler(mgr, context);
+ }
+
+ /// This object is non-copyable but movable.
+ DiagnosticHandlerWrapper(const DiagnosticHandlerWrapper &) = delete;
+ DiagnosticHandlerWrapper(DiagnosticHandlerWrapper &&other) = default;
+ DiagnosticHandlerWrapper &
+ operator=(const DiagnosticHandlerWrapper &) = delete;
+ DiagnosticHandlerWrapper &operator=(DiagnosticHandlerWrapper &&) = default;
+
+ /// Verifies the captured "expected-*" diagnostics if required.
+ mlir::LogicalResult verify() const {
+ if (auto *ptr =
+ handler.dyn_cast<mlir::SourceMgrDiagnosticVerifierHandler *>()) {
+ return ptr->verify();
+ }
+ return mlir::success();
+ }
+
+ /// Destructs the object of the same type as allocated.
+ ~DiagnosticHandlerWrapper() {
+ if (auto *ptr = handler.dyn_cast<mlir::SourceMgrDiagnosticHandler *>()) {
+ delete ptr;
+ } else {
+ delete handler.get<mlir::SourceMgrDiagnosticVerifierHandler *>();
+ }
+ }
+
+private:
+ /// Internal storage is a type-safe union.
+ llvm::PointerUnion<mlir::SourceMgrDiagnosticHandler *,
+ mlir::SourceMgrDiagnosticVerifierHandler *>
+ handler;
+};
+
+/// MLIR has deeply rooted expectations that the LLVM source manager contains
+/// exactly one buffer, until at least the lexer level. This class wraps
+/// multiple LLVM source managers each managing a buffer to match MLIR's
+/// expectations while still providing a centralized handling mechanism.
+class TransformSourceMgr {
+public:
+ /// Constructs the source manager indicating whether diagnostic messages will
+ /// be verified later on.
+ explicit TransformSourceMgr(bool verifyDiagnostics)
+ : verifyDiagnostics(verifyDiagnostics) {}
+
+ /// Deconstructs the source manager. Note that `checkResults` must have been
+ /// called on this instance before deconstructing it.
+ ~TransformSourceMgr() {
+ assert(resultChecked && "must check the result of diagnostic handlers by "
+ "running TransformSourceMgr::checkResult");
+ }
+
+ /// Parses the given buffer and creates the top-level operation of the kind
+ /// specified as template argument in the given context. Additional parsing
+ /// options may be provided.
+ template <typename OpTy = mlir::Operation *>
+ mlir::OwningOpRef<OpTy> parseBuffer(std::unique_ptr<MemoryBuffer> buffer,
+ mlir::MLIRContext &context,
+ const mlir::ParserConfig &config) {
+ // Create a single-buffer LLVM source manager. Note that `unique_ptr` allows
+ // the code below to capture a reference to the source manager in such a way
+ // that it is not invalidated when the vector contents is eventually
+ // reallocated.
+ llvm::SourceMgr &mgr =
+ *sourceMgrs.emplace_back(std::make_unique<llvm::SourceMgr>());
+ mgr.AddNewSourceBuffer(std::move(buffer), llvm::SMLoc());
+
+ // Choose the type of diagnostic handler depending on whether diagnostic
+ // verification needs to happen and store it.
+ if (verifyDiagnostics) {
+ diagHandlers.emplace_back(
+ DiagnosticHandlerWrapper::Kind::VerifyDiagnostics, mgr, &context);
+ } else {
+ diagHandlers.emplace_back(DiagnosticHandlerWrapper::Kind::EmitDiagnostics,
+ mgr, &context);
+ }
+
+ // Defer to MLIR's parser.
+ return mlir::parseSourceFile<OpTy>(mgr, config);
+ }
+
+ /// If diagnostic message verification has been requested upon construction of
+ /// this source manager, performs the verification, reports errors and returns
+ /// the result of the verification. Otherwise passes through the given value.
+ mlir::LogicalResult checkResult(mlir::LogicalResult result) {
+ resultChecked = true;
+ if (!verifyDiagnostics)
+ return result;
+
+ return mlir::failure(llvm::any_of(diagHandlers, [](const auto &handler) {
+ return mlir::failed(handler.verify());
+ }));
+ }
+
+private:
+ /// Indicates whether diagnostic message verification is requested.
+ const bool verifyDiagnostics;
+
+ /// Indicates that diagnostic message verification has taken place, and the
+ /// deconstruction is therefore safe.
+ bool resultChecked = false;
+
+ /// Storage for per-buffer source managers and diagnostic handlers. These are
+ /// wrapped into unique pointers in order to make it safe to capture
+ /// references to these objects: if the vector is reallocated, the unique
+ /// pointer objects are moved by the pointer addresses won't change. Also, for
+ /// handlers, this allows to store the pointer to the base class.
+ SmallVector<std::unique_ptr<llvm::SourceMgr>> sourceMgrs;
+ SmallVector<DiagnosticHandlerWrapper> diagHandlers;
+};
+} // namespace
+
+/// Trivial wrapper around `applyTransforms` that doesn't support extra mapping
+/// and doesn't enforce the entry point transform ops being top-level.
+static mlir::LogicalResult
+applyTransforms(mlir::Operation *payloadRoot,
+ mlir::transform::TransformOpInterface transformRoot,
+ const mlir::transform::TransformOptions &options) {
+ return applyTransforms(payloadRoot, transformRoot, {}, options,
+ /*enforceToplevelTransformOp=*/false);
+}
+
+/// Applies transforms indicated in the transform dialect script to the input
+/// buffer. The transform script may be embedded in the input buffer or as a
+/// separate buffer. The transform script may have external symbols, the
+/// definitions of which must be provided in transform library buffers. If the
+/// application is successful, prints the transformed input buffer into the
+/// given output stream. Additional configuration options are derived from
+/// command-line options.
+static mlir::LogicalResult processPayloadBuffer(
+ raw_ostream &os, std::unique_ptr<MemoryBuffer> inputBuffer,
+ std::unique_ptr<llvm::MemoryBuffer> transformBuffer,
+ MutableArrayRef<std::unique_ptr<MemoryBuffer>> transformLibraries,
+ mlir::DialectRegistry &registry) {
+
+ // Initialize the MLIR context, and various configurations.
+ mlir::MLIRContext context(registry, mlir::MLIRContext::Threading::DISABLED);
+ context.allowUnregisteredDialects(clOptions->allowUnregisteredDialects);
+ mlir::ParserConfig config(&context);
+ TransformSourceMgr sourceMgr(
+ /*verifyDiagnostics=*/clOptions->verifyDiagnostics);
+
+ // Parse the input buffer that will be used as transform payload.
+ mlir::OwningOpRef<mlir::Operation *> payloadRoot =
+ sourceMgr.parseBuffer(std::move(inputBuffer), context, config);
+ if (!payloadRoot)
+ return sourceMgr.checkResult(mlir::failure());
+
+ // Identify the module containing the transform script entry point. This may
+ // be the same module as the input or a separate module. In the former case,
+ // make a copy of the module so it can be modified freely. Modification may
+ // happen in the script itself (at which point it could be rewriting itself
+ // during interpretation, leading to tricky memory errors) or by embedding
+ // library modules in the script.
+ mlir::OwningOpRef<mlir::ModuleOp> transformRoot;
+ if (transformBuffer) {
+ transformRoot = sourceMgr.parseBuffer<mlir::ModuleOp>(
+ std::move(transformBuffer), context, config);
+ if (!transformRoot)
+ return sourceMgr.checkResult(mlir::failure());
+ } else {
+ transformRoot = cast<mlir::ModuleOp>(payloadRoot->clone());
+ }
+
+ // Parse and merge the libraries into the main transform module.
+ for (auto &&transformLibrary : transformLibraries) {
+ mlir::OwningOpRef<mlir::ModuleOp> libraryModule =
+ sourceMgr.parseBuffer<mlir::ModuleOp>(std::move(transformLibrary),
+ context, config);
+
+ if (!libraryModule ||
+ mlir::failed(mlir::transform::detail::mergeSymbolsInto(
+ *transformRoot, std::move(libraryModule))))
+ return sourceMgr.checkResult(mlir::failure());
+ }
+
+ // If requested, dump the combined transform module.
+ if (clOptions->dumpLibraryModule)
+ transformRoot->dump();
+
+ // Find the entry point symbol. Even if it had originally been in the payload
+ // module, it was cloned into the transform module so only look there.
+ mlir::transform::TransformOpInterface entryPoint =
+ mlir::transform::detail::findTransformEntryPoint(
+ *transformRoot, mlir::ModuleOp(), clOptions->transformEntryPoint);
+ if (!entryPoint)
+ return sourceMgr.checkResult(mlir::failure());
+
+ // Apply the requested transformations.
+ mlir::transform::TransformOptions transformOptions;
+ transformOptions.enableExpensiveChecks(!clOptions->disableExpensiveChecks);
+ if (mlir::failed(applyTransforms(*payloadRoot, entryPoint, transformOptions)))
+ return sourceMgr.checkResult(mlir::failure());
+
+ // Print the transformed result and check the captured diagnostics if
+ // requested.
+ payloadRoot->print(os);
+ return sourceMgr.checkResult(mlir::success());
+}
+
+/// Tool entry point.
+static mlir::LogicalResult runMain(int argc, char **argv) {
+ // Register all upstream dialects and extensions. Specific uses are advised
+ // not to register all dialects indiscriminately but rather hand-pick what is
+ // necessary for their use case.
+ mlir::DialectRegistry registry;
+ mlir::registerAllDialects(registry);
+ mlir::registerAllExtensions(registry);
+ mlir::registerAllPasses();
+
+ // Explicitly register the transform dialect. This is not strictly necessary
+ // since it has been already registered as part of the upstream dialect list,
+ // but useful for example purposes for cases when dialects to register are
+ // hand-picked. The transform dialect must be registered.
+ registry.insert<mlir::transform::TransformDialect>();
+
+ // Register various command-line options. Note that the LLVM initializer
+ // object is a RAII that ensures correct deconstruction of command-line option
+ // objects inside ManagedStatic.
+ llvm::InitLLVM y(argc, argv);
+ mlir::registerAsmPrinterCLOptions();
+ mlir::registerMLIRContextCLOptions();
+ registerCLOptions();
+ llvm::cl::ParseCommandLineOptions(argc, argv,
+ "Minimal Transform dialect driver\n");
+
+ // Try opening the main input file.
+ std::string errorMessage;
+ std::unique_ptr<llvm::MemoryBuffer> payloadFile =
+ mlir::openInputFile(clOptions->payloadFilename, &errorMessage);
+ if (!payloadFile) {
+ llvm::errs() << errorMessage << "\n";
+ return mlir::failure();
+ }
+
+ // Try opening the output file.
+ std::unique_ptr<llvm::ToolOutputFile> outputFile =
+ mlir::openOutputFile(clOptions->outputFilename, &errorMessage);
+ if (!outputFile) {
+ llvm::errs() << errorMessage << "\n";
+ return mlir::failure();
+ }
+
+ // Try opening the main transform file if provided.
+ std::unique_ptr<llvm::MemoryBuffer> transformRootFile;
+ if (!clOptions->transformMainFilename.empty()) {
+ if (clOptions->transformMainFilename == clOptions->payloadFilename) {
+ llvm::errs() << "warning: " << clOptions->payloadFilename
+ << " is provided as both payload and transform file\n";
+ } else {
+ transformRootFile =
+ mlir::openInputFile(clOptions->transformMainFilename, &errorMessage);
+ if (!transformRootFile) {
+ llvm::errs() << errorMessage << "\n";
+ return mlir::failure();
+ }
+ }
+ }
+
+ // Try opening transform library files if provided.
+ SmallVector<std::unique_ptr<llvm::MemoryBuffer>> transformLibraries;
+ transformLibraries.reserve(clOptions->transformLibraryFilenames.size());
+ for (llvm::StringRef filename : clOptions->transformLibraryFilenames) {
+ transformLibraries.emplace_back(
+ mlir::openInputFile(filename, &errorMessage));
+ if (!transformLibraries.back()) {
+ llvm::errs() << errorMessage << "\n";
+ return mlir::failure();
+ }
+ }
+
+ return processPayloadBuffer(outputFile->os(), std::move(payloadFile),
+ std::move(transformRootFile), transformLibraries,
+ registry);
+}
+
+int main(int argc, char **argv) {
+ return mlir::asMainReturnCode(runMain(argc, argv));
+}
diff --git a/mlir/include/mlir-c/Dialect/SparseTensor.h b/mlir/include/mlir-c/Dialect/SparseTensor.h
index 898d2f12779e..52ca7ba8a161 100644
--- a/mlir/include/mlir-c/Dialect/SparseTensor.h
+++ b/mlir/include/mlir-c/Dialect/SparseTensor.h
@@ -29,10 +29,11 @@ typedef uint64_t MlirSparseTensorLevelType;
enum MlirSparseTensorLevelFormat {
MLIR_SPARSE_TENSOR_LEVEL_DENSE = 0x000000010000,
- MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED = 0x000000020000,
- MLIR_SPARSE_TENSOR_LEVEL_SINGLETON = 0x000000040000,
- MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED = 0x000000080000,
- MLIR_SPARSE_TENSOR_LEVEL_N_OUT_OF_M = 0x000000100000,
+ MLIR_SPARSE_TENSOR_LEVEL_BATCH = 0x000000020000,
+ MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED = 0x000000040000,
+ MLIR_SPARSE_TENSOR_LEVEL_SINGLETON = 0x000000080000,
+ MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED = 0x000000100000,
+ MLIR_SPARSE_TENSOR_LEVEL_N_OUT_OF_M = 0x000000200000,
};
enum MlirSparseTensorLevelPropertyNondefault {
diff --git a/mlir/include/mlir/Config/mlir-config.h.cmake b/mlir/include/mlir/Config/mlir-config.h.cmake
index e152a36c0ce0..4a7d75e22668 100644
--- a/mlir/include/mlir/Config/mlir-config.h.cmake
+++ b/mlir/include/mlir/Config/mlir-config.h.cmake
@@ -29,4 +29,8 @@
/* If set, enables PDL usage. */
#cmakedefine01 MLIR_ENABLE_PDL_IN_PATTERNMATCH
+/* If set, enables CUDA-related features in CUDA-related transforms, pipelines,
+ and targets. */
+#cmakedefine01 MLIR_ENABLE_CUDA_CONVERSIONS
+
#endif
diff --git a/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td b/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td
index 23873d86b495..0eb670506086 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td
@@ -13,8 +13,8 @@ include "mlir/Dialect/Transform/IR/TransformAttrs.td"
include "mlir/Dialect/Transform/IR/TransformDialect.td"
include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
include "mlir/Dialect/Transform/IR/TransformTypes.td"
-include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
//===----------------------------------------------------------------------===//
// ApplyOptimizeSharedMemoryReadsAndWritesOp
//===----------------------------------------------------------------------===//
@@ -28,7 +28,9 @@ def ApplyOptimizeSharedMemoryReadsAndWritesOp :
reads/writes with the goal of avoiding bank conflicts.
}];
- let arguments = (ins TransformHandleTypeInterface:$target);
+ let arguments = (ins TransformHandleTypeInterface:$target,
+ DefaultValuedOptionalAttr<I64Attr, "128">:$sharedMemoryLineSizeBytes,
+ DefaultValuedOptionalAttr<I64Attr, "128">:$defaultVectorSizeBits);
let results = (outs);
let assemblyFormat = "$target attr-dict `:` functional-type(operands, results)";
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
index c8059e6d316e..67f951fd19d1 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
@@ -37,10 +37,17 @@ def OptimizeSharedMemory : Pass<"amdgpu-optimize-shared-memory"> {
attempts to optimize reads/writes from a memref representing GPU shared
memory in order to avoid bank conflicts.
}];
-
let dependentDialects = [
"memref::MemRefDialect", "vector::VectorDialect"
];
+ let options = [
+ Option<"sharedMemoryLineSizeBytes", "shared-memory-line-size-bytes", "int64_t",
+ /*default=*/"128",
+ "Shared memory line size in bytes">,
+ Option<"defaultVectorSizeBits", "default-vector-size-bits", "int64_t",
+ /*default=*/"128",
+ "Default vector size in bits">,
+ ];
}
#endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h
index 79f9ab71a2b4..843cea2c503b 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h
@@ -45,11 +45,15 @@ namespace amdgpu {
/// function that depends on the row Index. The permutation function is chosen
/// to ensure that sequential distributed+vectorized reads/writes down a single
/// dimension of the memref have minimal conflicts.
-LogicalResult optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
- Value memrefValue);
+LogicalResult
+optimizeSharedMemoryReadsAndWrites(Operation *parentOp, Value memrefValue,
+ int64_t sharedMemoryLineSizeBytes,
+ int64_t defaultVectorSizeBits);
std::optional<LogicalResult>
-optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp);
+optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp,
+ int64_t sharedMemoryLineSizeBytes,
+ int64_t defaultVectorSizeBits);
} // namespace amdgpu
} // namespace mlir
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 6b170c8d06f4..53e9f2dc6a99 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -28,6 +28,22 @@ def ROCDL_Dialect : Dialect {
let hasOperationAttrVerify = 1;
let extraClassDeclaration = [{
+ /// Get the name of the attribute used to annotate external kernel
+ /// functions.
+ static StringRef getKernelFuncAttrName() { return "rocdl.kernel"; }
+ static constexpr ::llvm::StringLiteral getFlatWorkGroupSizeAttrName() {
+ return ::llvm::StringLiteral("rocdl.flat_work_group_size");
+ }
+ static constexpr ::llvm::StringLiteral getReqdWorkGroupSizeAttrName() {
+ return ::llvm::StringLiteral("rocdl.reqd_work_group_size");
+ }
+ /// MLIR's gpu-related infrastructure effectively assume uniform workgroup
+ /// sizes, so this attribute defaults to "true" on `rocdl.kernel` functions.
+ /// It is provided here to allow overriding this assumption.
+ static constexpr ::llvm::StringLiteral getUniformWorkGroupSizeAttrName() {
+ return ::llvm::StringLiteral("rocdl.uniform_work_group_size");
+ }
+
/// The address space value that represents global memory.
static constexpr unsigned kGlobalMemoryAddressSpace = 1;
/// The address space value that represents shared memory.
diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index 309573a56287..53ed31877c6f 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -2296,6 +2296,49 @@ def ConvertConv2DToImg2ColOp : Op<Transform_Dialect,
}
//===----------------------------------------------------------------------===//
+// FlattenElementwiseLinalgOp
+//===----------------------------------------------------------------------===//
+
+def FlattenElementwiseLinalgOp : Op<Transform_Dialect,
+ "structured.flatten_elementwise",
+ [FunctionalStyleTransformOpTrait,
+ MemoryEffectsOpInterface,
+ TransformOpInterface,
+ TransformEachOpTrait,
+ ReportTrackingListenerFailuresOpTrait]> {
+ let description = [{
+ Flattens the iteration space and (applicable) operands of elementwise
+ linalg ops to a single dimension.
+
+ Returns one handle:
+ - Flattened linalg operation.
+
+ #### Return modes:
+
+ Returns a definite failure if target is not isolated from above.
+ Returns a silenceable failure if the pattern application failed.
+ }];
+
+ let arguments = (ins TransformHandleTypeInterface:$target);
+ let results = (outs TransformHandleTypeInterface:$transformed);
+
+ let assemblyFormat =
+ "$target attr-dict `:` functional-type($target, results)";
+
+ let builders = [
+ OpBuilder<(ins "Value":$target)>
+ ];
+
+ let extraClassDeclaration = [{
+ ::mlir::DiagnosedSilenceableFailure applyToOne(
+ ::mlir::transform::TransformRewriter &rewriter,
+ ::mlir::linalg::LinalgOp target,
+ ::mlir::transform::ApplyToEachResultList &results,
+ ::mlir::transform::TransformState &state);
+ }];
+}
+
+//===----------------------------------------------------------------------===//
// Transpose Conv2D
//===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index a848d12fbbb5..65cf19e7a4fc 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -1074,6 +1074,11 @@ bool isDimSequencePreserved(AffineMap map, ReassociationIndicesRef dimSequence);
bool areDimSequencesPreserved(ArrayRef<AffineMap> maps,
ArrayRef<ReassociationIndices> dimSequences);
+struct CollapseResult {
+ SmallVector<Value> results;
+ LinalgOp collapsedOp;
+};
+
/// Collapses dimensions of linalg.generic/linalg.copy operation. A precondition
/// to calling this method is that for each list in `foldedIterationDim`, the
/// sequence of dimensions is contiguous in domains of all `indexing_maps` of
@@ -1081,9 +1086,8 @@ bool areDimSequencesPreserved(ArrayRef<AffineMap> maps,
/// When valid, the method also collapses the operands of the op. Returns
/// replacement values of the results of the original `linalgOp` by inserting
/// reshapes to get back values of compatible types.
-template <typename LinalgType>
-FailureOr<SmallVector<Value>>
-collapseOpIterationDims(LinalgType op,
+FailureOr<CollapseResult>
+collapseOpIterationDims(LinalgOp op,
ArrayRef<ReassociationIndices> foldedIterationDims,
RewriterBase &rewriter);
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
index 1c81d80ea7ec..5563cb907e93 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
@@ -154,12 +154,26 @@ enum class Action : uint32_t {
enum class LevelFormat : uint64_t {
Undef = 0x00000000,
Dense = 0x00010000,
- Compressed = 0x00020000,
- Singleton = 0x00040000,
- LooseCompressed = 0x00080000,
- NOutOfM = 0x00100000,
+ Batch = 0x00020000,
+ Compressed = 0x00040000,
+ Singleton = 0x00080000,
+ LooseCompressed = 0x00100000,
+ NOutOfM = 0x00200000,
};
+constexpr bool encPowOfTwo(LevelFormat fmt) {
+ auto enc = static_cast<std::underlying_type_t<LevelFormat>>(fmt);
+ return (enc & (enc - 1)) == 0;
+}
+
+// All LevelFormats must have only one bit set (power of two).
+static_assert(encPowOfTwo(LevelFormat::Dense) &&
+ encPowOfTwo(LevelFormat::Batch) &&
+ encPowOfTwo(LevelFormat::Compressed) &&
+ encPowOfTwo(LevelFormat::Singleton) &&
+ encPowOfTwo(LevelFormat::LooseCompressed) &&
+ encPowOfTwo(LevelFormat::NOutOfM));
+
template <LevelFormat... targets>
constexpr bool isAnyOfFmt(LevelFormat fmt) {
return (... || (targets == fmt));
@@ -172,6 +186,8 @@ constexpr const char *toFormatString(LevelFormat lvlFmt) {
return "undef";
case LevelFormat::Dense:
return "dense";
+ case LevelFormat::Batch:
+ return "batch";
case LevelFormat::Compressed:
return "compressed";
case LevelFormat::Singleton:
@@ -225,10 +241,10 @@ public:
static constexpr bool isValidLvlBits(uint64_t lvlBits) {
auto fmt = static_cast<LevelFormat>(lvlBits & 0xffff0000);
const uint64_t propertyBits = lvlBits & 0xffff;
- // If undefined/dense/NOutOfM, then must be unique and ordered.
+ // If undefined/dense/batch/NOutOfM, then must be unique and ordered.
// Otherwise, the format must be one of the known ones.
return (isAnyOfFmt<LevelFormat::Undef, LevelFormat::Dense,
- LevelFormat::NOutOfM>(fmt))
+ LevelFormat::Batch, LevelFormat::NOutOfM>(fmt))
? (propertyBits == 0)
: (isAnyOfFmt<LevelFormat::Compressed, LevelFormat::Singleton,
LevelFormat::LooseCompressed>(fmt));
@@ -317,16 +333,28 @@ public:
return lvlBits & static_cast<uint64_t>(p);
}
+ /// Check if the `LevelType` is considered to be sparse.
+ constexpr bool hasSparseSemantic() const {
+ return isa<LevelFormat::Compressed, LevelFormat::Singleton,
+ LevelFormat::LooseCompressed, LevelFormat::NOutOfM>();
+ }
+
+ /// Check if the `LevelType` is considered to be dense-like.
+ constexpr bool hasDenseSemantic() const {
+ return isa<LevelFormat::Dense, LevelFormat::Batch>();
+ }
+
/// Check if the `LevelType` needs positions array.
constexpr bool isWithPosLT() const {
- return isa<LevelFormat::Compressed>() ||
- isa<LevelFormat::LooseCompressed>();
+ assert(!isa<LevelFormat::Undef>());
+ return isa<LevelFormat::Compressed, LevelFormat::LooseCompressed>();
}
/// Check if the `LevelType` needs coordinates array.
constexpr bool isWithCrdLT() const {
+ assert(!isa<LevelFormat::Undef>());
// All sparse levels has coordinate array.
- return !isa<LevelFormat::Dense>();
+ return hasSparseSemantic();
}
std::string toMLIRString() const {
@@ -375,6 +403,7 @@ inline std::optional<LevelType> buildLevelType(LevelFormat lf, bool ordered,
}
inline bool isUndefLT(LevelType lt) { return lt.isa<LevelFormat::Undef>(); }
inline bool isDenseLT(LevelType lt) { return lt.isa<LevelFormat::Dense>(); }
+inline bool isBatchLT(LevelType lt) { return lt.isa<LevelFormat::Batch>(); }
inline bool isCompressedLT(LevelType lt) {
return lt.isa<LevelFormat::Compressed>();
}
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
index f0b832571e68..5d1db2323f95 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
@@ -141,7 +141,8 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding",
The supported level-formats are the following:
- - **dense** : all entries along this level are stored
+ - **dense** : all entries along this level are stored and linearized.
+ - **batch** : all entries along this level are stored but not linearized.
- **compressed** : only nonzeros along this level are stored
- **loose_compressed** : as compressed, but allows for free space between regions
- **singleton** : a variant of the compressed format, where coordinates have no siblings
@@ -373,6 +374,8 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding",
/// is non-null (since no fixed result is valid for every dense-tensor).
::mlir::sparse_tensor::Level getLvlRank() const;
+ uint64_t getBatchLvlRank() const;
+
//
// lvlTypes methods.
//
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorStorageLayout.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorStorageLayout.h
index 27dc39609cda..ce34ae43d1c1 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorStorageLayout.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorStorageLayout.h
@@ -30,15 +30,15 @@ namespace sparse_tensor {
/// ; if dense:
/// <nothing>
/// ; if compressed:
-/// memref<? x pos> positions ; positions for level l
-/// memref<? x crd> coordinates ; coordinates for level l
-/// ; if loose-compressed:
-/// memref<? x pos> positions ; lo/hi position pairs for level l
-/// memref<? x crd> coordinates ; coordinates for level l
+/// memref<[batch] x ? x pos> positions ; positions for level l
+/// memref<[batch] x ? x crd> coordinates ; coordinates for level l
+/// ; if loose-[batch] x compressed:
+/// memref<[batch] x ? x pos> positions ; lo/hi pos pairs for level l
+/// memref<[batch] x ? x crd> coordinates ; coordinates for level l
/// ; if singleton/2-out-of-4:
-/// memref<? x crd> coordinates ; coordinates for level l
+/// memref<[batch] x ? x crd> coordinates ; coordinates for level l
///
-/// memref<? x eltType> values ; values
+/// memref<[batch] x ? x eltType> values ; values
///
/// struct sparse_tensor.storage_specifier {
/// array<rank x int> lvlSizes ; sizes/cardinalities for each level
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h
index 1a090ddb782f..c93a4fcd922c 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h
@@ -253,6 +253,14 @@ public:
CrdTransDirectionKind::dim2lvl);
}
+ /// Returns the Level-shape.
+ SmallVector<Size> getBatchLvlShape() const {
+ auto lvlShape = getEncoding().tranlateShape(getDimShape(),
+ CrdTransDirectionKind::dim2lvl);
+ lvlShape.truncate(getEncoding().getBatchLvlRank());
+ return lvlShape;
+ }
+
/// Returns the type with an identity mapping.
RankedTensorType getDemappedType() const {
return RankedTensorType::get(getLvlShape(), getElementType(),
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
index 490ef3071af1..7f9820df984b 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
@@ -509,8 +509,7 @@ public:
bool isSparseLvlWithNonTrivialIdxExp(TensorLoopId b) const {
if (isLvlWithNonTrivialIdxExp(b)) {
auto lt = getLoopDependentLevelType(b);
- return isCompressedLT(lt) || isSingletonLT(lt) ||
- isLooseCompressedLT(lt) || isNOutOfMLT(lt);
+ return lt.hasSparseSemantic();
}
return false;
}
diff --git a/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td b/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td
index 1d6eb24156e3..86a2b3c21faf 100644
--- a/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td
@@ -66,7 +66,25 @@ def InterpreterPass : Pass<"transform-interpreter"> {
let description = [{
This pass runs the transform dialect interpreter and applies the named
sequence transformation specified by the provided name (defaults to
- `TransformDialect::kTransformEntryPointSymbolName` (i.e. `__transform_main`)).
+ `TransformDialect::kTransformEntryPointSymbolName`,
+ i.e. `__transform_main`).
+
+ Additional options can be used to narrow down the pass applicability for
+ debugging purposes:
+ * `debugPayloadRootTag` makes the transform script apply to the payload
+ operation that has a `transform.target_tag` string attribute with the
+ given value, rather than to the anchor operation of the pass.
+ * `debugBindTrailingArgs` allows one to bind values to trailing arguments
+ of the transform entry point as follows:
+ * arguments of `TransformHandleTypeInterface` type can be bound to all
+ payload operations with the name provided as a simple string;
+ * arguments of `TransformValueHandleTypeInterface` type can be bound to
+ a flattened list of results of all operations with the name provided
+ as a string prefixed with `^`;
+ * arguments of `TransformParamTypeInterface` type can be bound to
+ integer constants provided as `;`-separated list prefixed with `#`.
+ * `entryPoint` specifies the name of the transform symbol to serve as the
+ entry point.
}];
let dependentDialects = ["::mlir::transform::TransformDialect"];
let options = [
@@ -83,7 +101,9 @@ def InterpreterPass : Pass<"transform-interpreter"> {
"false",
"Disable expensive checks in the interpreter for a faster run.">,
Option<"entryPoint", "entry-point", "std::string",
- /*default=*/[{TransformDialect::kTransformEntryPointSymbolName.str()}],
+ /*default=*/[{
+ TransformDialect::kTransformEntryPointSymbolName.str()
+ }],
"Entry point of the pass pipeline.">,
];
}
diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h
index 2ce3bc3fc2e7..f8d22cfb22af 100644
--- a/mlir/include/mlir/IR/PatternMatch.h
+++ b/mlir/include/mlir/IR/PatternMatch.h
@@ -579,7 +579,7 @@ public:
/// Split the operations starting at "before" (inclusive) out of the given
/// block into a new block, and return it.
- virtual Block *splitBlock(Block *block, Block::iterator before);
+ Block *splitBlock(Block *block, Block::iterator before);
/// Unlink this operation from its current block and insert it right before
/// `existingOp` which may be in the same or another block in the same
diff --git a/mlir/include/mlir/IR/Value.h b/mlir/include/mlir/IR/Value.h
index fff3b87faff6..a74d0faa1dfc 100644
--- a/mlir/include/mlir/IR/Value.h
+++ b/mlir/include/mlir/IR/Value.h
@@ -90,7 +90,7 @@ protected:
/// class has value-type semantics and is just a simple wrapper around a
/// ValueImpl that is either owner by a block(in the case of a BlockArgument) or
/// an Operation(in the case of an OpResult).
-/// As most IR construct, this isn't const-correct, but we keep method
+/// As most IR constructs, this isn't const-correct, but we keep method
/// consistent and as such method that immediately modify this Value aren't
/// marked `const` (include modifying the Value use-list).
class Value {
diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h
index e28921619fe5..5d90c197a6cc 100644
--- a/mlir/include/mlir/InitAllPasses.h
+++ b/mlir/include/mlir/InitAllPasses.h
@@ -14,6 +14,7 @@
#ifndef MLIR_INITALLPASSES_H_
#define MLIR_INITALLPASSES_H_
+#include "mlir/Config/mlir-config.h"
#include "mlir/Conversion/Passes.h"
#include "mlir/Dialect/AMDGPU/Transforms/Passes.h"
#include "mlir/Dialect/Affine/Passes.h"
@@ -96,7 +97,7 @@ inline void registerAllPasses() {
bufferization::registerBufferizationPipelines();
sparse_tensor::registerSparseTensorPipelines();
tosa::registerTosaToLinalgPipelines();
-#if MLIR_CUDA_CONVERSIONS_ENABLED
+#if MLIR_ENABLE_CUDA_CONVERSIONS
gpu::registerGPUToNVVMPipeline();
#endif
}
diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h
index 7e8e67a9d178..84396529eb7c 100644
--- a/mlir/include/mlir/Transforms/DialectConversion.h
+++ b/mlir/include/mlir/Transforms/DialectConversion.h
@@ -741,9 +741,6 @@ public:
/// implemented for dialect conversion.
void eraseBlock(Block *block) override;
- /// PatternRewriter hook for splitting a block into two parts.
- Block *splitBlock(Block *block, Block::iterator before) override;
-
/// PatternRewriter hook for inlining the ops of a block into another block.
void inlineBlockBefore(Block *source, Block *dest, Block::iterator before,
ValueRange argValues = std::nullopt) override;
diff --git a/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp b/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp
index e69f9c837ca1..10ccd5c97783 100644
--- a/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp
+++ b/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp
@@ -14,6 +14,7 @@
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/Transforms/Transforms.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
@@ -558,6 +559,7 @@ class LowerAffinePass
RewritePatternSet patterns(&getContext());
populateAffineToStdConversionPatterns(patterns);
populateAffineToVectorConversionPatterns(patterns);
+ populateAffineExpandIndexOpsPatterns(patterns);
ConversionTarget target(getContext());
target.addLegalDialect<arith::ArithDialect, memref::MemRefDialect,
scf::SCFDialect, VectorDialect>();
diff --git a/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt b/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt
index 2ba0f30b1190..f41e3ca27ee4 100644
--- a/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt
+++ b/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt
@@ -12,12 +12,13 @@ add_mlir_conversion_library(MLIRAffineToStandard
LINK_LIBS PUBLIC
MLIRAffineDialect
+ MLIRAffineTransforms
MLIRAffineUtils
MLIRArithDialect
MLIRIR
MLIRMemRefDialect
- MLIRSCFDialect
MLIRPass
+ MLIRSCFDialect
MLIRTransforms
MLIRVectorDialect
)
diff --git a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
index cc315110f9be..20525b962ae9 100644
--- a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
+++ b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
@@ -257,6 +257,7 @@ struct DivOpConversion : public OpConversionPattern<complex::DivOp> {
auto loc = op.getLoc();
auto type = cast<ComplexType>(adaptor.getLhs().getType());
auto elementType = cast<FloatType>(type.getElementType());
+ arith::FastMathFlagsAttr fmf = op.getFastMathFlagsAttr();
Value lhsReal =
rewriter.create<complex::ReOp>(loc, elementType, adaptor.getLhs());
@@ -290,45 +291,51 @@ struct DivOpConversion : public OpConversionPattern<complex::DivOp> {
//
// See https://dl.acm.org/citation.cfm?id=368661 for more details.
Value rhsRealImagRatio =
- rewriter.create<arith::DivFOp>(loc, rhsReal, rhsImag);
+ rewriter.create<arith::DivFOp>(loc, rhsReal, rhsImag, fmf);
Value rhsRealImagDenom = rewriter.create<arith::AddFOp>(
loc, rhsImag,
- rewriter.create<arith::MulFOp>(loc, rhsRealImagRatio, rhsReal));
+ rewriter.create<arith::MulFOp>(loc, rhsRealImagRatio, rhsReal, fmf),
+ fmf);
Value realNumerator1 = rewriter.create<arith::AddFOp>(
- loc, rewriter.create<arith::MulFOp>(loc, lhsReal, rhsRealImagRatio),
- lhsImag);
- Value resultReal1 =
- rewriter.create<arith::DivFOp>(loc, realNumerator1, rhsRealImagDenom);
+ loc,
+ rewriter.create<arith::MulFOp>(loc, lhsReal, rhsRealImagRatio, fmf),
+ lhsImag, fmf);
+ Value resultReal1 = rewriter.create<arith::DivFOp>(loc, realNumerator1,
+ rhsRealImagDenom, fmf);
Value imagNumerator1 = rewriter.create<arith::SubFOp>(
- loc, rewriter.create<arith::MulFOp>(loc, lhsImag, rhsRealImagRatio),
- lhsReal);
- Value resultImag1 =
- rewriter.create<arith::DivFOp>(loc, imagNumerator1, rhsRealImagDenom);
+ loc,
+ rewriter.create<arith::MulFOp>(loc, lhsImag, rhsRealImagRatio, fmf),
+ lhsReal, fmf);
+ Value resultImag1 = rewriter.create<arith::DivFOp>(loc, imagNumerator1,
+ rhsRealImagDenom, fmf);
Value rhsImagRealRatio =
- rewriter.create<arith::DivFOp>(loc, rhsImag, rhsReal);
+ rewriter.create<arith::DivFOp>(loc, rhsImag, rhsReal, fmf);
Value rhsImagRealDenom = rewriter.create<arith::AddFOp>(
loc, rhsReal,
- rewriter.create<arith::MulFOp>(loc, rhsImagRealRatio, rhsImag));
+ rewriter.create<arith::MulFOp>(loc, rhsImagRealRatio, rhsImag, fmf),
+ fmf);
Value realNumerator2 = rewriter.create<arith::AddFOp>(
loc, lhsReal,
- rewriter.create<arith::MulFOp>(loc, lhsImag, rhsImagRealRatio));
- Value resultReal2 =
- rewriter.create<arith::DivFOp>(loc, realNumerator2, rhsImagRealDenom);
+ rewriter.create<arith::MulFOp>(loc, lhsImag, rhsImagRealRatio, fmf),
+ fmf);
+ Value resultReal2 = rewriter.create<arith::DivFOp>(loc, realNumerator2,
+ rhsImagRealDenom, fmf);
Value imagNumerator2 = rewriter.create<arith::SubFOp>(
loc, lhsImag,
- rewriter.create<arith::MulFOp>(loc, lhsReal, rhsImagRealRatio));
- Value resultImag2 =
- rewriter.create<arith::DivFOp>(loc, imagNumerator2, rhsImagRealDenom);
+ rewriter.create<arith::MulFOp>(loc, lhsReal, rhsImagRealRatio, fmf),
+ fmf);
+ Value resultImag2 = rewriter.create<arith::DivFOp>(loc, imagNumerator2,
+ rhsImagRealDenom, fmf);
// Consider corner cases.
// Case 1. Zero denominator, numerator contains at most one NaN value.
Value zero = rewriter.create<arith::ConstantOp>(
loc, elementType, rewriter.getZeroAttr(elementType));
- Value rhsRealAbs = rewriter.create<math::AbsFOp>(loc, rhsReal);
+ Value rhsRealAbs = rewriter.create<math::AbsFOp>(loc, rhsReal, fmf);
Value rhsRealIsZero = rewriter.create<arith::CmpFOp>(
loc, arith::CmpFPredicate::OEQ, rhsRealAbs, zero);
- Value rhsImagAbs = rewriter.create<math::AbsFOp>(loc, rhsImag);
+ Value rhsImagAbs = rewriter.create<math::AbsFOp>(loc, rhsImag, fmf);
Value rhsImagIsZero = rewriter.create<arith::CmpFOp>(
loc, arith::CmpFPredicate::OEQ, rhsImagAbs, zero);
Value lhsRealIsNotNaN = rewriter.create<arith::CmpFOp>(
@@ -347,9 +354,9 @@ struct DivOpConversion : public OpConversionPattern<complex::DivOp> {
Value infWithSignOfRhsReal =
rewriter.create<math::CopySignOp>(loc, inf, rhsReal);
Value infinityResultReal =
- rewriter.create<arith::MulFOp>(loc, infWithSignOfRhsReal, lhsReal);
+ rewriter.create<arith::MulFOp>(loc, infWithSignOfRhsReal, lhsReal, fmf);
Value infinityResultImag =
- rewriter.create<arith::MulFOp>(loc, infWithSignOfRhsReal, lhsImag);
+ rewriter.create<arith::MulFOp>(loc, infWithSignOfRhsReal, lhsImag, fmf);
// Case 2. Infinite numerator, finite denominator.
Value rhsRealFinite = rewriter.create<arith::CmpFOp>(
@@ -358,10 +365,10 @@ struct DivOpConversion : public OpConversionPattern<complex::DivOp> {
loc, arith::CmpFPredicate::ONE, rhsImagAbs, inf);
Value rhsFinite =
rewriter.create<arith::AndIOp>(loc, rhsRealFinite, rhsImagFinite);
- Value lhsRealAbs = rewriter.create<math::AbsFOp>(loc, lhsReal);
+ Value lhsRealAbs = rewriter.create<math::AbsFOp>(loc, lhsReal, fmf);
Value lhsRealInfinite = rewriter.create<arith::CmpFOp>(
loc, arith::CmpFPredicate::OEQ, lhsRealAbs, inf);
- Value lhsImagAbs = rewriter.create<math::AbsFOp>(loc, lhsImag);
+ Value lhsImagAbs = rewriter.create<math::AbsFOp>(loc, lhsImag, fmf);
Value lhsImagInfinite = rewriter.create<arith::CmpFOp>(
loc, arith::CmpFPredicate::OEQ, lhsImagAbs, inf);
Value lhsInfinite =
@@ -377,21 +384,23 @@ struct DivOpConversion : public OpConversionPattern<complex::DivOp> {
loc, rewriter.create<arith::SelectOp>(loc, lhsImagInfinite, one, zero),
lhsImag);
Value lhsRealIsInfWithSignTimesRhsReal =
- rewriter.create<arith::MulFOp>(loc, lhsRealIsInfWithSign, rhsReal);
+ rewriter.create<arith::MulFOp>(loc, lhsRealIsInfWithSign, rhsReal, fmf);
Value lhsImagIsInfWithSignTimesRhsImag =
- rewriter.create<arith::MulFOp>(loc, lhsImagIsInfWithSign, rhsImag);
+ rewriter.create<arith::MulFOp>(loc, lhsImagIsInfWithSign, rhsImag, fmf);
Value resultReal3 = rewriter.create<arith::MulFOp>(
loc, inf,
rewriter.create<arith::AddFOp>(loc, lhsRealIsInfWithSignTimesRhsReal,
- lhsImagIsInfWithSignTimesRhsImag));
+ lhsImagIsInfWithSignTimesRhsImag, fmf),
+ fmf);
Value lhsRealIsInfWithSignTimesRhsImag =
- rewriter.create<arith::MulFOp>(loc, lhsRealIsInfWithSign, rhsImag);
+ rewriter.create<arith::MulFOp>(loc, lhsRealIsInfWithSign, rhsImag, fmf);
Value lhsImagIsInfWithSignTimesRhsReal =
- rewriter.create<arith::MulFOp>(loc, lhsImagIsInfWithSign, rhsReal);
+ rewriter.create<arith::MulFOp>(loc, lhsImagIsInfWithSign, rhsReal, fmf);
Value resultImag3 = rewriter.create<arith::MulFOp>(
loc, inf,
rewriter.create<arith::SubFOp>(loc, lhsImagIsInfWithSignTimesRhsReal,
- lhsRealIsInfWithSignTimesRhsImag));
+ lhsRealIsInfWithSignTimesRhsImag, fmf),
+ fmf);
// Case 3: Finite numerator, infinite denominator.
Value lhsRealFinite = rewriter.create<arith::CmpFOp>(
@@ -415,21 +424,23 @@ struct DivOpConversion : public OpConversionPattern<complex::DivOp> {
loc, rewriter.create<arith::SelectOp>(loc, rhsImagInfinite, one, zero),
rhsImag);
Value rhsRealIsInfWithSignTimesLhsReal =
- rewriter.create<arith::MulFOp>(loc, lhsReal, rhsRealIsInfWithSign);
+ rewriter.create<arith::MulFOp>(loc, lhsReal, rhsRealIsInfWithSign, fmf);
Value rhsImagIsInfWithSignTimesLhsImag =
- rewriter.create<arith::MulFOp>(loc, lhsImag, rhsImagIsInfWithSign);
+ rewriter.create<arith::MulFOp>(loc, lhsImag, rhsImagIsInfWithSign, fmf);
Value resultReal4 = rewriter.create<arith::MulFOp>(
loc, zero,
rewriter.create<arith::AddFOp>(loc, rhsRealIsInfWithSignTimesLhsReal,
- rhsImagIsInfWithSignTimesLhsImag));
+ rhsImagIsInfWithSignTimesLhsImag, fmf),
+ fmf);
Value rhsRealIsInfWithSignTimesLhsImag =
- rewriter.create<arith::MulFOp>(loc, lhsImag, rhsRealIsInfWithSign);
+ rewriter.create<arith::MulFOp>(loc, lhsImag, rhsRealIsInfWithSign, fmf);
Value rhsImagIsInfWithSignTimesLhsReal =
- rewriter.create<arith::MulFOp>(loc, lhsReal, rhsImagIsInfWithSign);
+ rewriter.create<arith::MulFOp>(loc, lhsReal, rhsImagIsInfWithSign, fmf);
Value resultImag4 = rewriter.create<arith::MulFOp>(
loc, zero,
rewriter.create<arith::SubFOp>(loc, rhsRealIsInfWithSignTimesLhsImag,
- rhsImagIsInfWithSignTimesLhsReal));
+ rhsImagIsInfWithSignTimesLhsReal, fmf),
+ fmf);
Value realAbsSmallerThanImagAbs = rewriter.create<arith::CmpFOp>(
loc, arith::CmpFPredicate::OLT, rhsRealAbs, rhsImagAbs);
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 4fa3cdcbf85c..f425b1f59d99 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -75,6 +75,11 @@ Value getLaneId(ConversionPatternRewriter &rewriter, Location loc,
ValueRange{minus1, mbcntLo});
return laneId;
}
+static constexpr StringLiteral amdgcnDataLayout =
+ "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
+ "-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:"
+ "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
+ "G1-ni:7:8";
namespace {
struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
@@ -212,6 +217,12 @@ struct LowerGpuOpsToROCDLOpsPass
gpu::GPUModuleOp m = getOperation();
MLIRContext *ctx = m.getContext();
+ auto llvmDataLayout = m->getAttrOfType<StringAttr>(
+ LLVM::LLVMDialect::getDataLayoutAttrName());
+ if (!llvmDataLayout) {
+ llvmDataLayout = StringAttr::get(ctx, amdgcnDataLayout);
+ m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
+ }
// Request C wrapper emission.
for (auto func : m.getOps<func::FuncOp>()) {
func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
@@ -227,6 +238,7 @@ struct LowerGpuOpsToROCDLOpsPass
/// Customize the bitwidth used for the device side index computations.
LowerToLLVMOptions options(
ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
+ options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
options.overrideIndexBitwidth(indexBitwidth);
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index 19cc914efae0..337f8bb6ab99 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -1532,7 +1532,8 @@ public:
auto punct = printOp.getPunctuation();
if (auto stringLiteral = printOp.getStringLiteral()) {
LLVM::createPrintStrCall(rewriter, loc, parent, "vector_print_str",
- *stringLiteral, *getTypeConverter());
+ *stringLiteral, *getTypeConverter(),
+ /*addNewline=*/false);
} else if (punct != PrintPunctuation::NoPunctuation) {
emitCall(rewriter, printOp->getLoc(), [&] {
switch (punct) {
diff --git a/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp b/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp
index ff29f9f69385..b7e17a928973 100644
--- a/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp
@@ -27,7 +27,8 @@ DiagnosedSilenceableFailure
ApplyOptimizeSharedMemoryReadsAndWritesOp::applyToOne(
TransformRewriter &rewriter, FuncOp funcOp, ApplyToEachResultList &results,
TransformState &state) {
- optimizeSharedMemoryReadsAndWritesOp(funcOp);
+ optimizeSharedMemoryReadsAndWritesOp(funcOp, getSharedMemoryLineSizeBytes(),
+ getDefaultVectorSizeBits());
return DiagnosedSilenceableFailure::success();
}
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp
index 6bd03ed83389..32fab265e03c 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp
@@ -35,13 +35,6 @@ namespace amdgpu {
using namespace mlir;
using namespace mlir::amdgpu;
-/// The size of a shared memory line according to AMD documentation.
-/// https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/instinct-mi200-cdna2-instruction-set-architecture.pdf
-constexpr int64_t kSharedMemoryLineSizeBytes = 64;
-/// We optimize for 64bit accesses, but this can be made an argument in the
-/// future.
-constexpr int64_t kDefaultVectorSizeBits = 64;
-
/// Uses `srcIndexValue` to permute `tgtIndexValue` via
/// `result = xor(floordiv(srcIdxVal,permuteEveryN),
/// floordiv(tgtIdxVal,vectorSize)))
@@ -49,7 +42,9 @@ constexpr int64_t kDefaultVectorSizeBits = 64;
/// This is done using an optimized sequence of `arith` operations.
static Value permuteVectorOffset(OpBuilder &b, Location loc,
ArrayRef<Value> indices, MemRefType memrefTy,
- int64_t srcDim, int64_t tgtDim) {
+ int64_t srcDim, int64_t tgtDim,
+ int64_t sharedMemoryLineSizeBytes,
+ int64_t defaultVectorSizeBits) {
// Adjust the src index to change how often the permutation changes
// if necessary.
Value src = indices[srcDim];
@@ -57,9 +52,9 @@ static Value permuteVectorOffset(OpBuilder &b, Location loc,
// We only want to permute every N iterations of the target dim where N is
// ceil(sharedMemoryLineSizeBytes / dimSizeBytes(tgtDim)).
const int64_t permuteEveryN = std::max<int64_t>(
- 1, kSharedMemoryLineSizeBytes / ((memrefTy.getDimSize(tgtDim) *
- memrefTy.getElementTypeBitWidth()) /
- 8));
+ 1, sharedMemoryLineSizeBytes / ((memrefTy.getDimSize(tgtDim) *
+ memrefTy.getElementTypeBitWidth()) /
+ 8));
// clang-format off
// Index bit representation (b0 = least significant bit) for dim(1)
@@ -71,7 +66,7 @@ static Value permuteVectorOffset(OpBuilder &b, Location loc,
// bits[N:M] = vector index
// clang-format on
int64_t n =
- llvm::Log2_64(kDefaultVectorSizeBits / memrefTy.getElementTypeBitWidth());
+ llvm::Log2_64(defaultVectorSizeBits / memrefTy.getElementTypeBitWidth());
int64_t m = llvm::Log2_64(memrefTy.getDimSize(tgtDim));
// Capture bits[0:(M-N)] of src by first creating a (M-N) mask.
@@ -105,9 +100,11 @@ static Value permuteVectorOffset(OpBuilder &b, Location loc,
static void transformIndices(OpBuilder &builder, Location loc,
SmallVector<Value, 4> &indices,
MemRefType memrefTy, int64_t srcDim,
- int64_t tgtDim) {
+ int64_t tgtDim, int64_t sharedMemoryLineSizeBytes,
+ int64_t defaultVectorSizeBits) {
indices[tgtDim] =
- permuteVectorOffset(builder, loc, indices, memrefTy, srcDim, tgtDim);
+ permuteVectorOffset(builder, loc, indices, memrefTy, srcDim, tgtDim,
+ sharedMemoryLineSizeBytes, defaultVectorSizeBits);
}
// Return all operations within `parentOp` that read from or write to
@@ -149,8 +146,9 @@ getShmReadAndWriteOps(Operation *parentOp, Value shmMemRef,
return success();
}
-LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
- Value memrefValue) {
+LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(
+ Operation *parentOp, Value memrefValue, int64_t sharedMemoryLineSizeBytes,
+ int64_t defaultVectorSizeBits) {
auto memRefType = dyn_cast<MemRefType>(memrefValue.getType());
if (!memRefType ||
!amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(memRefType))
@@ -167,10 +165,10 @@ LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
// If dim[rank-1] is small enough to fit 8 rows in a 128B line.
const int64_t rowSize = memRefType.getDimSize(memRefType.getRank() - 1);
const int64_t rowsPerLine =
- (8 * kSharedMemoryLineSizeBytes / memRefType.getElementTypeBitWidth()) /
+ (8 * sharedMemoryLineSizeBytes / memRefType.getElementTypeBitWidth()) /
rowSize;
const int64_t threadGroupSize =
- 1LL << (7 - llvm::Log2_64(kDefaultVectorSizeBits / 8));
+ 1LL << (7 - llvm::Log2_64(defaultVectorSizeBits / 8));
if (rowsPerLine >= threadGroupSize)
return failure();
@@ -198,7 +196,8 @@ LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
auto indices = amdgpu::getIndices(shmWriteOp);
SmallVector<Value, 4> transformedIndices(indices->begin(), indices->end());
transformIndices(builder, shmWriteOp->getLoc(), transformedIndices,
- memRefType, srcDim, tgtDim);
+ memRefType, srcDim, tgtDim, sharedMemoryLineSizeBytes,
+ defaultVectorSizeBits);
amdgpu::setIndices(shmWriteOp, transformedIndices);
}
@@ -210,7 +209,8 @@ LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
auto indices = amdgpu::getIndices(shmReadOp);
SmallVector<Value, 4> transformedIndices(indices->begin(), indices->end());
transformIndices(builder, shmReadOp->getLoc(), transformedIndices,
- memRefType, srcDim, tgtDim);
+ memRefType, srcDim, tgtDim, sharedMemoryLineSizeBytes,
+ defaultVectorSizeBits);
amdgpu::setIndices(shmReadOp, transformedIndices);
}
@@ -218,7 +218,9 @@ LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
}
std::optional<LogicalResult>
-amdgpu::optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp) {
+amdgpu::optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp,
+ int64_t sharedMemoryLineSizeBytes,
+ int64_t defaultVectorSizeBits) {
SmallVector<memref::AllocOp> shmAllocOps;
funcOp.walk([&](memref::AllocOp allocOp) {
if (!amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(allocOp.getType()))
@@ -226,8 +228,9 @@ amdgpu::optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp) {
shmAllocOps.push_back(allocOp);
});
for (auto allocOp : shmAllocOps) {
- if (failed(amdgpu::optimizeSharedMemoryReadsAndWrites(funcOp,
- allocOp.getMemref())))
+ if (failed(amdgpu::optimizeSharedMemoryReadsAndWrites(
+ funcOp, allocOp.getMemref(), sharedMemoryLineSizeBytes,
+ defaultVectorSizeBits)))
return failure();
}
return success();
@@ -237,7 +240,8 @@ struct OptimizeSharedMemoryPass
: public amdgpu::impl::OptimizeSharedMemoryBase<OptimizeSharedMemoryPass> {
public:
OptimizeSharedMemoryPass() = default;
-
+ OptimizeSharedMemoryPass(const OptimizeSharedMemoryOptions &options)
+ : OptimizeSharedMemoryBase(options) {}
void runOnOperation() override {
Operation *op = getOperation();
SmallVector<memref::AllocOp> shmAllocOps;
@@ -248,8 +252,9 @@ public:
shmAllocOps.push_back(allocOp);
});
for (auto allocOp : shmAllocOps) {
- if (failed(optimizeSharedMemoryReadsAndWrites(getOperation(),
- allocOp.getMemref())))
+ if (failed(optimizeSharedMemoryReadsAndWrites(op, allocOp.getMemref(),
+ sharedMemoryLineSizeBytes,
+ defaultVectorSizeBits)))
return;
}
}
diff --git a/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp b/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
index 8deb8f028ba4..7f246daf99ff 100644
--- a/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
@@ -261,68 +261,62 @@ struct BFloat16TruncFOpConverter : public OpRewritePattern<arith::TruncFOp> {
return rewriter.notifyMatchFailure(op, "not a trunc of f32 to bf16.");
}
- Type i1Ty = b.getI1Type();
Type i16Ty = b.getI16Type();
Type i32Ty = b.getI32Type();
Type f32Ty = b.getF32Type();
if (auto shapedTy = dyn_cast<ShapedType>(operandTy)) {
- i1Ty = shapedTy.clone(i1Ty);
i16Ty = shapedTy.clone(i16Ty);
i32Ty = shapedTy.clone(i32Ty);
f32Ty = shapedTy.clone(f32Ty);
}
- Value bitcast = b.create<arith::BitcastOp>(i32Ty, operand);
-
- Value c23 = createConst(op.getLoc(), i32Ty, 23, rewriter);
- Value c31 = createConst(op.getLoc(), i32Ty, 31, rewriter);
- Value c23Mask = createConst(op.getLoc(), i32Ty, (1 << 23) - 1, rewriter);
- Value expMask =
- createConst(op.getLoc(), i32Ty, ((1 << 8) - 1) << 23, rewriter);
- Value expMax =
- createConst(op.getLoc(), i32Ty, ((1 << 8) - 2) << 23, rewriter);
-
- // Grab the sign bit.
- Value sign = b.create<arith::ShRUIOp>(bitcast, c31);
-
- // Our mantissa rounding value depends on the sign bit and the last
- // truncated bit.
- Value cManRound = createConst(op.getLoc(), i32Ty, (1 << 15), rewriter);
- cManRound = b.create<arith::SubIOp>(cManRound, sign);
-
- // Grab out the mantissa and directly apply rounding.
- Value man = b.create<arith::AndIOp>(bitcast, c23Mask);
- Value manRound = b.create<arith::AddIOp>(man, cManRound);
-
- // Grab the overflow bit and shift right if we overflow.
- Value roundBit = b.create<arith::ShRUIOp>(manRound, c23);
- Value manNew = b.create<arith::ShRUIOp>(manRound, roundBit);
-
- // Grab the exponent and round using the mantissa's carry bit.
- Value exp = b.create<arith::AndIOp>(bitcast, expMask);
- Value expCarry = b.create<arith::AddIOp>(exp, manRound);
- expCarry = b.create<arith::AndIOp>(expCarry, expMask);
-
- // If the exponent is saturated, we keep the max value.
- Value expCmp =
- b.create<arith::CmpIOp>(arith::CmpIPredicate::uge, exp, expMax);
- exp = b.create<arith::SelectOp>(expCmp, exp, expCarry);
-
- // If the exponent is max and we rolled over, keep the old mantissa.
- Value roundBitBool = b.create<arith::TruncIOp>(i1Ty, roundBit);
- Value keepOldMan = b.create<arith::AndIOp>(expCmp, roundBitBool);
- man = b.create<arith::SelectOp>(keepOldMan, man, manNew);
-
- // Assemble the now rounded f32 value (as an i32).
- Value rounded = b.create<arith::ShLIOp>(sign, c31);
- rounded = b.create<arith::OrIOp>(rounded, exp);
- rounded = b.create<arith::OrIOp>(rounded, man);
-
+ // Algorithm borrowed from this excellent code:
+ // https://github.com/pytorch/pytorch/blob/e1502c0cdbfd17548c612f25d5a65b1e4b86224d/c10/util/BFloat16.h#L60-L79
+ // There is a magic idea there, to let the addition of the rounding_bias to
+ // the mantissa simply overflow into the exponent bits. It's a bit of an
+ // aggressive, obfuscating optimization, but it is well-tested code, and it
+ // results in more concise and efficient IR.
+ // The case of NaN is handled separately (see isNaN and the final select).
+ // The case of infinities is NOT handled separately, which deserves an
+ // explanation. As the encoding of infinities has zero mantissa, the
+ // rounding-bias addition never carries into the exponent so that just gets
+ // truncated away, and as bfloat16 and float32 have the same number of
+ // exponent bits, that simple truncation is the desired outcome for
+ // infinities.
+ Value isNan =
+ b.create<arith::CmpFOp>(arith::CmpFPredicate::UNE, operand, operand);
+ // Constant used to make the rounding bias.
+ Value c7FFF = createConst(op.getLoc(), i32Ty, 0x7fff, rewriter);
+ // Constant used to generate a quiet NaN.
+ Value c7FC0_i16 = createConst(op.getLoc(), i16Ty, 0x7fc0, rewriter);
+ // Small constants used to address bits.
Value c16 = createConst(op.getLoc(), i32Ty, 16, rewriter);
- Value shr = b.create<arith::ShRUIOp>(rounded, c16);
- Value trunc = b.create<arith::TruncIOp>(i16Ty, shr);
- Value result = b.create<arith::BitcastOp>(resultTy, trunc);
-
+ Value c1 = createConst(op.getLoc(), i32Ty, 1, rewriter);
+ // Reinterpret the input f32 value as bits.
+ Value bitcast = b.create<arith::BitcastOp>(i32Ty, operand);
+ // Read bit 16 as a value in {0,1}.
+ Value bit16 =
+ b.create<arith::AndIOp>(b.create<arith::ShRUIOp>(bitcast, c16), c1);
+ // Determine the rounding bias to add as either 0x7fff or 0x8000 depending
+ // on bit 16, implementing the tie-breaking "to nearest even".
+ Value roundingBias = b.create<arith::AddIOp>(bit16, c7FFF);
+ // Add the rounding bias. Generally we want this to be added to the
+  // mantissa, but nothing prevents this from carrying into the exponent
+ // bits, which would feel like a bug, but this is the magic trick here:
+ // when that happens, the mantissa gets reset to zero and the exponent
+ // gets incremented by the carry... which is actually exactly what we
+ // want.
+ Value biased = b.create<arith::AddIOp>(bitcast, roundingBias);
+ // Now that the rounding-bias has been added, truncating the low bits
+ // yields the correctly rounded result.
+ Value biasedAndShifted = b.create<arith::ShRUIOp>(biased, c16);
+ Value normalCaseResult_i16 =
+ b.create<arith::TruncIOp>(i16Ty, biasedAndShifted);
+ // Select either the above-computed result, or a quiet NaN constant
+ // if the input was NaN.
+ Value select =
+ b.create<arith::SelectOp>(isNan, c7FC0_i16, normalCaseResult_i16);
+ Value result = b.create<arith::BitcastOp>(resultTy, select);
rewriter.replaceOp(op, result);
return success();
}
diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
index 935f0deaf9c8..db1974ddb377 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
@@ -11,6 +11,7 @@
//
//===----------------------------------------------------------------------===//
+#include "mlir/Config/mlir-config.h"
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h"
@@ -38,7 +39,7 @@
using namespace mlir;
-#if MLIR_CUDA_CONVERSIONS_ENABLED
+#if MLIR_ENABLE_CUDA_CONVERSIONS
namespace {
//===----------------------------------------------------------------------===//
@@ -127,4 +128,4 @@ void mlir::gpu::registerGPUToNVVMPipeline() {
buildLowerToNVVMPassPipeline);
}
-#endif // MLIR_CUDA_CONVERSIONS_ENABLED
+#endif // MLIR_ENABLE_CUDA_CONVERSIONS
diff --git a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
index 0527073da85b..f379ea819392 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
@@ -13,6 +13,7 @@
#include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Config/mlir-config.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
@@ -48,7 +49,7 @@ void GpuModuleToBinaryPass::getDependentDialects(
// Register all GPU related translations.
registry.insert<gpu::GPUDialect>();
registry.insert<LLVM::LLVMDialect>();
-#if MLIR_CUDA_CONVERSIONS_ENABLED == 1
+#if MLIR_ENABLE_CUDA_CONVERSIONS
registry.insert<NVVM::NVVMDialect>();
#endif
#if MLIR_ROCM_CONVERSIONS_ENABLED == 1
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 299965bcfc3a..ef9cd5561665 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -3245,6 +3245,31 @@ DiagnosedSilenceableFailure transform::ConvertConv2DToImg2ColOp::applyToOne(
}
//===----------------------------------------------------------------------===//
+// FlattenElementwiseLinalgOp.
+//===----------------------------------------------------------------------===//
+
+DiagnosedSilenceableFailure transform::FlattenElementwiseLinalgOp::applyToOne(
+ transform::TransformRewriter &rewriter, linalg::LinalgOp target,
+ transform::ApplyToEachResultList &results,
+ transform::TransformState &state) {
+ rewriter.setInsertionPoint(target);
+ if (target.getNumLoops() <= 1)
+ return DiagnosedSilenceableFailure::success();
+ ReassociationIndices reassociation(target.getNumLoops());
+ std::iota(reassociation.begin(), reassociation.end(), 0);
+ auto maybeFlattened =
+ (isElementwise(target))
+ ? collapseOpIterationDims(target, reassociation, rewriter)
+ : FailureOr<CollapseResult>(rewriter.notifyMatchFailure(
+ target, "only elementwise flattening is supported"));
+ if (failed(maybeFlattened))
+ return emitDefaultSilenceableFailure(target);
+ results.push_back(maybeFlattened->collapsedOp);
+ rewriter.replaceOp(target, maybeFlattened->results);
+ return DiagnosedSilenceableFailure::success();
+}
+
+//===----------------------------------------------------------------------===//
// TransposeConv2DOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
index 4977940cfbd7..4797bfb2267d 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
@@ -1446,24 +1446,20 @@ void generateCollapsedIndexingRegion(Location loc, Block *block,
}
}
-template <typename LinalgType>
-Operation *createCollapsedOp(LinalgType op,
- const CollapsingInfo &collapsingInfo,
- RewriterBase &rewriter) {
- static_assert(llvm::is_one_of<LinalgType, GenericOp, CopyOp>::value,
- "unsupported linalg op type to create");
+void collapseOperandsAndResults(LinalgOp op,
+ const CollapsingInfo &collapsingInfo,
+ RewriterBase &rewriter,
+ SmallVectorImpl<Value> &inputOperands,
+ SmallVectorImpl<Value> &outputOperands,
+ SmallVectorImpl<Type> &resultTypes) {
Location loc = op->getLoc();
-
- // Get the input operands.
- SmallVector<Value> inputOperands =
+ inputOperands =
llvm::map_to_vector(op.getDpsInputOperands(), [&](OpOperand *opOperand) {
return getCollapsedOpOperand(loc, op, opOperand, collapsingInfo,
rewriter);
});
// Get the output operands and result types.
- SmallVector<Type> resultTypes;
- SmallVector<Value> outputOperands;
resultTypes.reserve(op.getNumDpsInits());
outputOperands.reserve(op.getNumDpsInits());
for (OpOperand &output : op.getDpsInitsMutable()) {
@@ -1475,41 +1471,69 @@ Operation *createCollapsedOp(LinalgType op,
if (!op.hasPureBufferSemantics())
resultTypes.push_back(newOutput.getType());
}
+}
- if (isa<linalg::CopyOp>(op)) {
- return rewriter.create<linalg::CopyOp>(loc, inputOperands[0],
- outputOperands[0]);
- }
+/// Clone a `LinalgOp` to a collapsed version of same name
+template <typename OpTy>
+OpTy cloneToCollapsedOp(RewriterBase &rewriter, OpTy origOp,
+ const CollapsingInfo &collapsingInfo) {
+ return nullptr;
+}
- // Get the iterator types for the operand.
- SmallVector<utils::IteratorType> iteratorTypes =
- getCollapsedOpIteratorTypes(op.getIteratorTypesArray(), collapsingInfo);
+/// Collapse any `LinalgOp` that does not require any specialization such as
+/// indexing_maps, iterator_types, etc.
+template <>
+LinalgOp cloneToCollapsedOp<LinalgOp>(RewriterBase &rewriter, LinalgOp origOp,
+ const CollapsingInfo &collapsingInfo) {
+ SmallVector<Value> inputOperands, outputOperands;
+ SmallVector<Type> resultTypes;
+ collapseOperandsAndResults(origOp, collapsingInfo, rewriter, inputOperands,
+ outputOperands, resultTypes);
+ return cast<LinalgOp>(clone(
+ rewriter, origOp, resultTypes,
+ llvm::to_vector(llvm::concat<Value>(inputOperands, outputOperands))));
+}
- // Get the indexing maps.
- auto indexingMaps =
- llvm::map_to_vector(op.getIndexingMapsArray(), [&](AffineMap map) {
+/// Collapse a `GenericOp`
+template <>
+GenericOp cloneToCollapsedOp<GenericOp>(RewriterBase &rewriter,
+ GenericOp origOp,
+ const CollapsingInfo &collapsingInfo) {
+ SmallVector<Value> inputOperands, outputOperands;
+ SmallVector<Type> resultTypes;
+ collapseOperandsAndResults(origOp, collapsingInfo, rewriter, inputOperands,
+ outputOperands, resultTypes);
+ SmallVector<AffineMap> indexingMaps(
+ llvm::map_range(origOp.getIndexingMapsArray(), [&](AffineMap map) {
return getCollapsedOpIndexingMap(map, collapsingInfo);
- });
+ }));
+
+ SmallVector<utils::IteratorType> iteratorTypes(getCollapsedOpIteratorTypes(
+ origOp.getIteratorTypesArray(), collapsingInfo));
- Operation *collapsedOp = rewriter.create<linalg::GenericOp>(
- loc, resultTypes, inputOperands, outputOperands, indexingMaps,
+ GenericOp collapsedOp = rewriter.create<linalg::GenericOp>(
+ origOp.getLoc(), resultTypes, inputOperands, outputOperands, indexingMaps,
iteratorTypes, [](OpBuilder &builder, Location loc, ValueRange args) {});
- Block *origOpBlock = &op->getRegion(0).front();
+ Block *origOpBlock = &origOp->getRegion(0).front();
Block *collapsedOpBlock = &collapsedOp->getRegion(0).front();
rewriter.mergeBlocks(origOpBlock, collapsedOpBlock,
collapsedOpBlock->getArguments());
-
return collapsedOp;
}
+LinalgOp createCollapsedOp(LinalgOp op, const CollapsingInfo &collapsingInfo,
+ RewriterBase &rewriter) {
+ if (GenericOp genericOp = dyn_cast<GenericOp>(op.getOperation())) {
+ return cloneToCollapsedOp(rewriter, genericOp, collapsingInfo);
+ } else {
+ return cloneToCollapsedOp(rewriter, op, collapsingInfo);
+ }
+}
+
/// Implementation of fusion with reshape operation by collapsing dimensions.
-template <typename LinalgType>
-FailureOr<SmallVector<Value>> mlir::linalg::collapseOpIterationDims(
- LinalgType op, ArrayRef<ReassociationIndices> foldedIterationDims,
+FailureOr<CollapseResult> mlir::linalg::collapseOpIterationDims(
+ LinalgOp op, ArrayRef<ReassociationIndices> foldedIterationDims,
RewriterBase &rewriter) {
- static_assert(llvm::is_one_of<LinalgType, GenericOp, CopyOp>::value,
- "unsupported linalg op type to collapse");
-
// Bail on trivial no-op cases.
if (op.getNumLoops() <= 1 || foldedIterationDims.empty() ||
llvm::all_of(foldedIterationDims, [](ReassociationIndicesRef foldedDims) {
@@ -1538,8 +1562,7 @@ FailureOr<SmallVector<Value>> mlir::linalg::collapseOpIterationDims(
}
// Bail on non-canonical ranges.
- SmallVector<Range> loopRanges =
- cast<LinalgOp>(op.getOperation()).createLoopRanges(rewriter, op.getLoc());
+ SmallVector<Range> loopRanges = op.createLoopRanges(rewriter, op.getLoc());
auto opFoldIsConstantValue = [](OpFoldResult ofr, int64_t value) {
if (auto attr = llvm::dyn_cast_if_present<Attribute>(ofr))
return cast<IntegerAttr>(attr).getInt() == value;
@@ -1555,8 +1578,7 @@ FailureOr<SmallVector<Value>> mlir::linalg::collapseOpIterationDims(
op, "expected all loop ranges to have zero start and unit stride");
}
- LinalgType collapsedOp = cast<LinalgType>(
- createCollapsedOp<LinalgType>(op, collapsingInfo, rewriter));
+ LinalgOp collapsedOp = createCollapsedOp(op, collapsingInfo, rewriter);
Location loc = op->getLoc();
if (collapsedOp.hasIndexSemantics()) {
@@ -1597,7 +1619,7 @@ FailureOr<SmallVector<Value>> mlir::linalg::collapseOpIterationDims(
results.push_back(collapsedOpResult);
}
}
- return results;
+ return CollapseResult{results, collapsedOp};
}
namespace {
@@ -1629,15 +1651,14 @@ public:
continue;
}
- std::optional<SmallVector<Value>> replacements =
- collapseOpIterationDims<linalg::GenericOp>(
- genericOp, collapsableIterationDims, rewriter);
- if (!replacements) {
+ std::optional<CollapseResult> collapseResult = collapseOpIterationDims(
+ genericOp, collapsableIterationDims, rewriter);
+ if (!collapseResult) {
return rewriter.notifyMatchFailure(
genericOp, "failed to do the fusion by collapsing transformation");
}
- rewriter.replaceOp(genericOp, *replacements);
+ rewriter.replaceOp(genericOp, collapseResult->results);
return success();
}
return failure();
@@ -1671,13 +1692,12 @@ public:
op, "specified dimensions cannot be collapsed");
}
- std::optional<SmallVector<Value>> replacements =
- collapseOpIterationDims<LinalgType>(op, collapsableIterationDims,
- rewriter);
- if (!replacements) {
+ std::optional<CollapseResult> collapseResult =
+ collapseOpIterationDims(op, collapsableIterationDims, rewriter);
+ if (!collapseResult) {
return rewriter.notifyMatchFailure(op, "failed to collapse dimensions");
}
- rewriter.replaceOp(op, *replacements);
+ rewriter.replaceOp(op, collapseResult->results);
return success();
}
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index ac043e87223d..1e703dacfd0c 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -891,8 +891,7 @@ static bool isContiguousLoadIdx(LinalgOp &linalgOp, Value &val,
// Conservatively reject Ops that could lead to indices with stride other
// than 1.
- if (!isa<arith::AddIOp, arith::SubIOp, arith::ConstantOp, linalg::IndexOp>(
- ancestor))
+ if (!isa<arith::AddIOp, arith::ConstantOp, linalg::IndexOp>(ancestor))
return false;
bool result = false;
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index c2b471ab9618..8a6980e2c6a2 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -1957,7 +1957,10 @@ LogicalResult PrivateClauseOp::verify() {
Type symType = getType();
auto verifyTerminator = [&](Operation *terminator) -> LogicalResult {
- if (!terminator->hasSuccessors() && !llvm::isa<YieldOp>(terminator))
+ if (!terminator->getBlock()->getSuccessors().empty())
+ return success();
+
+ if (!llvm::isa<YieldOp>(terminator))
return mlir::emitError(terminator->getLoc())
<< "expected exit block terminator to be an `omp.yield` op.";
diff --git a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp
index 455e90baf0a7..92e5efaa8104 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp
@@ -62,6 +62,8 @@ FailureOr<uint64_t> LvlTypeParser::parseLvlType(AsmParser &parser) const {
// Set the base bit for properties.
if (base.compare("dense") == 0) {
properties |= static_cast<uint64_t>(LevelFormat::Dense);
+ } else if (base.compare("batch") == 0) {
+ properties |= static_cast<uint64_t>(LevelFormat::Batch);
} else if (base.compare("compressed") == 0) {
properties |= static_cast<uint64_t>(LevelFormat::Compressed);
} else if (base.compare("structured") == 0) {
diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
index af7b85d45877..69c3413f35ea 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
@@ -126,13 +126,16 @@ void sparse_tensor::foreachFieldAndTypeInSparseTensor(
const Type posType = stt.getPosType();
const Type eltType = stt.getElementType();
+ SmallVector<int64_t> memrefShape = stt.getBatchLvlShape();
+ memrefShape.push_back(ShapedType::kDynamic);
+
const Type specType = StorageSpecifierType::get(stt.getEncoding());
- // memref<? x pos> positions
- const Type posMemType = MemRefType::get({ShapedType::kDynamic}, posType);
- // memref<? x crd> coordinates
- const Type crdMemType = MemRefType::get({ShapedType::kDynamic}, crdType);
- // memref<? x eltType> values
- const Type valMemType = MemRefType::get({ShapedType::kDynamic}, eltType);
+ // memref<[batch] x ? x pos> positions
+ const Type posMemType = MemRefType::get(memrefShape, posType);
+ // memref<[batch] x ? x crd> coordinates
+ const Type crdMemType = MemRefType::get(memrefShape, crdType);
+ // memref<[batch] x ? x eltType> values
+ const Type valMemType = MemRefType::get(memrefShape, eltType);
StorageLayout(stt).foreachField([specType, posMemType, crdMemType, valMemType,
callback](FieldIndex fieldIdx,
@@ -336,6 +339,12 @@ SparseTensorEncodingAttr SparseTensorEncodingAttr::withoutDimSlices() const {
return withDimSlices(ArrayRef<SparseTensorDimSliceAttr>{});
}
+uint64_t SparseTensorEncodingAttr::getBatchLvlRank() const {
+ ArrayRef<LevelType> lvlTypes = getLvlTypes();
+ auto lastBatch = std::find_if(lvlTypes.rbegin(), lvlTypes.rend(), isBatchLT);
+ return std::distance(lastBatch, lvlTypes.rend());
+}
+
bool SparseTensorEncodingAttr::isAllDense() const {
return !getImpl() || llvm::all_of(getLvlTypes(), isDenseLT);
}
@@ -690,6 +699,10 @@ LogicalResult SparseTensorEncodingAttr::verify(
}
}
+ auto lastBatch = std::find_if(lvlTypes.rbegin(), lvlTypes.rend(), isBatchLT);
+ if (!std::all_of(lastBatch, lvlTypes.rend(), isBatchLT))
+ return emitError() << "Batch lvlType can only be leading levels.";
+
// SoA property can only be applied on singleton level.
auto soaLvls = llvm::make_filter_range(lvlTypes, [](LevelType lt) {
return lt.isa<LevelPropNonDefault::SoA>();
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
index 0ccb11f3a6b8..d5eec4ae67e7 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
@@ -1293,7 +1293,7 @@ struct SparseAssembleOpConverter : public OpConversionPattern<AssembleOp> {
Value tensor = fKind == SparseTensorFieldKind::ValMemRef
? op.getValues()
: op.getLevels()[fIdx];
-
+ // TODO: handle batch.
TypedValue<BaseMemRefType> mem = genToMemref(rewriter, loc, tensor);
if (mem.getType().getRank() > 1) {
// Flattens the buffer to rank 1.
@@ -1322,9 +1322,8 @@ struct SparseAssembleOpConverter : public OpConversionPattern<AssembleOp> {
for (Level lvl = 0, lvlRank = stt.getLvlRank(); lvl < lvlRank; lvl++) {
assert(!ShapedType::isDynamic(stt.getDimShape()[lvl]));
- // FIXME: dim/lvl confusion!
// Sets up the level size.
- auto lvlSize = constantIndex(rewriter, loc, stt.getDimShape()[lvl]);
+ auto lvlSize = constantIndex(rewriter, loc, stt.getLvlShape()[lvl]);
desc.setLvlSize(rewriter, loc, lvl, lvlSize);
// We use a single AOS array to store the trailing COO, so there is only
// one memory size to set for the entire COO section.
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp
index 61a3703b73bf..011d814cd900 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp
@@ -1278,6 +1278,8 @@ sparse_tensor::makeSparseTensorLevel(OpBuilder &b, Location l, Value t,
switch (lt.getLvlFmt()) {
case LevelFormat::Dense:
return std::make_unique<DenseLevel>(tid, lvl, sz, stt.hasEncoding());
+ case LevelFormat::Batch:
+ llvm_unreachable("not implemented");
case LevelFormat::Compressed: {
Value pos = genToPositions(b, l, t, lvl);
Value crd = genToCoordinates(b, l, t, lvl);
diff --git a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
index 731cd79a1e3b..72b722c69ae3 100644
--- a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
@@ -476,7 +476,7 @@ BitVector Merger::simplifyCond(LatSetId s0, LatPointId p0) {
// Starts resetting from a dense level, so that the first bit (if kept)
// is not undefined level-type.
for (unsigned b = 0; b < be; b++) {
- if (simple[b] && isDenseLT(getLvlType(TensorLoopId{b}))) {
+ if (simple[b] && getLvlType(TensorLoopId{b}).hasDenseSemantic()) {
offset = be - b - 1; // relative to the end
break;
}
@@ -489,8 +489,7 @@ BitVector Merger::simplifyCond(LatSetId s0, LatPointId p0) {
// Slice on dense level has `locate` property as well, and can be optimized.
if (simple[b] && !isSparseLvlWithNonTrivialIdxExp(b)) {
const auto lt = getLvlType(b);
- if (!isCompressedLT(lt) && !isSingletonLT(lt) &&
- !isLooseCompressedLT(lt) && !isNOutOfMLT(lt)) {
+ if (!lt.hasSparseSemantic()) {
if (reset)
simple.reset(b);
reset = true;
@@ -670,8 +669,7 @@ bool Merger::isSingleCondition(TensorId t, ExprId e) const {
bool Merger::hasAnySparse(const BitVector &bits) const {
for (TensorLoopId b : bits.set_bits()) {
const auto lt = getLvlType(b);
- if (isCompressedLT(lt) || isSingletonLT(lt) || isLooseCompressedLT(lt) ||
- isNOutOfMLT(lt))
+ if (lt.hasSparseSemantic())
return true;
}
return hasSparseIdxReduction(bits);
diff --git a/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp b/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp
index 5073234a7e35..7adf223f3440 100644
--- a/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp
+++ b/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp
@@ -50,12 +50,79 @@ static Operation *findPayloadRoot(Operation *passRoot, StringRef tag) {
return WalkResult::interrupt();
});
+ if (!target) {
+ passRoot->emitError()
+ << "could not find the operation with transform.target_tag=\"" << tag
+ << "\" attribute";
+ return nullptr;
+ }
+
return walkResult.wasInterrupted() ? nullptr : target;
}
namespace {
class InterpreterPass
: public transform::impl::InterpreterPassBase<InterpreterPass> {
+ // Parses the pass arguments to bind trailing arguments of the entry point.
+ std::optional<RaggedArray<transform::MappedValue>>
+ parseArguments(Operation *payloadRoot) {
+ MLIRContext *context = payloadRoot->getContext();
+
+ SmallVector<SmallVector<transform::MappedValue>, 2> trailingBindings;
+ trailingBindings.resize(debugBindTrailingArgs.size());
+
+ // Construct lists of op names to match.
+ SmallVector<std::optional<OperationName>> debugBindNames;
+ debugBindNames.reserve(debugBindTrailingArgs.size());
+ for (auto &&[position, nameString] :
+ llvm::enumerate(debugBindTrailingArgs)) {
+ StringRef name = nameString;
+
+ // Parse the integer literals.
+ if (name.starts_with("#")) {
+ debugBindNames.push_back(std::nullopt);
+ StringRef lhs = "";
+ StringRef rhs = name.drop_front();
+ do {
+ std::tie(lhs, rhs) = rhs.split(';');
+ int64_t value;
+ if (lhs.getAsInteger(10, value)) {
+ emitError(UnknownLoc::get(context))
+ << "couldn't parse integer pass argument " << name;
+ return std::nullopt;
+ }
+ trailingBindings[position].push_back(
+ Builder(context).getI64IntegerAttr(value));
+ } while (!rhs.empty());
+ } else if (name.starts_with("^")) {
+ debugBindNames.emplace_back(OperationName(name.drop_front(), context));
+ } else {
+ debugBindNames.emplace_back(OperationName(name, context));
+ }
+ }
+
+ // Collect operations or results for extra bindings.
+ payloadRoot->walk([&](Operation *payload) {
+ for (auto &&[position, name] : llvm::enumerate(debugBindNames)) {
+ if (!name || payload->getName() != *name)
+ continue;
+
+ if (StringRef(*std::next(debugBindTrailingArgs.begin(), position))
+ .starts_with("^")) {
+ llvm::append_range(trailingBindings[position], payload->getResults());
+ } else {
+ trailingBindings[position].push_back(payload);
+ }
+ }
+ });
+
+ RaggedArray<transform::MappedValue> bindings;
+ bindings.push_back(ArrayRef<Operation *>{payloadRoot});
+ for (SmallVector<transform::MappedValue> &trailing : trailingBindings)
+ bindings.push_back(std::move(trailing));
+ return bindings;
+ }
+
public:
using Base::Base;
@@ -67,34 +134,18 @@ public:
findPayloadRoot(getOperation(), debugPayloadRootTag);
if (!payloadRoot)
return signalPassFailure();
- auto debugBindNames = llvm::map_to_vector(
- debugBindTrailingArgs,
- [&](const std::string &name) { return OperationName(name, context); });
- SmallVector<SmallVector<Operation *>, 2> trailingBindings;
- trailingBindings.resize(debugBindNames.size());
- payloadRoot->walk([&](Operation *payload) {
- for (auto &&[position, name] : llvm::enumerate(debugBindNames)) {
- if (payload->getName() == name)
- trailingBindings[position].push_back(payload);
- }
- });
Operation *transformEntryPoint = transform::detail::findTransformEntryPoint(
getOperation(), transformModule, entryPoint);
- if (!transformEntryPoint) {
- getOperation()->emitError()
- << "could not find transform entry point: " << entryPoint
- << " in either payload or transform module";
+ if (!transformEntryPoint)
return signalPassFailure();
- }
-
- RaggedArray<transform::MappedValue> bindings;
- bindings.push_back(ArrayRef<Operation *>{payloadRoot});
- for (SmallVector<Operation *> &trailing : trailingBindings)
- bindings.push_back(std::move(trailing));
+ std::optional<RaggedArray<transform::MappedValue>> bindings =
+ parseArguments(payloadRoot);
+ if (!bindings)
+ return signalPassFailure();
if (failed(transform::applyTransformNamedSequence(
- bindings,
+ *bindings,
cast<transform::TransformOpInterface>(transformEntryPoint),
transformModule,
options.enableExpensiveChecks(!disableExpensiveChecks)))) {
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index 620ceee48b19..b3ab4a916121 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -443,15 +443,24 @@ static vector::TransferWriteOp cloneWriteOp(RewriterBase &rewriter,
/// d1) and return vector<16x2x64>
static VectorType getDistributedType(VectorType originalType, AffineMap map,
int64_t warpSize) {
- if (map.getNumResults() != 1)
- return VectorType();
SmallVector<int64_t> targetShape(originalType.getShape().begin(),
originalType.getShape().end());
for (unsigned i = 0, e = map.getNumResults(); i < e; i++) {
unsigned position = map.getDimPosition(i);
- if (targetShape[position] % warpSize != 0)
- return VectorType();
+ if (targetShape[position] % warpSize != 0) {
+ if (warpSize % targetShape[position] != 0) {
+ return VectorType();
+ }
+ warpSize /= targetShape[position];
+ targetShape[position] = 1;
+ continue;
+ }
targetShape[position] = targetShape[position] / warpSize;
+ warpSize = 1;
+ break;
+ }
+ if (warpSize != 1) {
+ return VectorType();
}
VectorType targetType =
VectorType::get(targetShape, originalType.getElementType());
@@ -526,7 +535,30 @@ struct WarpOpTransferWrite : public OpRewritePattern<WarpExecuteOnLane0Op> {
// 4. Reindex the write using the distribution map.
auto newWarpOp =
newWriteOp.getVector().getDefiningOp<WarpExecuteOnLane0Op>();
+
+ // Delinearize the lane id based on the way threads are divided across the
+ // vector. To get the number of threads per vector dimension, divide the
+ // sequential size by the distributed size along each dim.
rewriter.setInsertionPoint(newWriteOp);
+ SmallVector<OpFoldResult> delinearizedIdSizes;
+ for (auto [seqSize, distSize] :
+ llvm::zip_equal(writtenVectorType.getShape(), targetType.getShape())) {
+ assert(seqSize % distSize == 0 && "Invalid distributed vector shape");
+ delinearizedIdSizes.push_back(rewriter.getIndexAttr(seqSize / distSize));
+ }
+ SmallVector<Value> delinearized;
+ if (map.getNumResults() > 1) {
+ delinearized = rewriter
+ .create<mlir::affine::AffineDelinearizeIndexOp>(
+ newWarpOp.getLoc(), newWarpOp.getLaneid(),
+ delinearizedIdSizes)
+ .getResults();
+ } else {
+ // If there is only one map result, we can elide the delinearization
+ // op and use the lane id directly.
+ delinearized.append(targetType.getRank(), newWarpOp.getLaneid());
+ }
+
AffineMap indexMap = map.compose(newWriteOp.getPermutationMap());
Location loc = newWriteOp.getLoc();
SmallVector<Value> indices(newWriteOp.getIndices().begin(),
@@ -539,11 +571,11 @@ struct WarpOpTransferWrite : public OpRewritePattern<WarpExecuteOnLane0Op> {
continue;
unsigned indexPos = indexExpr.getPosition();
unsigned vectorPos = cast<AffineDimExpr>(std::get<1>(it)).getPosition();
+ Value laneId = delinearized[vectorPos];
auto scale =
rewriter.getAffineConstantExpr(targetType.getDimSize(vectorPos));
indices[indexPos] = affine::makeComposedAffineApply(
- rewriter, loc, d0 + scale * d1,
- {indices[indexPos], newWarpOp.getLaneid()});
+ rewriter, loc, d0 + scale * d1, {indices[indexPos], laneId});
}
newWriteOp.getIndicesMutable().assign(indices);
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
index fc11ae63e718..dc6f126aae4c 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
@@ -729,8 +729,8 @@ static LogicalResult commonConversionPrecondition(PatternRewriter &rewriter,
// TODO: consider relaxing this restriction in the future if we find ways
// to really work with subbyte elements across the MLIR/LLVM boundary.
- unsigned resultBitwidth = preconditionType.getElementTypeBitWidth();
- if (resultBitwidth % 8 != 0)
+ unsigned bitwidth = preconditionType.getElementTypeBitWidth();
+ if (bitwidth % 8 != 0)
return rewriter.notifyMatchFailure(op, "bitwidth is not k * 8");
return success();
@@ -768,6 +768,10 @@ static LogicalResult alignedConversionPrecondition(PatternRewriter &rewriter,
(dstElemBitwidth % srcElemBitwidth) != 0)
return rewriter.notifyMatchFailure(op, "Not a supported aligned case");
+ if ((srcType.getShape().back() % 2) != 0)
+ return rewriter.notifyMatchFailure(
+ op, "Not an even number of i4 elements in trailing dim");
+
return success();
}
@@ -876,6 +880,58 @@ static Value rewriteI4ToI8SignedExt(PatternRewriter &rewriter, Location loc,
return rewriter.create<vector::InterleaveOp>(loc, low, high);
}
+/// Rewrite the i8 -> i4 truncation into a sequence of shuffles and bitwise ops
+/// that take advantage of high-level information to avoid leaving LLVM to
+/// scramble with peephole optimizations.
+static Value rewriteI8ToI4Trunc(PatternRewriter &rewriter, Location loc,
+ Value srcValue) {
+ VectorType srcVecType = cast<VectorType>(srcValue.getType());
+ assert(srcVecType.getElementType().isSignlessInteger(8) &&
+ "Expected i8 type");
+
+ // 1. De-interleave low and high i8 elements.
+ int64_t vecDimSize = srcVecType.getShape().back();
+ SmallVector<int64_t> deinterleaveLowMaskValues;
+ SmallVector<int64_t> deinterleaveHighMaskValues;
+ assert((vecDimSize % 2) == 0 && "Odd number of i4 elements");
+ deinterleaveLowMaskValues.reserve(vecDimSize / 2);
+ deinterleaveHighMaskValues.reserve(vecDimSize / 2);
+ for (int i = 0, end = vecDimSize; i < end; i += 2) {
+ deinterleaveLowMaskValues.push_back(i);
+ deinterleaveHighMaskValues.push_back(i + 1);
+ }
+
+ auto lowShuffleOp = rewriter.create<vector::ShuffleOp>(
+ loc, srcValue, srcValue,
+ rewriter.getI64ArrayAttr(deinterleaveLowMaskValues));
+ auto highShuffleOp = rewriter.create<vector::ShuffleOp>(
+ loc, srcValue, srcValue,
+ rewriter.getI64ArrayAttr(deinterleaveHighMaskValues));
+
+ // 2. Zero out the upper side of each low i8 element.
+ constexpr int8_t i8LowBitMask = 0x0F;
+ Value zeroOutMask = rewriter.create<arith::ConstantOp>(
+ loc,
+ DenseElementsAttr::get(lowShuffleOp.getResultVectorType(), i8LowBitMask));
+ Value zeroOutLow =
+ rewriter.create<arith::AndIOp>(loc, lowShuffleOp, zeroOutMask);
+
+ // 3. Move high i4 values to upper side of the byte.
+ constexpr int8_t bitsToShift = 4;
+ VectorType deinterI8VecType = highShuffleOp.getResultVectorType();
+ auto shiftValues = rewriter.create<arith::ConstantOp>(
+ loc, DenseElementsAttr::get(deinterI8VecType, bitsToShift));
+ Value shlHigh =
+ rewriter.create<arith::ShLIOp>(loc, highShuffleOp, shiftValues);
+
+ // 4. Merge high and low i4 values.
+ auto mergedHiLowOp = rewriter.create<arith::OrIOp>(loc, zeroOutLow, shlHigh);
+
+ // 5. Generate a bitcast vector<Xxi8> -> vector<2Xxi4>.
+ auto i4VecType = srcVecType.cloneWith(std::nullopt, rewriter.getI4Type());
+ return rewriter.create<vector::BitCastOp>(loc, i4VecType, mergedHiLowOp);
+}
+
namespace {
/// Rewrite bitcast(trunci) to a sequence of shuffles and bitwise ops that take
/// advantage of high-level information to avoid leaving LLVM to scramble with
@@ -1019,7 +1075,7 @@ struct RewriteAlignedSubByteIntSignedExt : OpRewritePattern<ConversionOpType> {
LogicalResult matchAndRewrite(ConversionOpType conversionOp,
PatternRewriter &rewriter) const override {
- // Set up the BitCastRewriter and verify the preconditions.
+ // Verify the preconditions.
Value srcValue = conversionOp.getIn();
auto srcVecType = dyn_cast<VectorType>(srcValue.getType());
auto dstVecType = dyn_cast<VectorType>(conversionOp.getType());
@@ -1043,6 +1099,65 @@ struct RewriteAlignedSubByteIntSignedExt : OpRewritePattern<ConversionOpType> {
}
};
+/// Rewrite the i8 -> i4 part of any truncation into a sequence of shuffles and
+/// bitwise ops that take advantage of high-level information to avoid leaving
+/// LLVM to scramble with peephole optimizations.
+///
+/// For example:
+/// arith.trunci %in : vector<8xi32> to vector<8xi4>
+/// is rewritten as
+///
+/// %cst = arith.constant dense<15> : vector<4xi8>
+/// %cst_0 = arith.constant dense<4> : vector<4xi8>
+/// %0 = arith.trunci %in : vector<8xi32> to vector<8xi8>
+/// %1 = vector.shuffle %0, %0 [0, 2, 4, 6] : vector<8xi8>, vector<8xi8>
+/// %2 = vector.shuffle %0, %0 [1, 3, 5, 7] : vector<8xi8>, vector<8xi8>
+/// %3 = arith.andi %1, %cst : vector<4xi8>
+/// %4 = arith.shli %2, %cst_0 : vector<4xi8>
+/// %5 = arith.ori %3, %4 : vector<4xi8>
+/// %6 = vector.bitcast %5 : vector<4xi8> to vector<8xi4>
+///
+struct RewriteAlignedSubByteIntTrunc : OpRewritePattern<arith::TruncIOp> {
+ using OpRewritePattern<arith::TruncIOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(arith::TruncIOp truncOp,
+ PatternRewriter &rewriter) const override {
+ // Verify the preconditions.
+ Value srcValue = truncOp.getIn();
+ auto srcVecType = dyn_cast<VectorType>(srcValue.getType());
+ auto dstVecType = dyn_cast<VectorType>(truncOp.getType());
+ if (!srcVecType || !dstVecType)
+ return failure();
+
+ // Only single dim vectors are supported until we have
+ // `vector.deinterleave`.
+ if (srcVecType.getRank() != 1)
+ return failure();
+
+ if (failed(commonConversionPrecondition(rewriter, srcVecType, truncOp)))
+ return failure();
+
+ // Check general alignment preconditions. We invert the src/dst type order
+ // to reuse the existing precondition logic.
+ if (failed(alignedConversionPrecondition(rewriter, dstVecType, srcVecType,
+ truncOp)))
+ return failure();
+
+ // Create a new iX -> i8 truncation op.
+ Location loc = truncOp.getLoc();
+ auto i8VecType = srcVecType.cloneWith(std::nullopt, rewriter.getI8Type());
+ Value i8TruncVal =
+ rewriter.create<arith::TruncIOp>(loc, i8VecType, srcValue);
+
+ // Rewrite the i8 -> i4 truncation part.
+ Value subByteTrunc = rewriteI8ToI4Trunc(rewriter, loc, i8TruncVal);
+
+ // Finalize the rewrite.
+ rewriter.replaceOp(truncOp, subByteTrunc);
+ return success();
+ }
+};
+
/// Rewrite a sub-byte vector transpose into a sequence of instructions that
/// perform the transpose on wider (byte) element types.
/// For example:
@@ -1115,8 +1230,9 @@ void vector::populateVectorNarrowTypeRewritePatterns(
// Patterns for aligned cases. We set higher priority as they are expected to
// generate better performance for aligned cases.
patterns.add<RewriteAlignedSubByteIntSignedExt<arith::ExtSIOp>,
- RewriteAlignedSubByteIntSignedExt<arith::SIToFPOp>>(
- patterns.getContext(), benefit.getBenefit() + 1);
+ RewriteAlignedSubByteIntSignedExt<arith::SIToFPOp>,
+ RewriteAlignedSubByteIntTrunc>(patterns.getContext(),
+ benefit.getBenefit() + 1);
}
void vector::populateVectorTransposeNarrowTypeRewritePatterns(
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
index 74dd1dfaca0d..a2d4e2166331 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
@@ -713,6 +713,76 @@ struct BubbleDownBitCastForStridedSliceExtract
// Shuffles vector.bitcast op before vector.insert_strided_slice op.
//
// This transforms IR like:
+// %0 = vector.insert %val, %dst[4] : vector<32xi4> into vector<8x32xi4>
+// %1 = vector.bitcast %0 : vector<8x32xi4> to vector<8x16xi8>
+// Into:
+// %0 = vector.bitcast %val : vector<32xi4> to vector<16xi8>
+// %1 = vector.bitcast %dst : vector<8x32xi4> to vector<8x16xi8>
+// %2 = vector.insert %0, %1 [4] : vector<16xi8> into vector<8x16xi8>
+//
+struct BubbleUpBitCastForInsert : public OpRewritePattern<vector::BitCastOp> {
+ using OpRewritePattern::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(vector::BitCastOp bitcastOp,
+ PatternRewriter &rewriter) const override {
+ VectorType castSrcType = bitcastOp.getSourceVectorType();
+ VectorType castDstType = bitcastOp.getResultVectorType();
+
+ // 0-D and scalable vectors are not supported yet.
+ if (castSrcType.getRank() == 0 || castSrcType.isScalable() ||
+ castDstType.isScalable())
+ return failure();
+
+ int64_t castSrcLastDim = castSrcType.getShape().back();
+ int64_t castDstLastDim = castDstType.getShape().back();
+ bool isNumElemsShrink = castSrcLastDim >= castDstLastDim;
+ int64_t ratio;
+ if (isNumElemsShrink) {
+ assert(castSrcLastDim % castDstLastDim == 0);
+ ratio = castSrcLastDim / castDstLastDim;
+ } else {
+ assert(castDstLastDim % castSrcLastDim == 0);
+ ratio = castDstLastDim / castSrcLastDim;
+ }
+
+ auto insertOp = bitcastOp.getSource().getDefiningOp<vector::InsertOp>();
+ if (!insertOp)
+ return failure();
+
+ // Only vector sources are supported for now.
+ auto insertSrcType = dyn_cast<VectorType>(insertOp.getSourceType());
+ if (!insertSrcType)
+ return failure();
+
+ // Bitcast the source.
+ SmallVector<int64_t> srcDims(insertSrcType.getShape());
+ srcDims.back() =
+ isNumElemsShrink ? srcDims.back() / ratio : srcDims.back() * ratio;
+ VectorType newCastSrcType =
+ VectorType::get(srcDims, castDstType.getElementType());
+ auto newCastSrcOp = rewriter.create<vector::BitCastOp>(
+ bitcastOp.getLoc(), newCastSrcType, insertOp.getSource());
+
+ SmallVector<int64_t> dstDims(insertOp.getDestVectorType().getShape());
+ dstDims.back() =
+ isNumElemsShrink ? dstDims.back() / ratio : dstDims.back() * ratio;
+ VectorType newCastDstType =
+ VectorType::get(dstDims, castDstType.getElementType());
+
+ // Bitcast the destination.
+ auto newCastDstOp = rewriter.create<vector::BitCastOp>(
+ bitcastOp.getLoc(), newCastDstType, insertOp.getDest());
+
+ // Generate new insert.
+ rewriter.replaceOpWithNewOp<vector::InsertOp>(
+ bitcastOp, newCastSrcOp, newCastDstOp, insertOp.getMixedPosition());
+ return success();
+ }
+};
+
+// Shuffles vector.bitcast op before vector.insert_strided_slice op.
+//
+// This transforms IR like:
// %0 = vector.insert_strided_slice %src, %dst {
// offsets = [0], strides = [1]} : vector<4xf16> into vector<8xf16>
// %1 = vector.bitcast %0: vector<8xf16> to vector<4xf32>
@@ -1782,8 +1852,8 @@ void mlir::vector::populateBubbleVectorBitCastOpPatterns(
RewritePatternSet &patterns, PatternBenefit benefit) {
patterns.add<BubbleDownVectorBitCastForExtract,
BubbleDownBitCastForStridedSliceExtract,
- BubbleUpBitCastForStridedSliceInsert>(patterns.getContext(),
- benefit);
+ BubbleUpBitCastForInsert, BubbleUpBitCastForStridedSliceInsert>(
+ patterns.getContext(), benefit);
}
void mlir::vector::populateBreakDownVectorBitCastOpPatterns(
diff --git a/mlir/lib/Target/LLVM/NVVM/Target.cpp b/mlir/lib/Target/LLVM/NVVM/Target.cpp
index 71b15a92782e..d5b6645631ed 100644
--- a/mlir/lib/Target/LLVM/NVVM/Target.cpp
+++ b/mlir/lib/Target/LLVM/NVVM/Target.cpp
@@ -13,6 +13,7 @@
#include "mlir/Target/LLVM/NVVM/Target.h"
+#include "mlir/Config/mlir-config.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Target/LLVM/NVVM/Utils.h"
@@ -156,7 +157,7 @@ SerializeGPUModuleBase::loadBitcodeFiles(llvm::Module &module) {
return std::move(bcFiles);
}
-#if MLIR_CUDA_CONVERSIONS_ENABLED == 1
+#if MLIR_ENABLE_CUDA_CONVERSIONS
namespace {
class NVPTXSerializer : public SerializeGPUModuleBase {
public:
@@ -562,7 +563,7 @@ NVPTXSerializer::moduleToObject(llvm::Module &llvmModule) {
return compileToBinary(*serializedISA);
#endif // MLIR_NVPTXCOMPILER_ENABLED == 1
}
-#endif // MLIR_CUDA_CONVERSIONS_ENABLED == 1
+#endif // MLIR_ENABLE_CUDA_CONVERSIONS
std::optional<SmallVector<char, 0>>
NVVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
@@ -574,7 +575,7 @@ NVVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
module->emitError("Module must be a GPU module.");
return std::nullopt;
}
-#if MLIR_CUDA_CONVERSIONS_ENABLED == 1
+#if MLIR_ENABLE_CUDA_CONVERSIONS
NVPTXSerializer serializer(*module, cast<NVVMTargetAttr>(attribute), options);
serializer.init();
return serializer.run();
@@ -582,7 +583,7 @@ NVVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
module->emitError(
"The `NVPTX` target was not built. Please enable it when building LLVM.");
return std::nullopt;
-#endif // MLIR_CUDA_CONVERSIONS_ENABLED == 1
+#endif // MLIR_ENABLE_CUDA_CONVERSIONS
}
Attribute
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 6e53d801a0d2..fd1de274da60 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -396,9 +396,9 @@ collectReductionDecls(T loop,
/// Translates the blocks contained in the given region and appends them to at
/// the current insertion point of `builder`. The operations of the entry block
-/// are appended to the current insertion block, which is not expected to have a
-/// terminator. If set, `continuationBlockArgs` is populated with translated
-/// values that correspond to the values omp.yield'ed from the region.
+/// are appended to the current insertion block. If set, `continuationBlockArgs`
+/// is populated with translated values that correspond to the values
+/// omp.yield'ed from the region.
static LogicalResult inlineConvertOmpRegions(
Region &region, StringRef blockName, llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation,
@@ -409,7 +409,14 @@ static LogicalResult inlineConvertOmpRegions(
// Special case for single-block regions that don't create additional blocks:
// insert operations without creating additional blocks.
if (llvm::hasSingleElement(region)) {
+ llvm::Instruction *potentialTerminator =
+ builder.GetInsertBlock()->empty() ? nullptr
+ : &builder.GetInsertBlock()->back();
+
+ if (potentialTerminator && potentialTerminator->isTerminator())
+ potentialTerminator->removeFromParent();
moduleTranslation.mapBlock(&region.front(), builder.GetInsertBlock());
+
if (failed(moduleTranslation.convertBlock(
region.front(), /*ignoreArguments=*/true, builder)))
return failure();
@@ -423,6 +430,10 @@ static LogicalResult inlineConvertOmpRegions(
// Drop the mapping that is no longer necessary so that the same region can
// be processed multiple times.
moduleTranslation.forgetMapping(region);
+
+ if (potentialTerminator && potentialTerminator->isTerminator())
+ potentialTerminator->insertAfter(&builder.GetInsertBlock()->back());
+
return success();
}
@@ -656,8 +667,22 @@ convertOmpSingle(omp::SingleOp &singleOp, llvm::IRBuilderBase &builder,
moduleTranslation, bodyGenStatus);
};
auto finiCB = [&](InsertPointTy codeGenIP) {};
+
+ // Handle copyprivate
+ Operation::operand_range cpVars = singleOp.getCopyprivateVars();
+ std::optional<ArrayAttr> cpFuncs = singleOp.getCopyprivateFuncs();
+ llvm::SmallVector<llvm::Value *> llvmCPVars;
+ llvm::SmallVector<llvm::Function *> llvmCPFuncs;
+ for (size_t i = 0, e = cpVars.size(); i < e; ++i) {
+ llvmCPVars.push_back(moduleTranslation.lookupValue(cpVars[i]));
+ auto llvmFuncOp = SymbolTable::lookupNearestSymbolFrom<LLVM::LLVMFuncOp>(
+ singleOp, cast<SymbolRefAttr>((*cpFuncs)[i]));
+ llvmCPFuncs.push_back(
+ moduleTranslation.lookupFunction(llvmFuncOp.getName()));
+ }
+
builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createSingle(
- ompLoc, bodyCB, finiCB, singleOp.getNowait(), /*DidIt=*/nullptr));
+ ompLoc, bodyCB, finiCB, singleOp.getNowait(), llvmCPVars, llvmCPFuncs));
return bodyGenStatus;
}
@@ -1000,11 +1025,50 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
return success();
}
+/// A RAII class that on construction replaces the region arguments of the
+/// parallel op (which correspond to private variables) with the actual private
+/// variables they correspond to. This prepares the parallel op so that it
+/// matches what is expected by the OMPIRBuilder.
+///
+/// On destruction, it restores the original state of the operation so that on
+/// the MLIR side, the op is not affected by conversion to LLVM IR.
+class OmpParallelOpConversionManager {
+public:
+ OmpParallelOpConversionManager(omp::ParallelOp opInst)
+ : region(opInst.getRegion()), privateVars(opInst.getPrivateVars()),
+ privateArgBeginIdx(opInst.getNumReductionVars()),
+ privateArgEndIdx(privateArgBeginIdx + privateVars.size()) {
+ auto privateVarsIt = privateVars.begin();
+
+ for (size_t argIdx = privateArgBeginIdx; argIdx < privateArgEndIdx;
+ ++argIdx, ++privateVarsIt)
+ mlir::replaceAllUsesInRegionWith(region.getArgument(argIdx),
+ *privateVarsIt, region);
+ }
+
+ ~OmpParallelOpConversionManager() {
+ auto privateVarsIt = privateVars.begin();
+
+ for (size_t argIdx = privateArgBeginIdx; argIdx < privateArgEndIdx;
+ ++argIdx, ++privateVarsIt)
+ mlir::replaceAllUsesInRegionWith(*privateVarsIt,
+ region.getArgument(argIdx), region);
+ }
+
+private:
+ Region &region;
+ OperandRange privateVars;
+ unsigned privateArgBeginIdx;
+ unsigned privateArgEndIdx;
+};
+
/// Converts the OpenMP parallel operation to LLVM IR.
static LogicalResult
convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation) {
using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
+ OmpParallelOpConversionManager raii(opInst);
+
// TODO: support error propagation in OpenMPIRBuilder and use it instead of
// relying on captured variables.
LogicalResult bodyGenStatus = success();
@@ -1086,12 +1150,81 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
// TODO: Perform appropriate actions according to the data-sharing
// attribute (shared, private, firstprivate, ...) of variables.
- // Currently defaults to shared.
+ // Currently shared and private are supported.
auto privCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP,
llvm::Value &, llvm::Value &vPtr,
llvm::Value *&replacementValue) -> InsertPointTy {
replacementValue = &vPtr;
+ // If this is a private value, this lambda will return the corresponding
+ // mlir value and its `PrivateClauseOp`. Otherwise, empty values are
+ // returned.
+ auto [privVar, privatizerClone] =
+ [&]() -> std::pair<mlir::Value, omp::PrivateClauseOp> {
+ if (!opInst.getPrivateVars().empty()) {
+ auto privVars = opInst.getPrivateVars();
+ auto privatizers = opInst.getPrivatizers();
+
+ for (auto [privVar, privatizerAttr] :
+ llvm::zip_equal(privVars, *privatizers)) {
+ // Find the MLIR private variable corresponding to the LLVM value
+ // being privatized.
+ llvm::Value *llvmPrivVar = moduleTranslation.lookupValue(privVar);
+ if (llvmPrivVar != &vPtr)
+ continue;
+
+ SymbolRefAttr privSym = llvm::cast<SymbolRefAttr>(privatizerAttr);
+ omp::PrivateClauseOp privatizer =
+ SymbolTable::lookupNearestSymbolFrom<omp::PrivateClauseOp>(
+ opInst, privSym);
+
+ // Clone the privatizer in case it is used by more than one parallel
+ // region. The privatizer is processed in-place (see below) before it
+ // gets inlined in the parallel region and therefore processing the
+ // original op is dangerous.
+ return {privVar, privatizer.clone()};
+ }
+ }
+
+ return {mlir::Value(), omp::PrivateClauseOp()};
+ }();
+
+ if (privVar) {
+ if (privatizerClone.getDataSharingType() ==
+ omp::DataSharingClauseType::FirstPrivate) {
+ privatizerClone.emitOpError(
+ "TODO: delayed privatization is not "
+ "supported for `firstprivate` clauses yet.");
+ bodyGenStatus = failure();
+ return codeGenIP;
+ }
+
+ Region &allocRegion = privatizerClone.getAllocRegion();
+
+ // Replace the privatizer block argument with mlir value being privatized.
+ // This way, the body of the privatizer will be changed from using the
+ // region/block argument to the value being privatized.
+ auto allocRegionArg = allocRegion.getArgument(0);
+ replaceAllUsesInRegionWith(allocRegionArg, privVar, allocRegion);
+
+ auto oldIP = builder.saveIP();
+ builder.restoreIP(allocaIP);
+
+ SmallVector<llvm::Value *, 1> yieldedValues;
+ if (failed(inlineConvertOmpRegions(allocRegion, "omp.privatizer", builder,
+ moduleTranslation, &yieldedValues))) {
+ opInst.emitError("failed to inline `alloc` region of an `omp.private` "
+ "op in the parallel region");
+ bodyGenStatus = failure();
+ } else {
+ assert(yieldedValues.size() == 1);
+ replacementValue = yieldedValues.front();
+ }
+
+ privatizerClone.erase();
+ builder.restoreIP(oldIP);
+ }
+
return codeGenIP;
};
@@ -1635,7 +1768,7 @@ getRefPtrIfDeclareTarget(mlir::Value value,
// A small helper structure to contain data gathered
// for map lowering and coalese it into one area and
// avoiding extra computations such as searches in the
-// llvm module for lowered mapped varibles or checking
+// llvm module for lowered mapped variables or checking
// if something is declare target (and retrieving the
// value) more than neccessary.
struct MapInfoData : llvm::OpenMPIRBuilder::MapInfosTy {
@@ -2854,26 +2987,26 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::amendOperation(
moduleTranslation);
return failure();
})
- .Case(
- "omp.requires",
- [&](Attribute attr) {
- if (auto requiresAttr = attr.dyn_cast<omp::ClauseRequiresAttr>()) {
- using Requires = omp::ClauseRequires;
- Requires flags = requiresAttr.getValue();
- llvm::OpenMPIRBuilderConfig &config =
- moduleTranslation.getOpenMPBuilder()->Config;
- config.setHasRequiresReverseOffload(
- bitEnumContainsAll(flags, Requires::reverse_offload));
- config.setHasRequiresUnifiedAddress(
- bitEnumContainsAll(flags, Requires::unified_address));
- config.setHasRequiresUnifiedSharedMemory(
- bitEnumContainsAll(flags, Requires::unified_shared_memory));
- config.setHasRequiresDynamicAllocators(
- bitEnumContainsAll(flags, Requires::dynamic_allocators));
- return success();
- }
- return failure();
- })
+ .Case("omp.requires",
+ [&](Attribute attr) {
+ if (auto requiresAttr =
+ attr.dyn_cast<omp::ClauseRequiresAttr>()) {
+ using Requires = omp::ClauseRequires;
+ Requires flags = requiresAttr.getValue();
+ llvm::OpenMPIRBuilderConfig &config =
+ moduleTranslation.getOpenMPBuilder()->Config;
+ config.setHasRequiresReverseOffload(
+ bitEnumContainsAll(flags, Requires::reverse_offload));
+ config.setHasRequiresUnifiedAddress(
+ bitEnumContainsAll(flags, Requires::unified_address));
+ config.setHasRequiresUnifiedSharedMemory(
+ bitEnumContainsAll(flags, Requires::unified_shared_memory));
+ config.setHasRequiresDynamicAllocators(
+ bitEnumContainsAll(flags, Requires::dynamic_allocators));
+ return success();
+ }
+ return failure();
+ })
.Default([](Attribute) {
// Fall through for omp attributes that do not require lowering.
return success();
@@ -2988,12 +3121,13 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::convertOperation(
.Case([&](omp::TargetOp) {
return convertOmpTarget(*op, builder, moduleTranslation);
})
- .Case<omp::MapInfoOp, omp::DataBoundsOp>([&](auto op) {
- // No-op, should be handled by relevant owning operations e.g.
- // TargetOp, EnterDataOp, ExitDataOp, DataOp etc. and then
- // discarded
- return success();
- })
+ .Case<omp::MapInfoOp, omp::DataBoundsOp, omp::PrivateClauseOp>(
+ [&](auto op) {
+ // No-op, should be handled by relevant owning operations e.g.
+ // TargetOp, EnterDataOp, ExitDataOp, DataOp etc. and then
+ // discarded
+ return success();
+ })
.Default([&](Operation *inst) {
return inst->emitError("unsupported OpenMP operation: ")
<< inst->getName();
diff --git a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
index 93eb456cdc2c..94423b35d1ff 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
@@ -88,7 +88,9 @@ public:
if (dialect->getKernelAttrHelper().getName() == attribute.getName()) {
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
if (!func)
- return failure();
+ return op->emitOpError(Twine(attribute.getName()) +
+ " is only supported on `llvm.func` operations");
+
// For GPU kernels,
// 1. Insert AMDGPU_KERNEL calling convention.
@@ -100,6 +102,13 @@ public:
if (!llvmFunc->hasFnAttribute("amdgpu-flat-work-group-size")) {
llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1,256");
}
+
+ // MLIR's GPU kernel APIs all assume and produce uniformly-sized
+ // workgroups, so the lowering of the `rocdl.kernel` marker encodes this
+ // assumption. This assumption may be overridden by setting
+ // `rocdl.uniform_work_group_size` on a given function.
+ if (!llvmFunc->hasFnAttribute("uniform-work-group-size"))
+ llvmFunc->addFnAttr("uniform-work-group-size", "true");
}
// Override flat-work-group-size
// TODO: update clients to rocdl.flat_work_group_size instead,
@@ -108,10 +117,12 @@ public:
attribute.getName()) {
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
if (!func)
- return failure();
+ return op->emitOpError(Twine(attribute.getName()) +
+ " is only supported on `llvm.func` operations");
auto value = dyn_cast<IntegerAttr>(attribute.getValue());
if (!value)
- return failure();
+ return op->emitOpError(Twine(attribute.getName()) +
+ " must be an integer");
llvm::Function *llvmFunc =
moduleTranslation.lookupFunction(func.getName());
@@ -124,10 +135,12 @@ public:
attribute.getName()) {
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
if (!func)
- return failure();
+ return op->emitOpError(Twine(attribute.getName()) +
+ " is only supported on `llvm.func` operations");
auto value = dyn_cast<StringAttr>(attribute.getValue());
if (!value)
- return failure();
+ return op->emitOpError(Twine(attribute.getName()) +
+ " must be a string");
llvm::Function *llvmFunc =
moduleTranslation.lookupFunction(func.getName());
@@ -135,16 +148,32 @@ public:
llvmAttrValue.append(value.getValue());
llvmFunc->addFnAttr("amdgpu-flat-work-group-size", llvmAttrValue);
}
-
+ if (ROCDL::ROCDLDialect::getUniformWorkGroupSizeAttrName() ==
+ attribute.getName()) {
+ auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
+ if (!func)
+ return op->emitOpError(Twine(attribute.getName()) +
+ " is only supported on `llvm.func` operations");
+ auto value = dyn_cast<BoolAttr>(attribute.getValue());
+ if (!value)
+ return op->emitOpError(Twine(attribute.getName()) +
+ " must be a boolean");
+ llvm::Function *llvmFunc =
+ moduleTranslation.lookupFunction(func.getName());
+ llvmFunc->addFnAttr("uniform-work-group-size",
+ value.getValue() ? "true" : "false");
+ }
// Set reqd_work_group_size metadata
if (dialect->getReqdWorkGroupSizeAttrHelper().getName() ==
attribute.getName()) {
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
if (!func)
- return failure();
+ return op->emitOpError(Twine(attribute.getName()) +
+ " is only supported on `llvm.func` operations");
auto value = dyn_cast<DenseI32ArrayAttr>(attribute.getValue());
if (!value)
- return failure();
+ return op->emitOpError(Twine(attribute.getName()) +
+ " must be a dense i32 array attribute");
llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext();
SmallVector<llvm::Metadata *, 3> metadata;
llvm::Type *i32 = llvm::IntegerType::get(llvmContext, 32);
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index ee8fffd959c8..c00628a420a0 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -51,11 +51,15 @@
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <optional>
+#define DEBUG_TYPE "llvm-dialect-to-llvm-ir"
+
using namespace mlir;
using namespace mlir::LLVM;
using namespace mlir::LLVM::detail;
@@ -1042,17 +1046,80 @@ LogicalResult ModuleTranslation::convertGlobals() {
for (auto op : getModuleBody(mlirModule).getOps<LLVM::GlobalOp>()) {
if (Block *initializer = op.getInitializerBlock()) {
llvm::IRBuilder<> builder(llvmModule->getContext());
+
+ [[maybe_unused]] int numConstantsHit = 0;
+ [[maybe_unused]] int numConstantsErased = 0;
+ DenseMap<llvm::ConstantAggregate *, int> constantAggregateUseMap;
+
for (auto &op : initializer->without_terminator()) {
- if (failed(convertOperation(op, builder)) ||
- !isa<llvm::Constant>(lookupValue(op.getResult(0))))
+ if (failed(convertOperation(op, builder)))
+ return emitError(op.getLoc(), "fail to convert global initializer");
+ auto *cst = dyn_cast<llvm::Constant>(lookupValue(op.getResult(0)));
+ if (!cst)
return emitError(op.getLoc(), "unemittable constant value");
+
+ // When emitting an LLVM constant, a new constant is created and the old
+ // constant may become dangling and take space. We should remove the
+ // dangling constants to avoid memory explosion especially for constant
+ // arrays whose number of elements is large.
+ // Because multiple operations may refer to the same constant, we need
+ // to count the number of uses of each constant array and remove it only
+ // when the count becomes zero.
+ if (auto *agg = dyn_cast<llvm::ConstantAggregate>(cst)) {
+ numConstantsHit++;
+ Value result = op.getResult(0);
+ int numUsers = std::distance(result.use_begin(), result.use_end());
+ auto [iterator, inserted] =
+ constantAggregateUseMap.try_emplace(agg, numUsers);
+ if (!inserted) {
+ // Key already exists, update the value
+ iterator->second += numUsers;
+ }
+ }
+ // Scan the operands of the operation to decrement the use count of
+ // constants. Erase the constant if the use count becomes zero.
+ for (Value v : op.getOperands()) {
+ auto cst = dyn_cast<llvm::ConstantAggregate>(lookupValue(v));
+ if (!cst)
+ continue;
+ auto iter = constantAggregateUseMap.find(cst);
+ assert(iter != constantAggregateUseMap.end() && "constant not found");
+ iter->second--;
+ if (iter->second == 0) {
+ // NOTE: cannot call removeDeadConstantUsers() here because it
+ // may remove the constant which has uses not be converted yet.
+ if (cst->user_empty()) {
+ cst->destroyConstant();
+ numConstantsErased++;
+ }
+ constantAggregateUseMap.erase(iter);
+ }
+ }
}
+
ReturnOp ret = cast<ReturnOp>(initializer->getTerminator());
llvm::Constant *cst =
cast<llvm::Constant>(lookupValue(ret.getOperand(0)));
auto *global = cast<llvm::GlobalVariable>(lookupGlobal(op));
if (!shouldDropGlobalInitializer(global->getLinkage(), cst))
global->setInitializer(cst);
+
+ // Try to remove the dangling constants again after all operations are
+ // converted.
+ for (auto it : constantAggregateUseMap) {
+ auto cst = it.first;
+ cst->removeDeadConstantUsers();
+ if (cst->user_empty()) {
+ cst->destroyConstant();
+ numConstantsErased++;
+ }
+ }
+
+ LLVM_DEBUG(llvm::dbgs()
+ << "Convert initializer for " << op.getName() << "\n";
+ llvm::dbgs() << numConstantsHit << " new constants hit\n";
+ llvm::dbgs()
+ << numConstantsErased << " dangling constants erased\n";);
}
}
@@ -1445,13 +1512,14 @@ ModuleTranslation::getOrCreateAliasScope(AliasScopeAttr aliasScopeAttr) {
if (!scopeInserted)
return scopeIt->second;
llvm::LLVMContext &ctx = llvmModule->getContext();
+ auto dummy = llvm::MDNode::getTemporary(ctx, std::nullopt);
// Convert the domain metadata node if necessary.
auto [domainIt, insertedDomain] = aliasDomainMetadataMapping.try_emplace(
aliasScopeAttr.getDomain(), nullptr);
if (insertedDomain) {
llvm::SmallVector<llvm::Metadata *, 2> operands;
// Placeholder for self-reference.
- operands.push_back({});
+ operands.push_back(dummy.get());
if (StringAttr description = aliasScopeAttr.getDomain().getDescription())
operands.push_back(llvm::MDString::get(ctx, description));
domainIt->second = llvm::MDNode::get(ctx, operands);
@@ -1462,7 +1530,7 @@ ModuleTranslation::getOrCreateAliasScope(AliasScopeAttr aliasScopeAttr) {
assert(domainIt->second && "Scope's domain should already be valid");
llvm::SmallVector<llvm::Metadata *, 3> operands;
// Placeholder for self-reference.
- operands.push_back({});
+ operands.push_back(dummy.get());
operands.push_back(domainIt->second);
if (StringAttr description = aliasScopeAttr.getDescription())
operands.push_back(llvm::MDString::get(ctx, description));
diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index 4165e0a52428..26899301eb74 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -153,9 +153,9 @@ namespace {
/// This is useful when saving and undoing a set of rewrites.
struct RewriterState {
RewriterState(unsigned numRewrites, unsigned numIgnoredOperations,
- unsigned numErased)
+ unsigned numErased, unsigned numReplacedOps)
: numRewrites(numRewrites), numIgnoredOperations(numIgnoredOperations),
- numErased(numErased) {}
+ numErased(numErased), numReplacedOps(numReplacedOps) {}
/// The current number of rewrites performed.
unsigned numRewrites;
@@ -165,6 +165,9 @@ struct RewriterState {
/// The current number of erased operations/blocks.
unsigned numErased;
+
+ /// The current number of replaced ops that are scheduled for erasure.
+ unsigned numReplacedOps;
};
//===----------------------------------------------------------------------===//
@@ -189,7 +192,6 @@ public:
EraseBlock,
InlineBlock,
MoveBlock,
- SplitBlock,
BlockTypeConversion,
ReplaceBlockArg,
// Operation rewrites
@@ -397,30 +399,6 @@ private:
Block *insertBeforeBlock;
};
-/// Splitting of a block. This rewrite is immediately reflected in the IR.
-class SplitBlockRewrite : public BlockRewrite {
-public:
- SplitBlockRewrite(ConversionPatternRewriterImpl &rewriterImpl, Block *block,
- Block *originalBlock)
- : BlockRewrite(Kind::SplitBlock, rewriterImpl, block),
- originalBlock(originalBlock) {}
-
- static bool classof(const IRRewrite *rewrite) {
- return rewrite->getKind() == Kind::SplitBlock;
- }
-
- void rollback() override {
- // Merge back the block that was split out.
- originalBlock->getOperations().splice(originalBlock->end(),
- block->getOperations());
- eraseBlock(block);
- }
-
-private:
- // The original block from which this block was split.
- Block *originalBlock;
-};
-
/// This structure contains the information pertaining to an argument that has
/// been converted.
struct ConvertedArgInfo {
@@ -795,13 +773,12 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
PatternRewriter &rewriter, ValueRange values,
SmallVectorImpl<Value> &remapped);
- /// Returns true if the given operation is ignored, and does not need to be
+ /// Return "true" if the given operation is ignored, and does not need to be
/// converted.
bool isOpIgnored(Operation *op) const;
- /// Recursively marks the nested operations under 'op' as ignored. This
- /// removes them from being considered for legalization.
- void markNestedOpsIgnored(Operation *op);
+ /// Return "true" if the given operation was replaced or erased.
+ bool wasOpReplaced(Operation *op) const;
//===--------------------------------------------------------------------===//
// Type Conversion
@@ -881,9 +858,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
void notifyBlockInserted(Block *block, Region *previous,
Region::iterator previousIt) override;
- /// Notifies that a block was split.
- void notifySplitBlock(Block *block, Block *continuation);
-
/// Notifies that a block is being inlined into another block.
void notifyBlockBeingInlined(Block *block, Block *srcBlock,
Block::iterator before);
@@ -943,17 +917,17 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
/// Ordered list of block operations (creations, splits, motions).
SmallVector<std::unique_ptr<IRRewrite>> rewrites;
- /// A set of operations that should no longer be considered for legalization,
- /// but were not directly replace/erased/etc. by a pattern. These are
- /// generally child operations of other operations who were
- /// replaced/erased/etc. This is not meant to be an exhaustive list of all
- /// operations, but the minimal set that can be used to detect if a given
- /// operation should be `ignored`. For example, we may add the operations that
- /// define non-empty regions to the set, but not any of the others. This
- /// simplifies the amount of memory needed as we can query if the parent
- /// operation was ignored.
+ /// A set of operations that should no longer be considered for legalization.
+ /// E.g., ops that are recursively legal. Ops that were replaced/erased are
+ /// tracked separately.
SetVector<Operation *> ignoredOps;
+ /// A set of operations that were replaced/erased. Such ops are not erased
+ /// immediately but only when the dialect conversion succeeds. In the mean
+ /// time, they should no longer be considered for legalization and any attempt
+ /// to modify/access them is invalid rewriter API usage.
+ SetVector<Operation *> replacedOps;
+
/// The current type converter, or nullptr if no type converter is currently
/// active.
const TypeConverter *currentTypeConverter = nullptr;
@@ -1152,7 +1126,7 @@ void ConversionPatternRewriterImpl::applyRewrites() {
RewriterState ConversionPatternRewriterImpl::getCurrentState() {
return RewriterState(rewrites.size(), ignoredOps.size(),
- eraseRewriter.erased.size());
+ eraseRewriter.erased.size(), replacedOps.size());
}
void ConversionPatternRewriterImpl::resetState(RewriterState state) {
@@ -1165,6 +1139,9 @@ void ConversionPatternRewriterImpl::resetState(RewriterState state) {
while (eraseRewriter.erased.size() != state.numErased)
eraseRewriter.erased.pop_back();
+
+ while (replacedOps.size() != state.numReplacedOps)
+ replacedOps.pop_back();
}
void ConversionPatternRewriterImpl::undoRewrites(unsigned numRewritesToKeep) {
@@ -1229,21 +1206,13 @@ LogicalResult ConversionPatternRewriterImpl::remapValues(
}
bool ConversionPatternRewriterImpl::isOpIgnored(Operation *op) const {
- // Check to see if this operation or the parent operation is ignored.
- return ignoredOps.count(op->getParentOp()) || ignoredOps.count(op);
+ // Check to see if this operation is ignored or was replaced.
+ return replacedOps.count(op) || ignoredOps.count(op);
}
-void ConversionPatternRewriterImpl::markNestedOpsIgnored(Operation *op) {
- // Walk this operation and collect nested operations that define non-empty
- // regions. We mark such operations as 'ignored' so that we know we don't have
- // to convert them, or their nested ops.
- if (op->getNumRegions() == 0)
- return;
- op->walk([&](Operation *op) {
- if (llvm::any_of(op->getRegions(),
- [](Region &region) { return !region.empty(); }))
- ignoredOps.insert(op);
- });
+bool ConversionPatternRewriterImpl::wasOpReplaced(Operation *op) const {
+ // Check to see if this operation was replaced.
+ return replacedOps.count(op);
}
//===----------------------------------------------------------------------===//
@@ -1465,6 +1434,9 @@ void ConversionPatternRewriterImpl::notifyOperationInserted(
logger.startLine() << "** Insert : '" << op->getName() << "'(" << op
<< ")\n";
});
+ assert(!wasOpReplaced(op->getParentOp()) &&
+ "attempting to insert into a block within a replaced/erased op");
+
if (!previous.isSet()) {
// This is a newly created op.
appendRewrite<CreateOperationRewrite>(op);
@@ -1498,10 +1470,8 @@ void ConversionPatternRewriterImpl::notifyOpReplaced(Operation *op,
appendRewrite<ReplaceOperationRewrite>(op, currentTypeConverter,
resultChanged);
- // Mark this operation as recursively ignored so that we don't need to
- // convert any nested operations.
- ignoredOps.insert(op);
- markNestedOpsIgnored(op);
+ // Mark this operation and all nested ops as replaced.
+ op->walk([&](Operation *op) { replacedOps.insert(op); });
}
void ConversionPatternRewriterImpl::notifyBlockIsBeingErased(Block *block) {
@@ -1512,6 +1482,9 @@ void ConversionPatternRewriterImpl::notifyBlockIsBeingErased(Block *block) {
void ConversionPatternRewriterImpl::notifyBlockInserted(
Block *block, Region *previous, Region::iterator previousIt) {
+ assert(!wasOpReplaced(block->getParentOp()) &&
+ "attempting to insert into a region within a replaced/erased op");
+
if (!previous) {
// This is a newly created block.
appendRewrite<CreateBlockRewrite>(block);
@@ -1521,11 +1494,6 @@ void ConversionPatternRewriterImpl::notifyBlockInserted(
appendRewrite<MoveBlockRewrite>(block, previous, prevBlock);
}
-void ConversionPatternRewriterImpl::notifySplitBlock(Block *block,
- Block *continuation) {
- appendRewrite<SplitBlockRewrite>(continuation, block);
-}
-
void ConversionPatternRewriterImpl::notifyBlockBeingInlined(
Block *block, Block *srcBlock, Block::iterator before) {
appendRewrite<InlineBlockRewrite>(block, srcBlock, before);
@@ -1593,6 +1561,9 @@ void ConversionPatternRewriter::eraseOp(Operation *op) {
}
void ConversionPatternRewriter::eraseBlock(Block *block) {
+ assert(!impl->wasOpReplaced(block->getParentOp()) &&
+ "attempting to erase a block within a replaced/erased op");
+
// Mark all ops for erasure.
for (Operation &op : *block)
eraseOp(&op);
@@ -1608,18 +1579,27 @@ void ConversionPatternRewriter::eraseBlock(Block *block) {
Block *ConversionPatternRewriter::applySignatureConversion(
Region *region, TypeConverter::SignatureConversion &conversion,
const TypeConverter *converter) {
+ assert(!impl->wasOpReplaced(region->getParentOp()) &&
+ "attempting to apply a signature conversion to a block within a "
+ "replaced/erased op");
return impl->applySignatureConversion(region, conversion, converter);
}
FailureOr<Block *> ConversionPatternRewriter::convertRegionTypes(
Region *region, const TypeConverter &converter,
TypeConverter::SignatureConversion *entryConversion) {
+ assert(!impl->wasOpReplaced(region->getParentOp()) &&
+ "attempting to apply a signature conversion to a block within a "
+ "replaced/erased op");
return impl->convertRegionTypes(region, converter, entryConversion);
}
LogicalResult ConversionPatternRewriter::convertNonEntryRegionTypes(
Region *region, const TypeConverter &converter,
ArrayRef<TypeConverter::SignatureConversion> blockConversions) {
+ assert(!impl->wasOpReplaced(region->getParentOp()) &&
+ "attempting to apply a signature conversion to a block within a "
+ "replaced/erased op");
return impl->convertNonEntryRegionTypes(region, converter, blockConversions);
}
@@ -1652,25 +1632,22 @@ ConversionPatternRewriter::getRemappedValues(ValueRange keys,
results);
}
-Block *ConversionPatternRewriter::splitBlock(Block *block,
- Block::iterator before) {
- auto *continuation = block->splitBlock(before);
- impl->notifySplitBlock(block, continuation);
- return continuation;
-}
-
void ConversionPatternRewriter::inlineBlockBefore(Block *source, Block *dest,
Block::iterator before,
ValueRange argValues) {
+#ifndef NDEBUG
assert(argValues.size() == source->getNumArguments() &&
"incorrect # of argument replacement values");
-#ifndef NDEBUG
+ assert(!impl->wasOpReplaced(source->getParentOp()) &&
+ "attempting to inline a block from a replaced/erased op");
+ assert(!impl->wasOpReplaced(dest->getParentOp()) &&
+ "attempting to inline a block into a replaced/erased op");
auto opIgnored = [&](Operation *op) { return impl->isOpIgnored(op); };
-#endif // NDEBUG
// The source block will be deleted, so it should not have any users (i.e.,
// there should be no predecessors).
assert(llvm::all_of(source->getUsers(), opIgnored) &&
"expected 'source' to have no predecessors");
+#endif // NDEBUG
impl->notifyBlockBeingInlined(dest, source, before);
for (auto it : llvm::zip(source->getArguments(), argValues))
@@ -1680,6 +1657,8 @@ void ConversionPatternRewriter::inlineBlockBefore(Block *source, Block *dest,
}
void ConversionPatternRewriter::startOpModification(Operation *op) {
+ assert(!impl->wasOpReplaced(op) &&
+ "attempting to modify a replaced/erased op");
#ifndef NDEBUG
impl->pendingRootUpdates.insert(op);
#endif
@@ -1687,6 +1666,8 @@ void ConversionPatternRewriter::startOpModification(Operation *op) {
}
void ConversionPatternRewriter::finalizeOpModification(Operation *op) {
+ assert(!impl->wasOpReplaced(op) &&
+ "attempting to modify a replaced/erased op");
PatternRewriter::finalizeOpModification(op);
// There is nothing to do here, we only need to track the operation at the
// start of the update.
@@ -1901,8 +1882,13 @@ OperationLegalizer::legalize(Operation *op,
// If this operation is recursively legal, mark its children as ignored so
// that we don't consider them for legalization.
- if (legalityInfo->isRecursivelyLegal)
- rewriter.getImpl().markNestedOpsIgnored(op);
+ if (legalityInfo->isRecursivelyLegal) {
+ op->walk([&](Operation *nested) {
+ if (op != nested)
+ rewriter.getImpl().ignoredOps.insert(nested);
+ });
+ }
+
return success();
}
diff --git a/mlir/test/CAPI/sparse_tensor.c b/mlir/test/CAPI/sparse_tensor.c
index a8b9f9048d59..f241e0e5c2fb 100644
--- a/mlir/test/CAPI/sparse_tensor.c
+++ b/mlir/test/CAPI/sparse_tensor.c
@@ -39,8 +39,8 @@ static int testRoundtripEncoding(MlirContext ctx) {
// CHECK: (d0, d1)[s0] -> (s0, d0, d1)
mlirAffineMapDump(dimToLvl);
// CHECK: level_type: 65536
- // CHECK: level_type: 131072
- // CHECK: level_type: 131072
+ // CHECK: level_type: 262144
+ // CHECK: level_type: 262144
MlirAffineMap lvlToDim =
mlirSparseTensorEncodingAttrGetLvlToDim(originalAttr);
int lvlRank = mlirSparseTensorEncodingGetLvlRank(originalAttr);
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
index 74921544c555..baf07ea1f010 100644
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -173,6 +173,7 @@ if(LLVM_BUILD_EXAMPLES)
transform-opt-ch3
transform-opt-ch4
mlir-minimal-opt
+ mlir-transform-opt
)
if(MLIR_ENABLE_EXECUTION_ENGINE)
list(APPEND MLIR_TEST_DEPENDS
diff --git a/mlir/test/Conversion/AffineToStandard/lower-affine.mlir b/mlir/test/Conversion/AffineToStandard/lower-affine.mlir
index 00d7b6b8d65f..23e0edd510cb 100644
--- a/mlir/test/Conversion/AffineToStandard/lower-affine.mlir
+++ b/mlir/test/Conversion/AffineToStandard/lower-affine.mlir
@@ -927,3 +927,57 @@ func.func @affine_parallel_with_reductions_i64(%arg0: memref<3x3xi64>, %arg1: me
// CHECK: scf.reduce.return %[[RES]] : i64
// CHECK: }
// CHECK: }
+
+///////////////////////////////////////////////////////////////////////
+
+func.func @test_dilinearize_index(%linear_index: index) -> (index, index, index) {
+ %b0 = arith.constant 16 : index
+ %b1 = arith.constant 224 : index
+ %b2 = arith.constant 224 : index
+ %1:3 = affine.delinearize_index %linear_index into (%b0, %b1, %b2) : index, index, index
+ return %1#0, %1#1, %1#2 : index, index, index
+}
+// CHECK-LABEL: func.func @test_dilinearize_index(
+// CHECK-SAME: %[[VAL_0:.*]]: index) -> (index, index, index) {
+// CHECK: %[[VAL_1:.*]] = arith.constant 16 : index
+// CHECK: %[[VAL_2:.*]] = arith.constant 224 : index
+// CHECK: %[[VAL_3:.*]] = arith.constant 224 : index
+// CHECK: %[[VAL_4:.*]] = arith.constant 50176 : index
+// CHECK: %[[VAL_5:.*]] = arith.constant 50176 : index
+// CHECK: %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_7:.*]] = arith.constant -1 : index
+// CHECK: %[[VAL_8:.*]] = arith.cmpi slt, %[[VAL_0]], %[[VAL_6]] : index
+// CHECK: %[[VAL_9:.*]] = arith.subi %[[VAL_7]], %[[VAL_0]] : index
+// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_8]], %[[VAL_9]], %[[VAL_0]] : index
+// CHECK: %[[VAL_11:.*]] = arith.divsi %[[VAL_10]], %[[VAL_5]] : index
+// CHECK: %[[VAL_12:.*]] = arith.subi %[[VAL_7]], %[[VAL_11]] : index
+// CHECK: %[[VAL_13:.*]] = arith.select %[[VAL_8]], %[[VAL_12]], %[[VAL_11]] : index
+// CHECK: %[[VAL_14:.*]] = arith.constant 50176 : index
+// CHECK: %[[VAL_15:.*]] = arith.remsi %[[VAL_0]], %[[VAL_14]] : index
+// CHECK: %[[VAL_16:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_17:.*]] = arith.cmpi slt, %[[VAL_15]], %[[VAL_16]] : index
+// CHECK: %[[VAL_18:.*]] = arith.addi %[[VAL_15]], %[[VAL_14]] : index
+// CHECK: %[[VAL_19:.*]] = arith.select %[[VAL_17]], %[[VAL_18]], %[[VAL_15]] : index
+// CHECK: %[[VAL_20:.*]] = arith.constant 50176 : index
+// CHECK: %[[VAL_21:.*]] = arith.remsi %[[VAL_0]], %[[VAL_20]] : index
+// CHECK: %[[VAL_22:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_23:.*]] = arith.cmpi slt, %[[VAL_21]], %[[VAL_22]] : index
+// CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_21]], %[[VAL_20]] : index
+// CHECK: %[[VAL_25:.*]] = arith.select %[[VAL_23]], %[[VAL_24]], %[[VAL_21]] : index
+// CHECK: %[[VAL_26:.*]] = arith.constant 224 : index
+// CHECK: %[[VAL_27:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_28:.*]] = arith.constant -1 : index
+// CHECK: %[[VAL_29:.*]] = arith.cmpi slt, %[[VAL_25]], %[[VAL_27]] : index
+// CHECK: %[[VAL_30:.*]] = arith.subi %[[VAL_28]], %[[VAL_25]] : index
+// CHECK: %[[VAL_31:.*]] = arith.select %[[VAL_29]], %[[VAL_30]], %[[VAL_25]] : index
+// CHECK: %[[VAL_32:.*]] = arith.divsi %[[VAL_31]], %[[VAL_26]] : index
+// CHECK: %[[VAL_33:.*]] = arith.subi %[[VAL_28]], %[[VAL_32]] : index
+// CHECK: %[[VAL_34:.*]] = arith.select %[[VAL_29]], %[[VAL_33]], %[[VAL_32]] : index
+// CHECK: %[[VAL_35:.*]] = arith.constant 224 : index
+// CHECK: %[[VAL_36:.*]] = arith.remsi %[[VAL_0]], %[[VAL_35]] : index
+// CHECK: %[[VAL_37:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_38:.*]] = arith.cmpi slt, %[[VAL_36]], %[[VAL_37]] : index
+// CHECK: %[[VAL_39:.*]] = arith.addi %[[VAL_36]], %[[VAL_35]] : index
+// CHECK: %[[VAL_40:.*]] = arith.select %[[VAL_38]], %[[VAL_39]], %[[VAL_36]] : index
+// CHECK: return %[[VAL_13]], %[[VAL_34]], %[[VAL_40]] : index, index, index
+// CHECK: }
diff --git a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir
index 1fe843b1447a..39af7dd02a62 100644
--- a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir
+++ b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir
@@ -1045,4 +1045,115 @@ func.func @complex_mul_with_fmf(%lhs: complex<f32>, %rhs: complex<f32>) -> compl
// CHECK: %[[FINAL_IMAG:.*]] = arith.select %[[RECALC3]], %[[NEW_IMAG_TIMES_INF]], %[[IMAG]] : f32
// CHECK: %[[RESULT:.*]] = complex.create %[[FINAL_REAL]], %[[FINAL_IMAG]] : complex<f32>
-// CHECK: return %[[RESULT]] : complex<f32> \ No newline at end of file
+// CHECK: return %[[RESULT]] : complex<f32>
+
+// -----
+
+// CHECK-LABEL: func @complex_div_with_fmf
+// CHECK-SAME: (%[[LHS:.*]]: complex<f32>, %[[RHS:.*]]: complex<f32>)
+func.func @complex_div_with_fmf(%lhs: complex<f32>, %rhs: complex<f32>) -> complex<f32> {
+ %div = complex.div %lhs, %rhs fastmath<nnan,contract> : complex<f32>
+ return %div : complex<f32>
+}
+// CHECK: %[[LHS_REAL:.*]] = complex.re %[[LHS]] : complex<f32>
+// CHECK: %[[LHS_IMAG:.*]] = complex.im %[[LHS]] : complex<f32>
+// CHECK: %[[RHS_REAL:.*]] = complex.re %[[RHS]] : complex<f32>
+// CHECK: %[[RHS_IMAG:.*]] = complex.im %[[RHS]] : complex<f32>
+
+// CHECK: %[[RHS_REAL_IMAG_RATIO:.*]] = arith.divf %[[RHS_REAL]], %[[RHS_IMAG]] fastmath<nnan,contract> : f32
+// CHECK: %[[RHS_REAL_TIMES_RHS_REAL_IMAG_RATIO:.*]] = arith.mulf %[[RHS_REAL_IMAG_RATIO]], %[[RHS_REAL]] fastmath<nnan,contract> : f32
+// CHECK: %[[RHS_REAL_IMAG_DENOM:.*]] = arith.addf %[[RHS_IMAG]], %[[RHS_REAL_TIMES_RHS_REAL_IMAG_RATIO]] fastmath<nnan,contract> : f32
+// CHECK: %[[LHS_REAL_TIMES_RHS_REAL_IMAG_RATIO:.*]] = arith.mulf %[[LHS_REAL]], %[[RHS_REAL_IMAG_RATIO]] fastmath<nnan,contract> : f32
+// CHECK: %[[REAL_NUMERATOR_1:.*]] = arith.addf %[[LHS_REAL_TIMES_RHS_REAL_IMAG_RATIO]], %[[LHS_IMAG]] fastmath<nnan,contract> : f32
+// CHECK: %[[RESULT_REAL_1:.*]] = arith.divf %[[REAL_NUMERATOR_1]], %[[RHS_REAL_IMAG_DENOM]] fastmath<nnan,contract> : f32
+// CHECK: %[[LHS_IMAG_TIMES_RHS_REAL_IMAG_RATIO:.*]] = arith.mulf %[[LHS_IMAG]], %[[RHS_REAL_IMAG_RATIO]] fastmath<nnan,contract> : f32
+// CHECK: %[[IMAG_NUMERATOR_1:.*]] = arith.subf %[[LHS_IMAG_TIMES_RHS_REAL_IMAG_RATIO]], %[[LHS_REAL]] fastmath<nnan,contract> : f32
+// CHECK: %[[RESULT_IMAG_1:.*]] = arith.divf %[[IMAG_NUMERATOR_1]], %[[RHS_REAL_IMAG_DENOM]] fastmath<nnan,contract> : f32
+
+// CHECK: %[[RHS_IMAG_REAL_RATIO:.*]] = arith.divf %[[RHS_IMAG]], %[[RHS_REAL]] fastmath<nnan,contract> : f32
+// CHECK: %[[RHS_IMAG_TIMES_RHS_IMAG_REAL_RATIO:.*]] = arith.mulf %[[RHS_IMAG_REAL_RATIO]], %[[RHS_IMAG]] fastmath<nnan,contract> : f32
+// CHECK: %[[RHS_IMAG_REAL_DENOM:.*]] = arith.addf %[[RHS_REAL]], %[[RHS_IMAG_TIMES_RHS_IMAG_REAL_RATIO]] fastmath<nnan,contract> : f32
+// CHECK: %[[LHS_IMAG_TIMES_RHS_IMAG_REAL_RATIO:.*]] = arith.mulf %[[LHS_IMAG]], %[[RHS_IMAG_REAL_RATIO]] fastmath<nnan,contract> : f32
+// CHECK: %[[REAL_NUMERATOR_2:.*]] = arith.addf %[[LHS_REAL]], %[[LHS_IMAG_TIMES_RHS_IMAG_REAL_RATIO]] fastmath<nnan,contract> : f32
+// CHECK: %[[RESULT_REAL_2:.*]] = arith.divf %[[REAL_NUMERATOR_2]], %[[RHS_IMAG_REAL_DENOM]] fastmath<nnan,contract> : f32
+// CHECK: %[[LHS_REAL_TIMES_RHS_IMAG_REAL_RATIO:.*]] = arith.mulf %[[LHS_REAL]], %[[RHS_IMAG_REAL_RATIO]] fastmath<nnan,contract> : f32
+// CHECK: %[[IMAG_NUMERATOR_2:.*]] = arith.subf %[[LHS_IMAG]], %[[LHS_REAL_TIMES_RHS_IMAG_REAL_RATIO]] fastmath<nnan,contract> : f32
+// CHECK: %[[RESULT_IMAG_2:.*]] = arith.divf %[[IMAG_NUMERATOR_2]], %[[RHS_IMAG_REAL_DENOM]] fastmath<nnan,contract> : f32
+
+// Case 1. Zero denominator, numerator contains at most one NaN value.
+// CHECK: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[RHS_REAL_ABS:.*]] = math.absf %[[RHS_REAL]] fastmath<nnan,contract> : f32
+// CHECK: %[[RHS_REAL_ABS_IS_ZERO:.*]] = arith.cmpf oeq, %[[RHS_REAL_ABS]], %[[ZERO]] : f32
+// CHECK: %[[RHS_IMAG_ABS:.*]] = math.absf %[[RHS_IMAG]] fastmath<nnan,contract> : f32
+// CHECK: %[[RHS_IMAG_ABS_IS_ZERO:.*]] = arith.cmpf oeq, %[[RHS_IMAG_ABS]], %[[ZERO]] : f32
+// CHECK: %[[LHS_REAL_IS_NOT_NAN:.*]] = arith.cmpf ord, %[[LHS_REAL]], %[[ZERO]] : f32
+// CHECK: %[[LHS_IMAG_IS_NOT_NAN:.*]] = arith.cmpf ord, %[[LHS_IMAG]], %[[ZERO]] : f32
+// CHECK: %[[LHS_CONTAINS_NOT_NAN_VALUE:.*]] = arith.ori %[[LHS_REAL_IS_NOT_NAN]], %[[LHS_IMAG_IS_NOT_NAN]] : i1
+// CHECK: %[[RHS_IS_ZERO:.*]] = arith.andi %[[RHS_REAL_ABS_IS_ZERO]], %[[RHS_IMAG_ABS_IS_ZERO]] : i1
+// CHECK: %[[RESULT_IS_INFINITY:.*]] = arith.andi %[[LHS_CONTAINS_NOT_NAN_VALUE]], %[[RHS_IS_ZERO]] : i1
+// CHECK: %[[INF:.*]] = arith.constant 0x7F800000 : f32
+// CHECK: %[[INF_WITH_SIGN_OF_RHS_REAL:.*]] = math.copysign %[[INF]], %[[RHS_REAL]] : f32
+// CHECK: %[[INFINITY_RESULT_REAL:.*]] = arith.mulf %[[INF_WITH_SIGN_OF_RHS_REAL]], %[[LHS_REAL]] fastmath<nnan,contract> : f32
+// CHECK: %[[INFINITY_RESULT_IMAG:.*]] = arith.mulf %[[INF_WITH_SIGN_OF_RHS_REAL]], %[[LHS_IMAG]] fastmath<nnan,contract> : f32
+
+// Case 2. Infinite numerator, finite denominator.
+// CHECK: %[[RHS_REAL_FINITE:.*]] = arith.cmpf one, %[[RHS_REAL_ABS]], %[[INF]] : f32
+// CHECK: %[[RHS_IMAG_FINITE:.*]] = arith.cmpf one, %[[RHS_IMAG_ABS]], %[[INF]] : f32
+// CHECK: %[[RHS_IS_FINITE:.*]] = arith.andi %[[RHS_REAL_FINITE]], %[[RHS_IMAG_FINITE]] : i1
+// CHECK: %[[LHS_REAL_ABS:.*]] = math.absf %[[LHS_REAL]] fastmath<nnan,contract> : f32
+// CHECK: %[[LHS_REAL_INFINITE:.*]] = arith.cmpf oeq, %[[LHS_REAL_ABS]], %[[INF]] : f32
+// CHECK: %[[LHS_IMAG_ABS:.*]] = math.absf %[[LHS_IMAG]] fastmath<nnan,contract> : f32
+// CHECK: %[[LHS_IMAG_INFINITE:.*]] = arith.cmpf oeq, %[[LHS_IMAG_ABS]], %[[INF]] : f32
+// CHECK: %[[LHS_IS_INFINITE:.*]] = arith.ori %[[LHS_REAL_INFINITE]], %[[LHS_IMAG_INFINITE]] : i1
+// CHECK: %[[INF_NUM_FINITE_DENOM:.*]] = arith.andi %[[LHS_IS_INFINITE]], %[[RHS_IS_FINITE]] : i1
+// CHECK: %[[ONE:.*]] = arith.constant 1.000000e+00 : f32
+// CHECK: %[[LHS_REAL_IS_INF:.*]] = arith.select %[[LHS_REAL_INFINITE]], %[[ONE]], %[[ZERO]] : f32
+// CHECK: %[[LHS_REAL_IS_INF_WITH_SIGN:.*]] = math.copysign %[[LHS_REAL_IS_INF]], %[[LHS_REAL]] : f32
+// CHECK: %[[LHS_IMAG_IS_INF:.*]] = arith.select %[[LHS_IMAG_INFINITE]], %[[ONE]], %[[ZERO]] : f32
+// CHECK: %[[LHS_IMAG_IS_INF_WITH_SIGN:.*]] = math.copysign %[[LHS_IMAG_IS_INF]], %[[LHS_IMAG]] : f32
+// CHECK: %[[LHS_REAL_IS_INF_WITH_SIGN_TIMES_RHS_REAL:.*]] = arith.mulf %[[LHS_REAL_IS_INF_WITH_SIGN]], %[[RHS_REAL]] fastmath<nnan,contract> : f32
+// CHECK: %[[LHS_IMAG_IS_INF_WITH_SIGN_TIMES_RHS_IMAG:.*]] = arith.mulf %[[LHS_IMAG_IS_INF_WITH_SIGN]], %[[RHS_IMAG]] fastmath<nnan,contract> : f32
+// CHECK: %[[INF_MULTIPLICATOR_1:.*]] = arith.addf %[[LHS_REAL_IS_INF_WITH_SIGN_TIMES_RHS_REAL]], %[[LHS_IMAG_IS_INF_WITH_SIGN_TIMES_RHS_IMAG]] fastmath<nnan,contract> : f32
+// CHECK: %[[RESULT_REAL_3:.*]] = arith.mulf %[[INF]], %[[INF_MULTIPLICATOR_1]] fastmath<nnan,contract> : f32
+// CHECK: %[[LHS_REAL_IS_INF_WITH_SIGN_TIMES_RHS_IMAG:.*]] = arith.mulf %[[LHS_REAL_IS_INF_WITH_SIGN]], %[[RHS_IMAG]] fastmath<nnan,contract> : f32
+// CHECK: %[[LHS_IMAG_IS_INF_WITH_SIGN_TIMES_RHS_REAL:.*]] = arith.mulf %[[LHS_IMAG_IS_INF_WITH_SIGN]], %[[RHS_REAL]] fastmath<nnan,contract> : f32
+// CHECK: %[[INF_MULTIPLICATOR_2:.*]] = arith.subf %[[LHS_IMAG_IS_INF_WITH_SIGN_TIMES_RHS_REAL]], %[[LHS_REAL_IS_INF_WITH_SIGN_TIMES_RHS_IMAG]] fastmath<nnan,contract> : f32
+// CHECK: %[[RESULT_IMAG_3:.*]] = arith.mulf %[[INF]], %[[INF_MULTIPLICATOR_2]] fastmath<nnan,contract> : f32
+
+// Case 3. Finite numerator, infinite denominator.
+// CHECK: %[[LHS_REAL_FINITE:.*]] = arith.cmpf one, %[[LHS_REAL_ABS]], %[[INF]] : f32
+// CHECK: %[[LHS_IMAG_FINITE:.*]] = arith.cmpf one, %[[LHS_IMAG_ABS]], %[[INF]] : f32
+// CHECK: %[[LHS_IS_FINITE:.*]] = arith.andi %[[LHS_REAL_FINITE]], %[[LHS_IMAG_FINITE]] : i1
+// CHECK: %[[RHS_REAL_INFINITE:.*]] = arith.cmpf oeq, %[[RHS_REAL_ABS]], %[[INF]] : f32
+// CHECK: %[[RHS_IMAG_INFINITE:.*]] = arith.cmpf oeq, %[[RHS_IMAG_ABS]], %[[INF]] : f32
+// CHECK: %[[RHS_IS_INFINITE:.*]] = arith.ori %[[RHS_REAL_INFINITE]], %[[RHS_IMAG_INFINITE]] : i1
+// CHECK: %[[FINITE_NUM_INFINITE_DENOM:.*]] = arith.andi %[[LHS_IS_FINITE]], %[[RHS_IS_INFINITE]] : i1
+// CHECK: %[[RHS_REAL_IS_INF:.*]] = arith.select %[[RHS_REAL_INFINITE]], %[[ONE]], %[[ZERO]] : f32
+// CHECK: %[[RHS_REAL_IS_INF_WITH_SIGN:.*]] = math.copysign %[[RHS_REAL_IS_INF]], %[[RHS_REAL]] : f32
+// CHECK: %[[RHS_IMAG_IS_INF:.*]] = arith.select %[[RHS_IMAG_INFINITE]], %[[ONE]], %[[ZERO]] : f32
+// CHECK: %[[RHS_IMAG_IS_INF_WITH_SIGN:.*]] = math.copysign %[[RHS_IMAG_IS_INF]], %[[RHS_IMAG]] : f32
+// CHECK: %[[RHS_REAL_IS_INF_WITH_SIGN_TIMES_LHS_REAL:.*]] = arith.mulf %[[LHS_REAL]], %[[RHS_REAL_IS_INF_WITH_SIGN]] fastmath<nnan,contract> : f32
+// CHECK: %[[RHS_IMAG_IS_INF_WITH_SIGN_TIMES_LHS_IMAG:.*]] = arith.mulf %[[LHS_IMAG]], %[[RHS_IMAG_IS_INF_WITH_SIGN]] fastmath<nnan,contract> : f32
+// CHECK: %[[ZERO_MULTIPLICATOR_1:.*]] = arith.addf %[[RHS_REAL_IS_INF_WITH_SIGN_TIMES_LHS_REAL]], %[[RHS_IMAG_IS_INF_WITH_SIGN_TIMES_LHS_IMAG]] fastmath<nnan,contract> : f32
+// CHECK: %[[RESULT_REAL_4:.*]] = arith.mulf %[[ZERO]], %[[ZERO_MULTIPLICATOR_1]] fastmath<nnan,contract> : f32
+// CHECK: %[[RHS_REAL_IS_INF_WITH_SIGN_TIMES_LHS_IMAG:.*]] = arith.mulf %[[LHS_IMAG]], %[[RHS_REAL_IS_INF_WITH_SIGN]] fastmath<nnan,contract> : f32
+// CHECK: %[[RHS_IMAG_IS_INF_WITH_SIGN_TIMES_LHS_REAL:.*]] = arith.mulf %[[LHS_REAL]], %[[RHS_IMAG_IS_INF_WITH_SIGN]] fastmath<nnan,contract> : f32
+// CHECK: %[[ZERO_MULTIPLICATOR_2:.*]] = arith.subf %[[RHS_REAL_IS_INF_WITH_SIGN_TIMES_LHS_IMAG]], %[[RHS_IMAG_IS_INF_WITH_SIGN_TIMES_LHS_REAL]] fastmath<nnan,contract> : f32
+// CHECK: %[[RESULT_IMAG_4:.*]] = arith.mulf %[[ZERO]], %[[ZERO_MULTIPLICATOR_2]] fastmath<nnan,contract> : f32
+
+// CHECK: %[[REAL_ABS_SMALLER_THAN_IMAG_ABS:.*]] = arith.cmpf olt, %[[RHS_REAL_ABS]], %[[RHS_IMAG_ABS]] : f32
+// CHECK: %[[RESULT_REAL:.*]] = arith.select %[[REAL_ABS_SMALLER_THAN_IMAG_ABS]], %[[RESULT_REAL_1]], %[[RESULT_REAL_2]] : f32
+// CHECK: %[[RESULT_IMAG:.*]] = arith.select %[[REAL_ABS_SMALLER_THAN_IMAG_ABS]], %[[RESULT_IMAG_1]], %[[RESULT_IMAG_2]] : f32
+// CHECK: %[[RESULT_REAL_SPECIAL_CASE_3:.*]] = arith.select %[[FINITE_NUM_INFINITE_DENOM]], %[[RESULT_REAL_4]], %[[RESULT_REAL]] : f32
+// CHECK: %[[RESULT_IMAG_SPECIAL_CASE_3:.*]] = arith.select %[[FINITE_NUM_INFINITE_DENOM]], %[[RESULT_IMAG_4]], %[[RESULT_IMAG]] : f32
+// CHECK: %[[RESULT_REAL_SPECIAL_CASE_2:.*]] = arith.select %[[INF_NUM_FINITE_DENOM]], %[[RESULT_REAL_3]], %[[RESULT_REAL_SPECIAL_CASE_3]] : f32
+// CHECK: %[[RESULT_IMAG_SPECIAL_CASE_2:.*]] = arith.select %[[INF_NUM_FINITE_DENOM]], %[[RESULT_IMAG_3]], %[[RESULT_IMAG_SPECIAL_CASE_3]] : f32
+// CHECK: %[[RESULT_REAL_SPECIAL_CASE_1:.*]] = arith.select %[[RESULT_IS_INFINITY]], %[[INFINITY_RESULT_REAL]], %[[RESULT_REAL_SPECIAL_CASE_2]] : f32
+// CHECK: %[[RESULT_IMAG_SPECIAL_CASE_1:.*]] = arith.select %[[RESULT_IS_INFINITY]], %[[INFINITY_RESULT_IMAG]], %[[RESULT_IMAG_SPECIAL_CASE_2]] : f32
+// CHECK: %[[RESULT_REAL_IS_NAN:.*]] = arith.cmpf uno, %[[RESULT_REAL]], %[[ZERO]] : f32
+// CHECK: %[[RESULT_IMAG_IS_NAN:.*]] = arith.cmpf uno, %[[RESULT_IMAG]], %[[ZERO]] : f32
+// CHECK: %[[RESULT_IS_NAN:.*]] = arith.andi %[[RESULT_REAL_IS_NAN]], %[[RESULT_IMAG_IS_NAN]] : i1
+// CHECK: %[[RESULT_REAL_WITH_SPECIAL_CASES:.*]] = arith.select %[[RESULT_IS_NAN]], %[[RESULT_REAL_SPECIAL_CASE_1]], %[[RESULT_REAL]] : f32
+// CHECK: %[[RESULT_IMAG_WITH_SPECIAL_CASES:.*]] = arith.select %[[RESULT_IS_NAN]], %[[RESULT_IMAG_SPECIAL_CASE_1]], %[[RESULT_IMAG]] : f32
+// CHECK: %[[RESULT:.*]] = complex.create %[[RESULT_REAL_WITH_SPECIAL_CASES]], %[[RESULT_IMAG_WITH_SPECIAL_CASES]] : complex<f32>
+// CHECK: return %[[RESULT]] : complex<f32>
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 2652b8665709..8a2d8bd7967c 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -1,6 +1,8 @@
// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s
// RUN: mlir-opt %s -convert-gpu-to-rocdl='index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s
+// CHECK-LABEL: @test_module
+// CHECK-SAME: llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"
gpu.module @test_module {
// CHECK-LABEL: func @gpu_index_ops()
// CHECK32-LABEL: func @gpu_index_ops()
@@ -628,3 +630,11 @@ gpu.module @test_module {
func.return %shfl, %shfli : f32, f32
}
}
+
+// -----
+
+// CHECK-LABEL: @test_custom_data_layout
+// CHECK-SAME: llvm.data_layout = "e"
+gpu.module @test_custom_data_layout attributes {llvm.data_layout = "e"} {
+
+}
diff --git a/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir b/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir
index a1de1ff87c22..983eee732e2a 100644
--- a/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir
+++ b/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir
@@ -1,13 +1,13 @@
-// RUN: mlir-opt %s --pass-pipeline='builtin.module(func.func(amdgpu-optimize-shared-memory))' | FileCheck %s
+// RUN: mlir-opt %s --pass-pipeline='builtin.module(func.func(amdgpu-optimize-shared-memory))' | FileCheck %s
// CHECK: @optimize_shmem([[arg0:%.+]]: memref<{{.*}}>, [[readRow:%.+]]: index, [[readCol:%.+]]: index, [[writeRow:%.+]]: index, [[writeCol:%.+]]: index, [[fragRow:%.+]]: index, [[fragCol:%.+]]: index, [[fragColPerm:%.+]]: index, [[stRow:%.+]]: index, [[stCol:%.+]]: index)
- func.func @optimize_shmem(%arg0: memref<4096x4096xf16>,
+ func.func @optimize_shmem(%arg0: memref<4096x4096xf16>,
%readRow: index, %readCol: index,
%writeRow: index, %writeCol: index,
- %fragRow: index, %fragCol: index,
+ %fragRow: index, %fragCol: index,
%fragColPerm: index,
%stRow: index, %stCol: index) {
- // CHECK: %[[cst:.+]] = arith.constant 0.000000e+00 : f16
+ // CHECK: %[[cst:.+]] = arith.constant 0.000000e+00 : f16
%cst = arith.constant 0.000000e+00 : f16
// CHECK: [[shmA:%.+]] = memref.alloc
@@ -15,42 +15,36 @@
%shmA = memref.alloc() {alignment = 64 : i64} : memref<128x32xf16, 3>
%shmB = memref.alloc() {alignment = 64 : i64} : memref<256x32xf16, 3>
- // CHECK: %[[D0:.+]] = vector.transfer_read [[arg0:%.+]][[[readRow:%.+]], [[readCol:%.+]]], [[cst:.+]] {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16>
%0 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16>
- // CHECK: [[c7:%.+]] = arith.constant 7 : index
- // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c7]]
- // CHECK: [[c2:%.+]] = arith.constant 2 : index
- // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
- // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]]
- // CHECK: vector.transfer_write %[[D0:.+]], [[shmB]][[[writeRow:%.+]], [[writeCol:%.+]]] {in_bounds = [true, true]} : vector<1x8xf16>, memref<256x32xf16, 3>
+ // CHECK: [[c6:%.+]] = arith.constant 6 : index
+ // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c6]]
+ // CHECK: [[c2:%.+]] = arith.constant 2 : index
+ // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
+ // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]]
vector.transfer_write %0, %shmB[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<256x32xf16, 3>
gpu.barrier
gpu.barrier
- // CHECK: [[c7:%.+]] = arith.constant 7 : index
- // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c7]]
- // CHECK: [[c2:%.+]] = arith.constant 2 : index
- // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
+ // CHECK: [[c6:%.+]] = arith.constant 6 : index
+ // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
+ // CHECK: [[c2:%.+]] = arith.constant 2 : index
+ // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
// CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]]
- // CHECK: vector.load [[shmB:%.+]][[[fragRow:%.+]], [[fragColPerm]]] : memref<256x32xf16, 3>, vector<8xf16>
%1 = vector.load %shmB[%fragRow, %fragColPerm] : memref<256x32xf16, 3>, vector<8xf16>
- // CHECK: %[[D2:.+]] = vector.transfer_read [[arg0:%.+]][[[readRow:%.+]], [[readCol:%.+]]], [[cst:.+]] {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16>
%2 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16>
- // CHECK: [[c7:%.+]] = arith.constant 7 : index
- // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c7]]
- // CHECK: [[c2:%.+]] = arith.constant 2 : index
- // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
- // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]]
- // CHECK: vector.transfer_write %[[D2:.+]], [[shmA:%.+]][[[writeRow:%.+]], [[writeCol:%.+]]] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x32xf16, 3>
+ // CHECK: [[c6:%.+]] = arith.constant 6 : index
+ // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c6]]
+ // CHECK: [[c2:%.+]] = arith.constant 2 : index
+ // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
+ // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]]
vector.transfer_write %2, %shmA[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x32xf16, 3>
gpu.barrier
gpu.barrier
- // CHECK: [[c7:%.+]] = arith.constant 7 : index
- // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c7]]
- // CHECK: [[c2:%.+]] = arith.constant 2 : index
- // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
+ // CHECK: [[c6:%.+]] = arith.constant 6 : index
+ // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
+ // CHECK: [[c2:%.+]] = arith.constant 2 : index
+ // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
// CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]]
- // CHECK: vector.load [[shmA:%.+]][[[fragRow:%.+]], [[fragColPerm]]] : memref<128x32xf16, 3>, vector<8xf16>
%3 = vector.load %shmA[%fragRow, %fragColPerm] : memref<128x32xf16, 3>, vector<8xf16>
return
}
diff --git a/mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir b/mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir
index 143e7c2d2709..b1bb91ffc297 100644
--- a/mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir
+++ b/mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir
@@ -1,10 +1,10 @@
-// RUN: mlir-opt %s -transform-interpreter | FileCheck %s
+// RUN: mlir-opt %s -transform-interpreter | FileCheck %s
// CHECK: @optimize_shmem([[arg0:%.+]]: memref<{{.*}}>, [[readRow:%.+]]: index, [[readCol:%.+]]: index, [[writeRow:%.+]]: index, [[writeCol:%.+]]: index, [[fragRow:%.+]]: index, [[fragCol:%.+]]: index, [[fragColPerm:%.+]]: index, [[stRow:%.+]]: index, [[stCol:%.+]]: index)
- func.func @optimize_shmem(%arg0: memref<4096x4096xf16>,
+ func.func @optimize_shmem(%arg0: memref<4096x4096xf16>,
%readRow: index, %readCol: index,
%writeRow: index, %writeCol: index,
- %fragRow: index, %fragCol: index,
+ %fragRow: index, %fragCol: index,
%fragColPerm: index,
%stRow: index, %stCol: index) {
%cst = arith.constant 0.000000e+00 : f16
@@ -13,33 +13,33 @@
%shmB = memref.alloc() {alignment = 64 : i64} : memref<256x32xf16, 3>
%0 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16>
- // CHECK: [[c7:%.+]] = arith.constant 7 : index
- // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c7]]
- // CHECK: [[c2:%.+]] = arith.constant 2 : index
- // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
- // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]]
+ // CHECK: [[c6:%.+]] = arith.constant 6 : index
+ // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c6]]
+ // CHECK: [[c2:%.+]] = arith.constant 2 : index
+ // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
+ // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]]
vector.transfer_write %0, %shmB[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<256x32xf16, 3>
gpu.barrier
gpu.barrier
- // CHECK: [[c7:%.+]] = arith.constant 7 : index
- // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c7]]
- // CHECK: [[c2:%.+]] = arith.constant 2 : index
- // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
- // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]]
+ // CHECK: [[c6:%.+]] = arith.constant 6 : index
+ // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
+ // CHECK: [[c2:%.+]] = arith.constant 2 : index
+ // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
+ // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]]
%1 = vector.load %shmB[%fragRow, %fragColPerm] : memref<256x32xf16, 3>, vector<8xf16>
%2 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16>
- // CHECK: [[c7:%.+]] = arith.constant 7 : index
- // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c7]]
- // CHECK: [[c2:%.+]] = arith.constant 2 : index
- // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
- // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]]
+ // CHECK: [[c6:%.+]] = arith.constant 6 : index
+ // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c6]]
+ // CHECK: [[c2:%.+]] = arith.constant 2 : index
+ // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
+ // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]]
vector.transfer_write %2, %shmA[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x32xf16, 3>
gpu.barrier
gpu.barrier
- // CHECK: [[c7:%.+]] = arith.constant 7 : index
- // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c7]]
- // CHECK: [[c2:%.+]] = arith.constant 2 : index
- // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
+ // CHECK: [[c6:%.+]] = arith.constant 6 : index
+ // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
+ // CHECK: [[c2:%.+]] = arith.constant 2 : index
+ // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
// CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]]
%3 = vector.load %shmA[%fragRow, %fragColPerm] : memref<128x32xf16, 3>, vector<8xf16>
return
@@ -48,7 +48,7 @@
module attributes { transform.with_named_sequence } {
transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
- transform.amdgpu.optimize_shared_memory_reads_and_writes %0 : (!transform.any_op) -> ()
+ transform.amdgpu.optimize_shared_memory_reads_and_writes %0 {sharedMemoryLineSizeBytes = 128, defaultVectorSizeBits = 128}: (!transform.any_op) -> ()
transform.yield
} // @__transform_main
} // module
diff --git a/mlir/test/Dialect/Arith/expand-ops.mlir b/mlir/test/Dialect/Arith/expand-ops.mlir
index 046e8ff64fba..91f652e5a270 100644
--- a/mlir/test/Dialect/Arith/expand-ops.mlir
+++ b/mlir/test/Dialect/Arith/expand-ops.mlir
@@ -255,36 +255,21 @@ func.func @truncf_f32(%arg0 : f32) -> bf16 {
}
// CHECK-LABEL: @truncf_f32
-
-// CHECK-DAG: %[[C16:.+]] = arith.constant 16
-// CHECK-DAG: %[[C32768:.+]] = arith.constant 32768
-// CHECK-DAG: %[[C2130706432:.+]] = arith.constant 2130706432
-// CHECK-DAG: %[[C2139095040:.+]] = arith.constant 2139095040
-// CHECK-DAG: %[[C8388607:.+]] = arith.constant 8388607
-// CHECK-DAG: %[[C31:.+]] = arith.constant 31
-// CHECK-DAG: %[[C23:.+]] = arith.constant 23
-// CHECK-DAG: %[[BITCAST:.+]] = arith.bitcast %arg0
-// CHECK-DAG: %[[SIGN:.+]] = arith.shrui %[[BITCAST:.+]], %[[C31]]
-// CHECK-DAG: %[[ROUND:.+]] = arith.subi %[[C32768]], %[[SIGN]]
-// CHECK-DAG: %[[MANTISSA:.+]] = arith.andi %[[BITCAST]], %[[C8388607]]
-// CHECK-DAG: %[[ROUNDED:.+]] = arith.addi %[[MANTISSA]], %[[ROUND]]
-// CHECK-DAG: %[[ROLL:.+]] = arith.shrui %[[ROUNDED]], %[[C23]]
-// CHECK-DAG: %[[SHR:.+]] = arith.shrui %[[ROUNDED]], %[[ROLL]]
-// CHECK-DAG: %[[EXP:.+]] = arith.andi %0, %[[C2139095040]]
-// CHECK-DAG: %[[EXPROUND:.+]] = arith.addi %[[EXP]], %[[ROUNDED]]
-// CHECK-DAG: %[[EXPROLL:.+]] = arith.andi %[[EXPROUND]], %[[C2139095040]]
-// CHECK-DAG: %[[EXPMAX:.+]] = arith.cmpi uge, %[[EXP]], %[[C2130706432]]
-// CHECK-DAG: %[[EXPNEW:.+]] = arith.select %[[EXPMAX]], %[[EXP]], %[[EXPROLL]]
-// CHECK-DAG: %[[OVERFLOW_B:.+]] = arith.trunci %[[ROLL]]
-// CHECK-DAG: %[[KEEP_MAN:.+]] = arith.andi %[[EXPMAX]], %[[OVERFLOW_B]]
-// CHECK-DAG: %[[MANNEW:.+]] = arith.select %[[KEEP_MAN]], %[[MANTISSA]], %[[SHR]]
-// CHECK-DAG: %[[NEWSIGN:.+]] = arith.shli %[[SIGN]], %[[C31]]
-// CHECK-DAG: %[[WITHEXP:.+]] = arith.ori %[[NEWSIGN]], %[[EXPNEW]]
-// CHECK-DAG: %[[WITHMAN:.+]] = arith.ori %[[WITHEXP]], %[[MANNEW]]
-// CHECK-DAG: %[[SHIFT:.+]] = arith.shrui %[[WITHMAN]], %[[C16]]
-// CHECK-DAG: %[[TRUNC:.+]] = arith.trunci %[[SHIFT]]
-// CHECK-DAG: %[[RES:.+]] = arith.bitcast %[[TRUNC]]
-// CHECK: return %[[RES]]
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : i32
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : i32
+// CHECK-DAG: %[[C7FC0_i16:.+]] = arith.constant 32704 : i16
+// CHECK-DAG: %[[C7FFF:.+]] = arith.constant 32767 : i32
+// CHECK-DAG: %[[ISNAN:.+]] = arith.cmpf une, %arg0, %arg0 : f32
+// CHECK-DAG: %[[BITCAST:.+]] = arith.bitcast %arg0 : f32 to i32
+// CHECK-DAG: %[[SHRUI:.+]] = arith.shrui %[[BITCAST]], %[[C16]] : i32
+// CHECK-DAG: %[[BIT16:.+]] = arith.andi %[[SHRUI]], %[[C1]] : i32
+// CHECK-DAG: %[[ROUNDING_BIAS:.+]] = arith.addi %[[BIT16]], %[[C7FFF]] : i32
+// CHECK-DAG: %[[BIASED:.+]] = arith.addi %[[BITCAST]], %[[ROUNDING_BIAS]] : i32
+// CHECK-DAG: %[[BIASED_SHIFTED:.+]] = arith.shrui %[[BIASED]], %[[C16]] : i32
+// CHECK-DAG: %[[NORMAL_CASE_RESULT_i16:.+]] = arith.trunci %[[BIASED_SHIFTED]] : i32 to i16
+// CHECK-DAG: %[[SELECT:.+]] = arith.select %[[ISNAN]], %[[C7FC0_i16]], %[[NORMAL_CASE_RESULT_i16]] : i16
+// CHECK-DAG: %[[RESULT:.+]] = arith.bitcast %[[SELECT]] : i16 to bf16
+// CHECK: return %[[RESULT]]
// -----
diff --git a/mlir/test/Dialect/Linalg/flatten-elementwise.mlir b/mlir/test/Dialect/Linalg/flatten-elementwise.mlir
new file mode 100644
index 000000000000..858c133dd536
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/flatten-elementwise.mlir
@@ -0,0 +1,99 @@
+// RUN: mlir-opt %s -transform-interpreter -split-input-file | FileCheck %s
+
+// CHECK-LABEL: func.func @fill(
+// CHECK-SAME: %[[ARG0:.*]]: f32,
+// CHECK-SAME: %[[ARG1:.*]]: memref<32x7xf32>
+// CHECK-NEXT: %[[FLATTENED:.*]] = memref.collapse_shape %[[ARG1]] {{\[}}[0, 1]]
+// CHECK-NEXT: linalg.fill ins(%[[ARG0]] : f32) outs(%[[FLATTENED]] : memref<224xf32>)
+func.func @fill(%cst: f32, %arg: memref<32x7xf32>) {
+ linalg.fill ins(%cst: f32) outs(%arg: memref<32x7xf32>)
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match interface{LinalgOp} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %flattened = transform.structured.flatten_elementwise %0
+ : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: func.func @fill_tensor(
+// CHECK-SAME: %[[ARG0:.*]]: f32,
+// CHECK-SAME: %[[ARG1:.*]]: tensor<32x7xf32>
+// CHECK-NEXT: %[[FLATTENED:.*]] = tensor.collapse_shape %[[ARG1]] {{\[}}[0, 1]]
+// CHECK-NEXT: %[[FLATTENED_RESULT:.*]] = linalg.fill ins(%[[ARG0]] : f32) outs(%[[FLATTENED]] : tensor<224xf32>)
+// CHECK-NEXT: %[[RESULT:.*]] = tensor.expand_shape %[[FLATTENED_RESULT]] {{\[}}[0, 1]]
+func.func @fill_tensor(%cst: f32, %arg: tensor<32x7xf32>) -> tensor<32x7xf32> {
+ %0 = linalg.fill ins(%cst: f32) outs(%arg: tensor<32x7xf32>) -> tensor<32x7xf32>
+ return %0 : tensor<32x7xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match interface{LinalgOp} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %flattened = transform.structured.flatten_elementwise %0
+ : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: func.func @map(
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref<32x7xf32>
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref<32x7xf32>
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref<32x7xf32>
+// CHECK-NEXT: %[[FLATTENED_0:.*]] = memref.collapse_shape %[[ARG0]] {{\[}}[0, 1]]
+// CHECK-NEXT: %[[FLATTENED_1:.*]] = memref.collapse_shape %[[ARG1]] {{\[}}[0, 1]]
+// CHECK-NEXT: %[[FLATTENED_2:.*]] = memref.collapse_shape %[[ARG2]] {{\[}}[0, 1]]
+// CHECK-NEXT: linalg.map { arith.addf } ins(%[[FLATTENED_0]], %[[FLATTENED_1]] : memref<224xf32>, memref<224xf32>) outs(%[[FLATTENED_2]] : memref<224xf32>)
+func.func @map(%arg0: memref<32x7xf32>, %arg1: memref<32x7xf32>, %arg2: memref<32x7xf32>) {
+ linalg.map {arith.addf} ins(%arg0, %arg1: memref<32x7xf32>, memref<32x7xf32>) outs(%arg2: memref<32x7xf32>)
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match interface{LinalgOp} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %flattened = transform.structured.flatten_elementwise %0
+ : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK: #[[$MAP0:.*]] = affine_map<(d0) -> (d0)>
+// CHECK-LABEL: func.func @generic
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref<32x7xf32>
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref<32x7xf32>
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref<32x7xf32>
+// CHECK-NEXT: %[[FLATTENED_0:.*]] = memref.collapse_shape %[[ARG0]] {{\[}}[0, 1]]
+// CHECK-NEXT: %[[FLATTENED_1:.*]] = memref.collapse_shape %[[ARG1]] {{\[}}[0, 1]]
+// CHECK-NEXT: %[[FLATTENED_2:.*]] = memref.collapse_shape %[[ARG2]] {{\[}}[0, 1]]
+// CHECK-NEXT: linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP0]], #[[$MAP0]]], iterator_types = ["parallel"]} ins(%[[FLATTENED_0]], %[[FLATTENED_1]] : memref<224xf32>, memref<224xf32>) outs(%[[FLATTENED_2]] : memref<224xf32>)
+// CHECK-NEXT: ^bb0(%[[A:.*]]: f32, %[[B:.*]]: f32, %[[C:.*]]: f32)
+// CHECK-NEXT: %[[SUM:.*]] = arith.addf %[[A]], %[[B]]
+// CHECK-NEXT: linalg.yield %[[SUM]]
+#map = affine_map<(d0, d1) -> (d0, d1)>
+func.func @generic( %arg0: memref<32x7xf32>, %arg1: memref<32x7xf32>, %arg2: memref<32x7xf32>) {
+ linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1: memref<32x7xf32>, memref<32x7xf32>) outs(%arg2: memref<32x7xf32>) {
+ ^bb0(%a: f32, %b: f32, %c: f32):
+ %0 = arith.addf %a, %b : f32
+ linalg.yield %0 : f32
+ }
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match interface{LinalgOp} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %flattened = transform.structured.flatten_elementwise %0
+ : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+}
diff --git a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
index 96953c234a08..85e1c56dd45a 100644
--- a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
+++ b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
@@ -550,3 +550,48 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
+
+// -----
+
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d0 + d1 + d2)>
+func.func @vectorize_reverse_like_tensor_extract(%arg0: tensor<1x2x3xf32>, %arg1: tensor<1x1x3xf32>, %arg2: index) -> tensor<1x1x3xf32> {
+ %c1 = arith.constant 1 : index
+ %c0 = arith.constant 0 : index
+ %c2 = arith.constant 2 : index
+ %0 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel", "parallel"]} outs(%arg1 : tensor<1x1x3xf32>) {
+ ^bb0(%out: f32):
+ %1 = linalg.index 1 : index
+ %2 = linalg.index 0 : index
+ %3 = affine.apply #map1(%1, %2, %arg2)
+ %4 = linalg.index 2 : index
+ %5 = arith.subi %c2, %4 : index
+ %extracted = tensor.extract %arg0[%c0, %3, %5] : tensor<1x2x3xf32>
+ linalg.yield %extracted : f32
+ } -> tensor<1x1x3xf32>
+ return %0 : tensor<1x1x3xf32>
+}
+// CHECK-LABEL: func.func @vectorize_reverse_like_tensor_extract
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]
+// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]*]]
+// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]*]]
+// CHECK-DAG: %[[CST:.+]] = arith.constant dense<3> : vector<1x1x3xindex>
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[MASK:.*]] = arith.constant dense<true> : vector<1x1x3xi1>
+// CHECK-DAG: %[[PASSTHRU:.*]] = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
+// CHECK-DAG: %[[INIT_IDX:.+]] = arith.constant dense<[2, 1, 0]> : vector<3xindex>
+// CHECK: %[[T0:.+]] = vector.broadcast %[[ARG2]] : index to vector<1x1x3xindex>
+// CHECK: %[[T1:.+]] = arith.muli %[[T0]], %[[CST]] : vector<1x1x3xindex>
+// CHECK: %[[T2:.+]] = vector.broadcast %[[INIT_IDX]]
+// CHECK: %[[T3:.+]] = arith.addi %[[T2]], %[[T1]]
+// CHECK: %[[GATHER:.*]] = vector.gather %[[ARG0]][%[[C0]], %[[C0]], %[[C0]]] [%[[T3]]], %[[MASK]], %[[PASSTHRU]]
+// CHECK: vector.transfer_write %[[GATHER]]
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+ %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_nd_extract } : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+}
diff --git a/mlir/test/Dialect/SparseTensor/codegen.mlir b/mlir/test/Dialect/SparseTensor/codegen.mlir
index c1a976c84fec..b63762485c96 100644
--- a/mlir/test/Dialect/SparseTensor/codegen.mlir
+++ b/mlir/test/Dialect/SparseTensor/codegen.mlir
@@ -34,6 +34,10 @@
map = (d0, d1) -> (d1 : dense, d0 : compressed)
}>
+#BCSR = #sparse_tensor.encoding<{
+ map = (d0, d1, d2, d3) -> (d0: batch, d1: batch, d2 : dense, d3 : compressed)
+}>
+
#DCSR = #sparse_tensor.encoding<{
map = (d0, d1) -> (d0 : compressed, d1 : compressed),
crdWidth = 64,
@@ -182,6 +186,36 @@ func.func @sparse_csr(%arg0: tensor<?x?xf64, #CSR>) {
return
}
+// CHECK-LABEL: func @sparse_bcsr_0(
+// CHECK-SAME: %[[A1:.*0]]: memref<?x2x?xindex>,
+// CHECK-SAME: %[[A2:.*1]]: memref<?x2x?xindex>,
+// CHECK-SAME: %[[A3:.*]]: memref<?x2x?xf64>,
+// CHECK-SAME: %[[A4:.*]]: !sparse_tensor.storage_specifier
+// CHECK: return
+func.func @sparse_bcsr_0(%arg0: tensor<?x2x?x?xf64, #BCSR>) {
+ return
+}
+
+// CHECK-LABEL: func @sparse_bcsr_1(
+// CHECK-SAME: %[[A1:.*0]]: memref<?x?x?xindex>,
+// CHECK-SAME: %[[A2:.*1]]: memref<?x?x?xindex>,
+// CHECK-SAME: %[[A3:.*]]: memref<?x?x?xf64>,
+// CHECK-SAME: %[[A4:.*]]: !sparse_tensor.storage_specifier
+// CHECK: return
+func.func @sparse_bcsr_1(%arg0: tensor<?x?x?x?xf64, #BCSR>) {
+ return
+}
+
+// CHECK-LABEL: func @sparse_bcsr_2(
+// CHECK-SAME: %[[A1:.*0]]: memref<18x6x?xindex>,
+// CHECK-SAME: %[[A2:.*1]]: memref<18x6x?xindex>,
+// CHECK-SAME: %[[A3:.*]]: memref<18x6x?xf64>,
+// CHECK-SAME: %[[A4:.*]]: !sparse_tensor.storage_specifier
+// CHECK: return
+func.func @sparse_bcsr_2(%arg0: tensor<18x6x4x2xf64, #BCSR>) {
+ return
+}
+
// CHECK-LABEL: func @sparse_dcsr(
// CHECK-SAME: %[[A0:.*0]]: memref<?xi32>,
// CHECK-SAME: %[[A1:.*1]]: memref<?xi64>,
diff --git a/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir b/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir
index 9ed3cee25914..8096c010ac93 100644
--- a/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir
+++ b/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir
@@ -54,6 +54,12 @@ func.func private @tensor_dimlevel_size_mismatch(%arg0: tensor<8xi32, #a>) -> ()
// -----
+// expected-error@+1 {{Batch lvlType can only be leading levels}}
+#a = #sparse_tensor.encoding<{map = (d0, d1, d2) -> (d0 : batch, d1 : compressed, d2: batch)}>
+func.func private @non_leading_batch(%arg0: tensor<?x?x?i32, #a>) -> ()
+
+// -----
+
// expected-error@+1 {{use of undeclared identifier}}
#a = #sparse_tensor.encoding<{map = (d0) -> (d0 : dense, d1 : compressed)}>
func.func private @tensor_sizes_mismatch(%arg0: tensor<8xi32, #a>) -> ()
diff --git a/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir b/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir
index 9d5118ceecc5..66e61afd897d 100644
--- a/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir
+++ b/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir
@@ -22,6 +22,17 @@ func.func private @sparse_csr(tensor<?x?xf32, #CSR>)
// -----
+#BCSR = #sparse_tensor.encoding<{
+ map = (d0, d1, d2) -> (d0 : batch, d1: dense, d2 : compressed),
+}>
+
+// CHECK: #[[$BCSR:.*]] = #sparse_tensor.encoding<{ map = (d0, d1, d2) -> (d0 : batch, d1 : dense, d2 : compressed) }>
+// CHECK-LABEL: func private @sparse_bcsr(
+// CHECK-SAME: tensor<?x?x?xf32, #[[$BCSR]]>)
+func.func private @sparse_bcsr(tensor<?x?x?xf32, #BCSR>)
+
+// -----
+
#CSR_explicit = #sparse_tensor.encoding<{
map = {l0, l1} (d0 = l0, d1 = l1) -> (l0 = d0 : dense, l1 = d1 : compressed)
}>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir b/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir
index d04fbe8ed5c2..6e8a26762d90 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir
@@ -14,7 +14,7 @@
// CHECK-DAG: %[[VAL_8:.*]] = arith.constant true
// CHECK-DAG: %[[VAL_9:.*]] = arith.constant 100 : index
// CHECK-DAG: %[[VAL_10:.*]] = arith.constant 300 : index
-// CHECK-DAG: %[[VAL_11:.*]] = arith.constant 131072 : i64
+// CHECK-DAG: %[[VAL_11:.*]] = arith.constant 262144 : i64
// CHECK: %[[VAL_12:.*]] = memref.alloca() : memref<2xi64>
// CHECK: %[[VAL_13:.*]] = memref.cast %[[VAL_12]] : memref<2xi64> to memref<?xi64>
// CHECK: memref.store %[[VAL_11]], %[[VAL_12]]{{\[}}%[[VAL_5]]] : memref<2xi64>
diff --git a/mlir/test/Dialect/Transform/include/test-interpreter-external-concurrent-source.mlir b/mlir/test/Dialect/Transform/include/test-interpreter-external-concurrent-source.mlir
index 316b90f85236..255ff5f31ed3 100644
--- a/mlir/test/Dialect/Transform/include/test-interpreter-external-concurrent-source.mlir
+++ b/mlir/test/Dialect/Transform/include/test-interpreter-external-concurrent-source.mlir
@@ -1,16 +1,21 @@
// RUN: mlir-opt %s
// No need to check anything else than parsing here, this is being used by another test as data.
-transform.with_pdl_patterns {
-^bb0(%arg0: !transform.any_op):
- pdl.pattern @func_return : benefit(1) {
- %0 = pdl.operation "func.return"
- pdl.rewrite %0 with "transform.dialect"
- }
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%root: !transform.any_op) {
+ transform.with_pdl_patterns %root : !transform.any_op {
+ ^bb0(%arg0: !transform.any_op):
+ pdl.pattern @func_return : benefit(1) {
+ %0 = pdl.operation "func.return"
+ pdl.rewrite %0 with "transform.dialect"
+ }
- sequence %arg0 : !transform.any_op failures(propagate) {
- ^bb1(%arg1: !transform.any_op):
- %0 = pdl_match @func_return in %arg1 : (!transform.any_op) -> !transform.op<"func.return">
- transform.debug.emit_remark_at %0, "matched" : !transform.op<"func.return">
+ sequence %arg0 : !transform.any_op failures(propagate) {
+ ^bb1(%arg1: !transform.any_op):
+ %0 = pdl_match @func_return in %arg1 : (!transform.any_op) -> !transform.op<"func.return">
+ transform.debug.emit_remark_at %0, "matched" : !transform.op<"func.return">
+ }
+ }
+ transform.yield
}
}
diff --git a/mlir/test/Dialect/Transform/include/test-interpreter-external-source.mlir b/mlir/test/Dialect/Transform/include/test-interpreter-external-source.mlir
index 5956c86ebbe4..f6b7f787cc2c 100644
--- a/mlir/test/Dialect/Transform/include/test-interpreter-external-source.mlir
+++ b/mlir/test/Dialect/Transform/include/test-interpreter-external-source.mlir
@@ -1,11 +1,13 @@
// RUN: mlir-opt %s
// No need to check anything else than parsing here, this is being used by another test as data.
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
- transform.debug.emit_remark_at %arg0, "outer" : !transform.any_op
- transform.sequence %arg0 : !transform.any_op failures(propagate) attributes {transform.target_tag="transform"} {
- ^bb1(%arg1: !transform.any_op):
- transform.debug.emit_remark_at %arg1, "inner" : !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
+ transform.debug.emit_remark_at %arg0, "outer" : !transform.any_op
+ transform.sequence %arg0 : !transform.any_op failures(propagate) attributes {transform.target_tag="transform"} {
+ ^bb1(%arg1: !transform.any_op):
+ transform.debug.emit_remark_at %arg1, "inner" : !transform.any_op
+ }
+ transform.yield
}
}
diff --git a/mlir/test/Dialect/Transform/multi-arg-top-level-ops.mlir b/mlir/test/Dialect/Transform/multi-arg-top-level-ops.mlir
index 9a7e7ca2f953..1c018b1b1f77 100644
--- a/mlir/test/Dialect/Transform/multi-arg-top-level-ops.mlir
+++ b/mlir/test/Dialect/Transform/multi-arg-top-level-ops.mlir
@@ -1,10 +1,15 @@
-// RUN: mlir-opt %s --pass-pipeline='builtin.module(test-transform-dialect-interpreter{bind-first-extra-to-ops=func.func bind-second-extra-to-ops=func.return})' \
-// RUN: --split-input-file --verify-diagnostics
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(transform-interpreter{\
+// RUN: debug-bind-trailing-args=func.func,func.return})" \
+// RUN: --split-input-file --verify-diagnostics
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op, %arg1: !transform.any_op, %arg2: !transform.any_op):
- transform.debug.emit_remark_at %arg1, "first extra" : !transform.any_op
- transform.debug.emit_remark_at %arg2, "second extra" : !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(
+ %arg0: !transform.any_op, %arg1: !transform.any_op,
+ %arg2: !transform.any_op) {
+ transform.debug.emit_remark_at %arg1, "first extra" : !transform.any_op
+ transform.debug.emit_remark_at %arg2, "second extra" : !transform.any_op
+ transform.yield
+ }
}
// expected-remark @below {{first extra}}
@@ -26,9 +31,13 @@ func.func @bar(%arg0: i1) {
// -----
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op, %arg1: !transform.any_op, %arg2: !transform.param<i64>):
- // expected-error @above {{wrong kind of value provided for top-level parameter}}
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(
+ %arg0: !transform.any_op, %arg1: !transform.any_op,
+ %arg2: !transform.param<i64>) {
+ // expected-error @above {{wrong kind of value provided for top-level parameter}}
+ transform.yield
+ }
}
func.func @foo() {
@@ -37,9 +46,13 @@ func.func @foo() {
// -----
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op, %arg1: !transform.any_op, %arg2: !transform.any_value):
- // expected-error @above {{wrong kind of value provided for the top-level value handle}}
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(
+ %arg0: !transform.any_op, %arg1: !transform.any_op,
+ %arg2: !transform.any_value) {
+ // expected-error @above {{wrong kind of value provided for the top-level value handle}}
+ transform.yield
+ }
}
func.func @foo() {
@@ -48,19 +61,27 @@ func.func @foo() {
// -----
-// expected-error @below {{operation expects 1 extra value bindings, but 2 were provided to the interpreter}}
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op, %arg1: !transform.any_op):
+
+module attributes {transform.with_named_sequence} {
+ // expected-error @below {{operation expects 1 extra value bindings, but 2 were provided to the interpreter}}
+ transform.named_sequence @__transform_main(
+ %arg0: !transform.any_op, %arg1: !transform.any_op) {
+ transform.yield
+ }
}
// -----
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op, %arg1: !transform.any_op, %arg2: !transform.any_op):
- transform.sequence %arg0, %arg1, %arg2 : !transform.any_op, !transform.any_op, !transform.any_op failures(propagate) {
- ^bb0(%arg3: !transform.any_op, %arg4: !transform.any_op, %arg5: !transform.any_op):
- transform.debug.emit_remark_at %arg4, "first extra" : !transform.any_op
- transform.debug.emit_remark_at %arg5, "second extra" : !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(
+ %arg0: !transform.any_op, %arg1: !transform.any_op,
+ %arg2: !transform.any_op) {
+ transform.sequence %arg0, %arg1, %arg2 : !transform.any_op, !transform.any_op, !transform.any_op failures(propagate) {
+ ^bb0(%arg3: !transform.any_op, %arg4: !transform.any_op, %arg5: !transform.any_op):
+ transform.debug.emit_remark_at %arg4, "first extra" : !transform.any_op
+ transform.debug.emit_remark_at %arg5, "second extra" : !transform.any_op
+ }
+ transform.yield
}
}
diff --git a/mlir/test/Dialect/Transform/multi-arg-top-level-params.mlir b/mlir/test/Dialect/Transform/multi-arg-top-level-params.mlir
index f59a4b6d4ccc..6486bcae3294 100644
--- a/mlir/test/Dialect/Transform/multi-arg-top-level-params.mlir
+++ b/mlir/test/Dialect/Transform/multi-arg-top-level-params.mlir
@@ -1,24 +1,37 @@
-// RUN: mlir-opt %s --pass-pipeline='builtin.module(test-transform-dialect-interpreter{bind-first-extra-to-params=1,2,3 bind-second-extra-to-params=42,45})' \
+// RUN: mlir-opt %s --pass-pipeline='builtin.module(transform-interpreter{\
+// RUN: debug-bind-trailing-args=#1;2;3,#42;45})' \
// RUN: --split-input-file --verify-diagnostics
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op, %arg1: !transform.param<i64>, %arg2: !transform.param<i64>):
- // expected-remark @below {{1 : i64, 2 : i64, 3 : i64}}
- transform.debug.emit_param_as_remark %arg1 : !transform.param<i64>
- // expected-remark @below {{42 : i64, 45 : i64}}
- transform.debug.emit_param_as_remark %arg2 : !transform.param<i64>
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(
+ %arg0: !transform.any_op, %arg1: !transform.param<i64>,
+ %arg2: !transform.param<i64>) {
+ // expected-remark @below {{1 : i64, 2 : i64, 3 : i64}}
+ transform.debug.emit_param_as_remark %arg1 : !transform.param<i64>
+ // expected-remark @below {{42 : i64, 45 : i64}}
+ transform.debug.emit_param_as_remark %arg2 : !transform.param<i64>
+ transform.yield
+ }
}
// -----
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op, %arg1: !transform.any_op, %arg2: !transform.param<i64>):
- // expected-error @above {{wrong kind of value provided for top-level operation handle}}
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(
+ %arg0: !transform.any_op, %arg1: !transform.any_op,
+ // expected-error @above {{wrong kind of value provided for top-level operation handle}}
+ %arg2: !transform.param<i64>) {
+ transform.yield
+ }
}
// -----
-// expected-error @below {{operation expects 3 extra value bindings, but 2 were provided to the interpreter}}
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op, %arg1: !transform.param<i64>, %arg2: !transform.param<i64>, %arg3: !transform.param<i64>):
+module attributes {transform.with_named_sequence} {
+ // expected-error @below {{operation expects 3 extra value bindings, but 2 were provided to the interpreter}}
+ transform.named_sequence @__transform_main(
+ %arg0: !transform.any_op, %arg1: !transform.param<i64>,
+ %arg2: !transform.param<i64>, %arg3: !transform.param<i64>) {
+ transform.yield
+ }
}
diff --git a/mlir/test/Dialect/Transform/multi-arg-top-level-values.mlir b/mlir/test/Dialect/Transform/multi-arg-top-level-values.mlir
index 38d7e2869777..dcc1079267dc 100644
--- a/mlir/test/Dialect/Transform/multi-arg-top-level-values.mlir
+++ b/mlir/test/Dialect/Transform/multi-arg-top-level-values.mlir
@@ -1,4 +1,5 @@
-// RUN: mlir-opt %s --pass-pipeline='builtin.module(test-transform-dialect-interpreter{bind-first-extra-to-results-of-ops=test.some_returning_op bind-second-extra-to-results-of-ops=test.some_other_returning_op})' \
+// RUN: mlir-opt %s --pass-pipeline='builtin.module(transform-interpreter{\
+// RUN: debug-bind-trailing-args=^test.some_returning_op,^test.some_other_returning_op})' \
// RUN: --split-input-file --verify-diagnostics
// Note that diagnostic checker will merge two diagnostics with the same message
@@ -21,10 +22,14 @@
// expected-note @below {{value handle points to an op result #1}}
%2:2 = "test.some_other_returning_op"() : () -> (f32, f64)
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op, %arg1: !transform.any_value, %arg2: !transform.any_value):
- transform.debug.emit_remark_at %arg1, "first extra" : !transform.any_value
- transform.debug.emit_remark_at %arg2, "second extra" : !transform.any_value
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(
+ %arg0: !transform.any_op, %arg1: !transform.any_value,
+ %arg2: !transform.any_value) {
+ transform.debug.emit_remark_at %arg1, "first extra" : !transform.any_value
+ transform.debug.emit_remark_at %arg2, "second extra" : !transform.any_value
+ transform.yield
+ }
}
// -----
@@ -32,14 +37,19 @@ transform.sequence failures(propagate) {
%0:2 = "test.some_returning_op"() : () -> (i32, i64)
%1 = "test.some_returning_op"() : () -> index
-transform.sequence failures(propagate) {
-// expected-error @below {{wrong kind of value provided for top-level operation handle}}
-^bb0(%arg0: !transform.any_op, %arg1: !transform.any_op, %arg2: !transform.any_value):
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(
+ // expected-error @below {{wrong kind of value provided for top-level operation handle}}
+ %arg0: !transform.any_op, %arg1: !transform.any_op, %arg2: !transform.any_value) {
+ transform.yield
+ }
}
// -----
-// expected-error @below {{operation expects 1 extra value bindings, but 2 were provided to the interpreter}}
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op, %arg1: !transform.any_value):
+module attributes {transform.with_named_sequence} {
+ // expected-error @below {{operation expects 1 extra value bindings, but 2 were provided to the interpreter}}
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op, %arg1: !transform.any_value) {
+ transform.yield
+ }
}
diff --git a/mlir/test/Dialect/Transform/test-interpreter-debug.mlir b/mlir/test/Dialect/Transform/test-interpreter-debug.mlir
index c7dad582dd43..99301ea23c6f 100644
--- a/mlir/test/Dialect/Transform/test-interpreter-debug.mlir
+++ b/mlir/test/Dialect/Transform/test-interpreter-debug.mlir
@@ -1,19 +1,21 @@
-// RUN: mlir-opt %s --pass-pipeline="builtin.module(test-transform-dialect-interpreter{debug-payload-root-tag=payload debug-transform-root-tag=transform})" \
-// RUN: --allow-unregistered-dialect --split-input-file --verify-diagnostics
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(transform-interpreter{\
+// RUN: debug-payload-root-tag=payload \
+// RUN: entry-point=transform})" \
+// RUN: --allow-unregistered-dialect --split-input-file --verify-diagnostics
// expected-error @below {{could not find the operation with transform.target_tag="payload" attribute}}
-module {
- transform.sequence failures(suppress) {
- ^bb0(%arg0: !transform.any_op):
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @transform(%arg0: !transform.any_op) {
+ transform.yield
}
}
// -----
-// expected-error @below {{could not find the operation with transform.target_tag="transform" attribute}}
-module {
- transform.sequence failures(suppress) {
- ^bb0(%arg0: !transform.any_op):
+// expected-error @below {{could not find a nested named sequence with name: transform}}
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @not_transform(%arg0: !transform.any_op) {
+ transform.yield
}
module attributes {transform.target_tag="payload"} {}
@@ -21,42 +23,16 @@ module {
// -----
-// expected-error @below {{more than one operation with transform.target_tag="transform" attribute}}
-module {
- // expected-note @below {{first operation}}
- transform.sequence failures(propagate) attributes {transform.target_tag="transform"} {
- ^bb0(%arg0: !transform.any_op):
- }
-
- // expected-note @below {{other operation}}
- transform.sequence failures(propagate) attributes {transform.target_tag="transform"} {
- ^bb0(%arg0: !transform.any_op):
- }
-
- module attributes {transform.target_tag="payload"} {}
-}
-
-// -----
-
-module {
- // expected-error @below {{expected the transform entry point to be a top-level transform op}}
- func.func private @foo() attributes {transform.target_tag="transform"}
-
- module attributes {transform.target_tag="payload"} {}
-}
-
-// -----
-
-module {
- transform.sequence failures(suppress) attributes {transform.target_tag="transform"} {
- ^bb0(%arg0: !transform.any_op):
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @transform(%arg0: !transform.any_op) {
transform.debug.emit_remark_at %arg0, "payload" : !transform.any_op
+ transform.yield
}
- // This will not be executed because it's not tagged.
- transform.sequence failures(suppress) {
- ^bb0(%arg0: !transform.any_op):
+ // This will not be executed.
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
transform.debug.emit_remark_at %arg0, "some other text that is not printed" : !transform.any_op
+ transform.yield
}
module {
diff --git a/mlir/test/Dialect/Transform/test-interpreter-external-concurrent.mlir b/mlir/test/Dialect/Transform/test-interpreter-external-concurrent.mlir
index 59c2b672a6e6..9884102c6c0f 100644
--- a/mlir/test/Dialect/Transform/test-interpreter-external-concurrent.mlir
+++ b/mlir/test/Dialect/Transform/test-interpreter-external-concurrent.mlir
@@ -1,4 +1,6 @@
-// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(test-transform-dialect-interpreter{transform-file-name=%p%{fs-sep}include%{fs-sep}test-interpreter-external-concurrent-source.mlir}))" \
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(\
+// RUN: transform-preload-library{transform-library-paths=%p%{fs-sep}include%{fs-sep}test-interpreter-external-concurrent-source.mlir},\
+// RUN: func.func(transform-interpreter))" \
// RUN: --verify-diagnostics
// Exercising the pass on multiple functions of different lengths that may be
diff --git a/mlir/test/Dialect/Transform/test-interpreter-external.mlir b/mlir/test/Dialect/Transform/test-interpreter-external.mlir
index ba8e0c6870db..599ce05fcc40 100644
--- a/mlir/test/Dialect/Transform/test-interpreter-external.mlir
+++ b/mlir/test/Dialect/Transform/test-interpreter-external.mlir
@@ -1,4 +1,6 @@
-// RUN: mlir-opt %s --pass-pipeline="builtin.module(test-transform-dialect-interpreter{transform-file-name=%p%{fs-sep}include%{fs-sep}test-interpreter-external-source.mlir})" \
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(\
+// RUN: transform-preload-library{transform-library-paths=%p%{fs-sep}include%{fs-sep}test-interpreter-external-source.mlir},\
+// RUN: transform-interpreter)" \
// RUN: --verify-diagnostics
// The schedule in the separate file emits remarks at the payload root.
diff --git a/mlir/test/Dialect/Transform/test-interpreter.mlir b/mlir/test/Dialect/Transform/test-interpreter.mlir
index de5807b2874b..b6850e2024d5 100644
--- a/mlir/test/Dialect/Transform/test-interpreter.mlir
+++ b/mlir/test/Dialect/Transform/test-interpreter.mlir
@@ -1411,7 +1411,6 @@ module attributes {transform.with_named_sequence} {
// -----
// expected-error @below {{could not find a nested named sequence with name: __transform_main}}
-// expected-error @below {{could not find transform entry point: __transform_main in either payload or transform module}}
module {
}
diff --git a/mlir/test/Dialect/Transform/test-pass-application.mlir b/mlir/test/Dialect/Transform/test-pass-application.mlir
index 65625457c868..7cb5387b937d 100644
--- a/mlir/test/Dialect/Transform/test-pass-application.mlir
+++ b/mlir/test/Dialect/Transform/test-pass-application.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --test-transform-dialect-interpreter -allow-unregistered-dialect --split-input-file --verify-diagnostics | FileCheck %s
+// RUN: mlir-opt %s --transform-interpreter -allow-unregistered-dialect --split-input-file --verify-diagnostics | FileCheck %s
// CHECK-LABEL: func @successful_pass_application(
// CHECK: %[[c5:.*]] = arith.constant 5 : index
@@ -9,10 +9,12 @@ func.func @successful_pass_application(%t: tensor<5xf32>) -> index {
return %dim : index
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.apply_registered_pass "canonicalize" to %1 : (!transform.any_op) -> !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+ %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.apply_registered_pass "canonicalize" to %1 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
}
// -----
@@ -22,12 +24,14 @@ func.func @pass_pipeline() {
return
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- // This pipeline does not do anything. Just make sure that the pipeline is
- // found and no error is produced.
- transform.apply_registered_pass "test-options-pass-pipeline" to %1 : (!transform.any_op) -> !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+ %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ // This pipeline does not do anything. Just make sure that the pipeline is
+ // found and no error is produced.
+ transform.apply_registered_pass "test-options-pass-pipeline" to %1 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
}
// -----
@@ -36,11 +40,13 @@ func.func @invalid_pass_name() {
return
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- // expected-error @below {{unknown pass or pass pipeline: non-existing-pass}}
- transform.apply_registered_pass "non-existing-pass" to %1 : (!transform.any_op) -> !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+ %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ // expected-error @below {{unknown pass or pass pipeline: non-existing-pass}}
+ transform.apply_registered_pass "non-existing-pass" to %1 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
}
// -----
@@ -53,11 +59,13 @@ func.func @not_isolated_from_above(%t: tensor<5xf32>) -> index {
return %dim : index
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %1 = transform.structured.match ops{["tensor.dim"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- // expected-error @below {{pass pipeline failed}}
- transform.apply_registered_pass "canonicalize" to %1 : (!transform.any_op) -> !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+ %1 = transform.structured.match ops{["tensor.dim"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ // expected-error @below {{pass pipeline failed}}
+ transform.apply_registered_pass "canonicalize" to %1 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
}
// -----
@@ -66,11 +74,13 @@ func.func @invalid_pass_option() {
return
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- // expected-error @below {{failed to add pass or pass pipeline to pipeline: canonicalize}}
- transform.apply_registered_pass "canonicalize" to %1 {options = "invalid-option=1"} : (!transform.any_op) -> !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+ %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ // expected-error @below {{failed to add pass or pass pipeline to pipeline: canonicalize}}
+ transform.apply_registered_pass "canonicalize" to %1 {options = "invalid-option=1"} : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
}
// -----
@@ -80,27 +90,29 @@ func.func @valid_pass_option() {
return
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.apply_registered_pass "canonicalize" to %1 {options = "top-down=false"} : (!transform.any_op) -> !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+ %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.apply_registered_pass "canonicalize" to %1 {options = "top-down=false"} : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
}
// -----
-module {
+module attributes {transform.with_named_sequence} {
// expected-error @below {{trying to schedule a pass on an unsupported operation}}
// expected-note @below {{target op}}
func.func @invalid_target_op_type() {
return
}
- transform.sequence failures(propagate) {
- ^bb1(%arg1: !transform.any_op):
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
%1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
// func-bufferize can be applied only to ModuleOps.
// expected-error @below {{pass pipeline failed}}
transform.apply_registered_pass "func-bufferize" to %1 : (!transform.any_op) -> !transform.any_op
+ transform.yield
}
}
diff --git a/mlir/test/Dialect/Transform/test-pattern-application.mlir b/mlir/test/Dialect/Transform/test-pattern-application.mlir
index 10cd9ef351fe..0c41e81b17b5 100644
--- a/mlir/test/Dialect/Transform/test-pattern-application.mlir
+++ b/mlir/test/Dialect/Transform/test-pattern-application.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --test-transform-dialect-interpreter -allow-unregistered-dialect --split-input-file --verify-diagnostics | FileCheck %s
+// RUN: mlir-opt %s --transform-interpreter -allow-unregistered-dialect --split-input-file --verify-diagnostics | FileCheck %s
// CHECK-LABEL: func @update_tracked_op_mapping()
// CHECK: "test.container"() ({
@@ -11,15 +11,17 @@ func.func @update_tracked_op_mapping() {
return
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %0 {
- transform.apply_patterns.transform.test_patterns
- } : !transform.any_op
- // Add an attribute to %1, which is now mapped to a new op.
- transform.annotate %1 "annotated" : !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+ %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %0 {
+ transform.apply_patterns.transform.test_patterns
+ } : !transform.any_op
+ // Add an attribute to %1, which is now mapped to a new op.
+ transform.annotate %1 "annotated" : !transform.any_op
+ transform.yield
+ }
}
// -----
@@ -33,19 +35,21 @@ func.func @replacement_op_not_found() {
return
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- // expected-note @below {{replacement is required because this handle must be updated}}
- %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- // expected-error @below {{tracking listener failed to find replacement op during application of this transform op}}
- // expected-note @below {{ran out of suitable replacement values}}
- transform.apply_patterns to %0 {
- transform.apply_patterns.transform.test_patterns
- } : !transform.any_op
- // %1 must be used in some way. If no replacement payload op could be found,
- // an error is thrown only if the handle is not dead.
- transform.annotate %1 "annotated" : !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+ %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ // expected-note @below {{replacement is required because this handle must be updated}}
+ %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ // expected-error @below {{tracking listener failed to find replacement op during application of this transform op}}
+ // expected-note @below {{ran out of suitable replacement values}}
+ transform.apply_patterns to %0 {
+ transform.apply_patterns.transform.test_patterns
+ } : !transform.any_op
+ // %1 must be used in some way. If no replacement payload op could be found,
+ // an error is thrown only if the handle is not dead.
+ transform.annotate %1 "annotated" : !transform.any_op
+ transform.yield
+ }
}
// -----
@@ -61,14 +65,16 @@ func.func @replacement_op_for_dead_handle_not_found() {
return
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- // No error because %1 is dead.
- transform.apply_patterns to %0 {
- transform.apply_patterns.transform.test_patterns
- } : !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+ %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ // No error because %1 is dead.
+ transform.apply_patterns to %0 {
+ transform.apply_patterns.transform.test_patterns
+ } : !transform.any_op
+ transform.yield
+ }
}
// -----
@@ -84,14 +90,16 @@ func.func @replacement_op_not_found_silenced() {
return
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %0 {
- transform.apply_patterns.transform.test_patterns
- } {transform.silence_tracking_failures} : !transform.any_op
- transform.annotate %1 "annotated" : !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+ %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %0 {
+ transform.apply_patterns.transform.test_patterns
+ } {transform.silence_tracking_failures} : !transform.any_op
+ transform.annotate %1 "annotated" : !transform.any_op
+ transform.yield
+ }
}
// -----
@@ -103,12 +111,14 @@ func.func @patterns_apply_only_to_target_body() {
return
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-%0 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %0 {
- transform.apply_patterns.transform.test_patterns
- } : !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+ %0 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %0 {
+ transform.apply_patterns.transform.test_patterns
+ } : !transform.any_op
+ transform.yield
+ }
}
// -----
@@ -125,16 +135,18 @@ func.func @erase_tracked_op() {
return
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %1 = transform.structured.match ops{["test.erase_op"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.debug.emit_remark_at %1, "matched op" : !transform.any_op
- transform.apply_patterns to %0 {
- transform.apply_patterns.transform.test_patterns
- } : !transform.any_op
- // No marker should be printed.
- transform.debug.emit_remark_at %1, "op was deleted" : !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+ %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.structured.match ops{["test.erase_op"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.debug.emit_remark_at %1, "matched op" : !transform.any_op
+ transform.apply_patterns to %0 {
+ transform.apply_patterns.transform.test_patterns
+ } : !transform.any_op
+ // No marker should be printed.
+ transform.debug.emit_remark_at %1, "op was deleted" : !transform.any_op
+ transform.yield
+ }
}
// -----
@@ -143,7 +155,7 @@ transform.sequence failures(propagate) {
// CHECK: "test.container"() ({
// CHECK-NEXT: ^bb0:
// CHECK-NEXT: }) : () -> ()
-module {
+module attributes {transform.with_named_sequence} {
func.func @erase_tracked_op_in_named_sequence() {
"test.container"() ({
// expected-remark @below {{matched op}}
@@ -152,23 +164,21 @@ module {
return
}
- module attributes { transform.with_named_sequence } {
- transform.named_sequence @foo(%arg0: !transform.any_op {transform.readonly}) -> () {
- transform.apply_patterns to %arg0 {
- transform.apply_patterns.transform.test_patterns
- } : !transform.any_op
- transform.yield
- }
+ transform.named_sequence @foo(%arg0: !transform.any_op {transform.readonly}) -> () {
+ transform.apply_patterns to %arg0 {
+ transform.apply_patterns.transform.test_patterns
+ } : !transform.any_op
+ transform.yield
+ }
- transform.sequence failures(propagate) {
- ^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %1 = transform.structured.match ops{["test.erase_op"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.debug.emit_remark_at %1, "matched op" : !transform.any_op
- include @foo failures(propagate) (%0) : (!transform.any_op) -> ()
- // No marker should be printed.
- transform.debug.emit_remark_at %1, "op was deleted" : !transform.any_op
- }
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+ %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.structured.match ops{["test.erase_op"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.debug.emit_remark_at %1, "matched op" : !transform.any_op
+ transform.include @foo failures(propagate) (%0) : (!transform.any_op) -> ()
+ // No marker should be printed.
+ transform.debug.emit_remark_at %1, "op was deleted" : !transform.any_op
+ transform.yield
}
}
@@ -183,13 +193,15 @@ func.func @canonicalization(%t: tensor<5xf32>) -> index {
return %dim : index
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["tensor.dim"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %1 {
- transform.apply_patterns.canonicalization
- } : !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+ %0 = transform.structured.match ops{["tensor.dim"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %1 {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.yield
+ }
}
// -----
@@ -200,13 +212,13 @@ module {
return
}
- module {
- transform.sequence failures(propagate) {
- ^bb1(%arg1: !transform.any_op):
+ module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
// expected-error @below {{cannot apply transform to itself (or one of its ancestors)}}
transform.apply_patterns to %arg1 {
transform.apply_patterns.canonicalization
} : !transform.any_op
+ transform.yield
}
}
}
@@ -224,12 +236,14 @@ func.func @canonicalization_and_cse(%m: memref<5xf32>) {
return
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %1 {
- transform.apply_patterns.canonicalization
- } {apply_cse} : !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+ %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %1 {
+ transform.apply_patterns.canonicalization
+ } {apply_cse} : !transform.any_op
+ transform.yield
+ }
}
// -----
@@ -243,15 +257,17 @@ func.func @full_dialect_conversion() -> tensor<5xf32> {
return %0 : tensor<5xf32>
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.apply_conversion_patterns to %0 {
- transform.apply_conversion_patterns.transform.test_conversion_patterns
- } with type_converter {
- transform.apply_conversion_patterns.transform.test_type_converter
- } {legal_ops = ["func.func", "func.return", "test.new_op"]}
- : !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.apply_conversion_patterns to %0 {
+ transform.apply_conversion_patterns.transform.test_conversion_patterns
+ } with type_converter {
+ transform.apply_conversion_patterns.transform.test_type_converter
+ } {legal_ops = ["func.func", "func.return", "test.new_op"]}
+ : !transform.any_op
+ transform.yield
+ }
}
// -----
@@ -266,16 +282,18 @@ func.func @full_dialect_conversion_failed() -> tensor<5xf32> {
return %0 : tensor<5xf32>
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- // expected-error @below{{dialect conversion failed}}
- transform.apply_conversion_patterns to %0 {
- transform.apply_conversion_patterns.transform.test_conversion_patterns
- } with type_converter {
- transform.apply_conversion_patterns.transform.test_type_converter
- } {legal_ops = ["func.func", "func.return", "test.new_op"]}
- : !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ // expected-error @below{{dialect conversion failed}}
+ transform.apply_conversion_patterns to %0 {
+ transform.apply_conversion_patterns.transform.test_conversion_patterns
+ } with type_converter {
+ transform.apply_conversion_patterns.transform.test_type_converter
+ } {legal_ops = ["func.func", "func.return", "test.new_op"]}
+ : !transform.any_op
+ transform.yield
+ }
}
// -----
@@ -294,98 +312,108 @@ func.func @partial_dialect_conversion() -> tensor<5xf32> {
return %0 : tensor<5xf32>
}
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.apply_conversion_patterns to %0 {
- transform.apply_conversion_patterns.transform.test_conversion_patterns
- } with type_converter {
- transform.apply_conversion_patterns.transform.test_type_converter
- } {legal_ops = ["func.func", "func.return", "test.new_op"],
- partial_conversion} : !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.apply_conversion_patterns to %0 {
+ transform.apply_conversion_patterns.transform.test_conversion_patterns
+ } with type_converter {
+ transform.apply_conversion_patterns.transform.test_type_converter
+ } {legal_ops = ["func.func", "func.return", "test.new_op"],
+ partial_conversion} : !transform.any_op
+ transform.yield
+ }
}
// -----
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- // expected-error @below{{pattern descriptor does not specify type converter and apply_conversion_patterns op has no default type converter}}
- transform.apply_conversion_patterns to %0 {
- // expected-note @below{{pattern descriptor op}}
- transform.apply_conversion_patterns.transform.test_conversion_patterns
- } {illegal_ops = ["test.foo"]} : !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ // expected-error @below{{pattern descriptor does not specify type converter and apply_conversion_patterns op has no default type converter}}
+ transform.apply_conversion_patterns to %0 {
+ // expected-note @below{{pattern descriptor op}}
+ transform.apply_conversion_patterns.transform.test_conversion_patterns
+ } {illegal_ops = ["test.foo"]} : !transform.any_op
+ transform.yield
+ }
}
// -----
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.apply_conversion_patterns to %0 {
- // expected-error @below{{expected LLVMTypeConverter}}
- transform.apply_conversion_patterns.dialect_to_llvm "test"
- } with type_converter {
- transform.apply_conversion_patterns.transform.test_type_converter
- } {illegal_ops = ["test.foo"],
- legal_ops = ["func.func", "func.return", "test.new_op"]}
- : !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.apply_conversion_patterns to %0 {
+ // expected-error @below{{expected LLVMTypeConverter}}
+ transform.apply_conversion_patterns.dialect_to_llvm "test"
+ } with type_converter {
+ transform.apply_conversion_patterns.transform.test_type_converter
+ } {illegal_ops = ["test.foo"],
+ legal_ops = ["func.func", "func.return", "test.new_op"]}
+ : !transform.any_op
+ transform.yield
+ }
}
// -----
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.apply_conversion_patterns to %0 {
- // expected-error @below{{unknown dialect or dialect not loaded: this_dialect_does_not_exist}}
- transform.apply_conversion_patterns.dialect_to_llvm "this_dialect_does_not_exist"
- } with type_converter {
- transform.apply_conversion_patterns.memref.memref_to_llvm_type_converter
- } {illegal_ops = ["test.foo"],
- legal_ops = ["func.func", "func.return", "test.new_op"]}
- : !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.apply_conversion_patterns to %0 {
+ // expected-error @below{{unknown dialect or dialect not loaded: this_dialect_does_not_exist}}
+ transform.apply_conversion_patterns.dialect_to_llvm "this_dialect_does_not_exist"
+ } with type_converter {
+ transform.apply_conversion_patterns.memref.memref_to_llvm_type_converter
+ } {illegal_ops = ["test.foo"],
+ legal_ops = ["func.func", "func.return", "test.new_op"]}
+ : !transform.any_op
+ transform.yield
+ }
}
// -----
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.apply_conversion_patterns to %0 {
- // expected-error @below{{dialect does not implement ConvertToLLVMPatternInterface or extension was not loaded: transform}}
- transform.apply_conversion_patterns.dialect_to_llvm "transform"
- } with type_converter {
- transform.apply_conversion_patterns.memref.memref_to_llvm_type_converter
- } {illegal_ops = ["test.foo"],
- legal_ops = ["func.func", "func.return", "test.new_op"]}
- : !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.apply_conversion_patterns to %0 {
+ // expected-error @below{{dialect does not implement ConvertToLLVMPatternInterface or extension was not loaded: transform}}
+ transform.apply_conversion_patterns.dialect_to_llvm "transform"
+ } with type_converter {
+ transform.apply_conversion_patterns.memref.memref_to_llvm_type_converter
+ } {illegal_ops = ["test.foo"],
+ legal_ops = ["func.func", "func.return", "test.new_op"]}
+ : !transform.any_op
+ transform.yield
+ }
}
// -----
module attributes { transform.with_named_sequence } {
-func.func @replacement_op_not_found() {
- // No op replacement can be found, but there are no handles that must be
- // updated. No error should be reported.
- "test.container"() ({
- %0 = "test.foo"() {replace_with_new_op = "test.bar"} : () -> (i32)
- }) : () -> ()
- return
-}
+ func.func @replacement_op_not_found() {
+ // No op replacement can be found, but there are no handles that must be
+ // updated. No error should be reported.
+ "test.container"() ({
+ %0 = "test.foo"() {replace_with_new_op = "test.bar"} : () -> (i32)
+ }) : () -> ()
+ return
+ }
-transform.named_sequence @patterns(%container: !transform.any_op {transform.readonly}) {
- transform.apply_patterns to %container {
- transform.apply_patterns.transform.test_patterns
- } : !transform.any_op
- transform.yield
-}
+ transform.named_sequence @patterns(%container: !transform.any_op {transform.readonly}) {
+ transform.apply_patterns to %container {
+ transform.apply_patterns.transform.test_patterns
+ } : !transform.any_op
+ transform.yield
+ }
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
- %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.annotate %1 "annotated" : !transform.any_op
- transform.include @patterns failures(propagate) (%0) : (!transform.any_op) -> ()
-}
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+ %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.annotate %1 "annotated" : !transform.any_op
+ transform.include @patterns failures(propagate) (%0) : (!transform.any_op) -> ()
+ transform.yield
+ }
}
diff --git a/mlir/test/Dialect/Transform/test-pdl-extension.mlir b/mlir/test/Dialect/Transform/test-pdl-extension.mlir
index a9710f755312..a3349c1ba505 100644
--- a/mlir/test/Dialect/Transform/test-pdl-extension.mlir
+++ b/mlir/test/Dialect/Transform/test-pdl-extension.mlir
@@ -1,21 +1,26 @@
-// RUN: mlir-opt %s --test-transform-dialect-interpreter -allow-unregistered-dialect --split-input-file --verify-diagnostics
-
-transform.with_pdl_patterns {
-^bb0(%arg0: !transform.any_op):
- sequence %arg0 : !transform.any_op failures(propagate) {
- ^bb0(%arg1: !transform.any_op):
- %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op
- transform.debug.emit_remark_at %0, "matched" : !transform.any_op
- }
-
- pdl.pattern @some : benefit(1) {
- %0 = pdl.operation "test.some_op"
- pdl.rewrite %0 with "transform.dialect"
- }
-
- pdl.pattern @other : benefit(1) {
- %0 = pdl.operation "test.other_op"
- pdl.rewrite %0 with "transform.dialect"
+// RUN: mlir-opt %s --transform-interpreter -allow-unregistered-dialect --split-input-file --verify-diagnostics
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%root: !transform.any_op) {
+ transform.with_pdl_patterns %root : !transform.any_op {
+ ^bb0(%arg0: !transform.any_op):
+ sequence %arg0 : !transform.any_op failures(propagate) {
+ ^bb0(%arg1: !transform.any_op):
+ %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.debug.emit_remark_at %0, "matched" : !transform.any_op
+ }
+
+ pdl.pattern @some : benefit(1) {
+ %0 = pdl.operation "test.some_op"
+ pdl.rewrite %0 with "transform.dialect"
+ }
+
+ pdl.pattern @other : benefit(1) {
+ %0 = pdl.operation "test.other_op"
+ pdl.rewrite %0 with "transform.dialect"
+ }
+ }
+ transform.yield
}
}
@@ -28,17 +33,22 @@ transform.with_pdl_patterns {
// -----
-transform.with_pdl_patterns {
-^bb0(%arg0: !transform.any_op):
- sequence %arg0 : !transform.any_op failures(propagate) {
- ^bb1(%arg1: !transform.any_op):
- %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op
- }
-
- pdl.pattern @some : benefit(1) {
- %0 = pdl.operation "test.some_op"
- pdl.apply_native_constraint "verbose_constraint"(%0 : !pdl.operation)
- pdl.rewrite %0 with "transform.dialect"
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%root: !transform.any_op) {
+ transform.with_pdl_patterns %root : !transform.any_op {
+ ^bb0(%arg0: !transform.any_op):
+ sequence %arg0 : !transform.any_op failures(propagate) {
+ ^bb1(%arg1: !transform.any_op):
+ %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op
+ }
+
+ pdl.pattern @some : benefit(1) {
+ %0 = pdl.operation "test.some_op"
+ pdl.apply_native_constraint "verbose_constraint"(%0 : !pdl.operation)
+ pdl.rewrite %0 with "transform.dialect"
+ }
+ }
+ transform.yield
}
}
diff --git a/mlir/test/Dialect/Transform/transform-state-extension.mlir b/mlir/test/Dialect/Transform/transform-state-extension.mlir
index a26293fbe51c..e8c0b7a8a3aa 100644
--- a/mlir/test/Dialect/Transform/transform-state-extension.mlir
+++ b/mlir/test/Dialect/Transform/transform-state-extension.mlir
@@ -1,89 +1,95 @@
-// RUN: mlir-opt %s -test-transform-dialect-interpreter -verify-diagnostics -split-input-file
+// RUN: mlir-opt %s -transform-interpreter -verify-diagnostics -split-input-file
// expected-note @below {{associated payload op}}
-module {
- transform.sequence failures(propagate) {
- ^bb0(%arg0: !transform.any_op):
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
// expected-remark @below {{extension absent}}
- test_check_if_test_extension_present %arg0 : !transform.any_op
- test_add_test_extension "A"
+ transform.test_check_if_test_extension_present %arg0 : !transform.any_op
+ transform.test_add_test_extension "A"
// expected-remark @below {{extension present, A}}
- test_check_if_test_extension_present %arg0 : !transform.any_op
- test_remove_test_extension
+ transform.test_check_if_test_extension_present %arg0 : !transform.any_op
+ transform.test_remove_test_extension
// expected-remark @below {{extension absent}}
- test_check_if_test_extension_present %arg0 : !transform.any_op
+ transform.test_check_if_test_extension_present %arg0 : !transform.any_op
+ transform.yield
}
}
// -----
// expected-note @below {{associated payload op}}
-module {
- transform.sequence failures(propagate) {
- ^bb0(%arg0: !transform.any_op):
- test_add_test_extension "A"
- test_remove_test_extension
- test_add_test_extension "B"
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
+ transform.test_add_test_extension "A"
+ transform.test_remove_test_extension
+ transform.test_add_test_extension "B"
// expected-remark @below {{extension present, B}}
- test_check_if_test_extension_present %arg0 : !transform.any_op
+ transform.test_check_if_test_extension_present %arg0 : !transform.any_op
+ transform.yield
}
}
// -----
// expected-note @below {{associated payload op}}
-module {
- transform.sequence failures(propagate) {
- ^bb0(%arg0: !transform.any_op):
- test_add_test_extension "A"
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
+ transform.test_add_test_extension "A"
// expected-remark @below {{extension present, A}}
- test_check_if_test_extension_present %arg0 : !transform.any_op
+ transform.test_check_if_test_extension_present %arg0 : !transform.any_op
// expected-note @below {{associated payload op}}
- test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op
// expected-remark @below {{extension present, A}}
- test_check_if_test_extension_present %arg0 : !transform.any_op
+ transform.test_check_if_test_extension_present %arg0 : !transform.any_op
+ transform.yield
}
}
// -----
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
- test_add_test_extension "A"
- // This is okay because we are replacing the top-level module operation
- // (0 results) with this operation that has _more_ (1) results.
- %dummy = test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
+ transform.test_add_test_extension "A"
+ // This is okay because we are replacing the top-level module operation
+ // (0 results) with this operation that has _more_ (1) results.
+ %dummy = transform.test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
}
// -----
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
- test_add_test_extension "A"
- %dummy = test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op
- // This is still okay. Even though we are replacing the previous
- // operation with (1 result) with this operation that has less (0) results,
- // there is no handle to the result, hence no issue with value handle update.
- test_remap_operand_to_self %dummy : (!transform.any_op) -> !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
+ transform.test_add_test_extension "A"
+ %dummy = transform.test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op
+ // This is still okay. Even though we are replacing the previous
+ // operation with (1 result) with this operation that has less (0) results,
+ // there is no handle to the result, hence no issue with value handle update.
+ transform.test_remap_operand_to_self %dummy : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
}
// -----
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
- test_add_test_extension "A"
- // expected-error @below {{cannot replace an op with another op producing fewer results while tracking handles}}
- %dummy = test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op
- %valuehandle = transform.get_result %dummy[0] : (!transform.any_op) -> !transform.any_value
- test_remap_operand_to_self %dummy : (!transform.any_op) -> ()
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
+ transform.test_add_test_extension "A"
+ // expected-error @below {{cannot replace an op with another op producing fewer results while tracking handles}}
+ %dummy = transform.test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op
+ %valuehandle = transform.get_result %dummy[0] : (!transform.any_op) -> !transform.any_value
+ transform.test_remap_operand_to_self %dummy : (!transform.any_op) -> ()
+ transform.yield
+ }
}
// -----
-module {
- transform.sequence failures(suppress) {
- ^bb0(%arg0: !transform.any_op):
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
// expected-error @below {{TestTransformStateExtension missing}}
- test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.yield
}
}
diff --git a/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir b/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir
index 94e78ce40a3c..8f0148119806 100644
--- a/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir
+++ b/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir
@@ -262,6 +262,48 @@ func.func @aligned_sitofp_2d(%a: vector<8x32xi4>) -> vector<8x32xf32> {
return %0 : vector<8x32xf32>
}
+// CHECK-LABEL: func.func @aligned_trunci(
+func.func @aligned_trunci(%a: vector<8xi32>) -> vector<8xi4> {
+// CHECK-SAME: %[[IN:.*]]: vector<8xi32>) -> vector<8xi4> {
+// CHECK-DAG: %[[LOW_MASK:.*]] = arith.constant dense<15> : vector<4xi8>
+// CHECK-DAG: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8>
+// CHECK: %[[I8:.*]] = arith.trunci %[[IN]] : vector<8xi32> to vector<8xi8>
+// CHECK: %[[LOW:.*]] = vector.shuffle %[[I8]], %[[I8]] [0, 2, 4, 6] : vector<8xi8>, vector<8xi8>
+// CHECK: %[[HIGH:.*]] = vector.shuffle %[[I8]], %[[I8]] [1, 3, 5, 7] : vector<8xi8>, vector<8xi8>
+// CHECK: %[[ZEROED_LOW:.*]] = arith.andi %[[LOW]], %[[LOW_MASK]] : vector<4xi8>
+// CHECK: %[[SHL_HIGH:.*]] = arith.shli %[[HIGH]], %[[I4_BITS]] : vector<4xi8>
+// CHECK: %[[MERGED:.*]] = arith.ori %[[ZEROED_LOW]], %[[SHL_HIGH]] : vector<4xi8>
+// CHECK: %[[I4:.*]] = vector.bitcast %[[MERGED]] : vector<4xi8> to vector<8xi4>
+ %0 = arith.trunci %a : vector<8xi32> to vector<8xi4>
+ return %0 : vector<8xi4>
+}
+
+// CHECK-LABEL: func.func @aligned_trunci_base_case(
+func.func @aligned_trunci_base_case(%a: vector<8xi8>) -> vector<8xi4> {
+// CHECK-SAME: %[[IN:.*]]: vector<8xi8>) -> vector<8xi4> {
+// CHECK-DAG: %[[LOW_MASK:.*]] = arith.constant dense<15> : vector<4xi8>
+// CHECK-DAG: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8>
+// CHECK: %[[LOW:.*]] = vector.shuffle %[[IN]], %[[IN]] [0, 2, 4, 6] : vector<8xi8>, vector<8xi8>
+// CHECK: %[[HIGH:.*]] = vector.shuffle %[[IN]], %[[IN]] [1, 3, 5, 7] : vector<8xi8>, vector<8xi8>
+// CHECK: %[[ZEROED_LOW:.*]] = arith.andi %[[LOW]], %[[LOW_MASK]] : vector<4xi8>
+// CHECK: %[[SHL_HIGH:.*]] = arith.shli %[[HIGH]], %[[I4_BITS]] : vector<4xi8>
+// CHECK: %[[MERGED:.*]] = arith.ori %[[ZEROED_LOW]], %[[SHL_HIGH]] : vector<4xi8>
+// CHECK: %[[I4:.*]] = vector.bitcast %[[MERGED]] : vector<4xi8> to vector<8xi4>
+ %0 = arith.trunci %a : vector<8xi8> to vector<8xi4>
+ return %0 : vector<8xi4>
+}
+
+// CHECK-LABEL: func.func @aligned_trunci_2d(
+func.func @aligned_trunci_2d(%a: vector<8x32xi32>) -> vector<8x32xi4> {
+// CHECK-NOT: vector.shuffle
+// CHECK-NOT: vector.andi
+// CHECK-NOT: vector.shli
+// CHECK-NOT: vector.ori
+// CHECK: arith.trunci
+ %0 = arith.trunci %a : vector<8x32xi32> to vector<8x32xi4>
+ return %0 : vector<8x32xi4>
+}
+
// CHECK-LABEL: func.func @i4_transpose(
func.func @i4_transpose(%a: vector<8x16xi4>) -> vector<16x8xi4> {
// CHECK-SAME: %[[IN:.*]]: vector<8x16xi4>) -> vector<16x8xi4> {
diff --git a/mlir/test/Dialect/Vector/vector-transforms.mlir b/mlir/test/Dialect/Vector/vector-transforms.mlir
index ea10bd56390c..eda6a5cc40d9 100644
--- a/mlir/test/Dialect/Vector/vector-transforms.mlir
+++ b/mlir/test/Dialect/Vector/vector-transforms.mlir
@@ -339,6 +339,51 @@ func.func @bubble_down_bitcast_in_strided_slice_extract_odd_size(%arg0: vector<4
return %0: vector<3xf16>
}
+// CHECK-LABEL: func.func @bubble_up_bitcast_in_insert_i4_i8(
+// CHECK-SAME: %[[VAL:.*]]: vector<32xi4>,
+// CHECK-SAME: %[[DST:.*]]: vector<8x32xi4>) -> vector<8x16xi8> {
+func.func @bubble_up_bitcast_in_insert_i4_i8(%val: vector<32xi4>, %src: vector<8x32xi4>) -> vector<8x16xi8> {
+// CHECK: %[[BC_VAL:.*]] = vector.bitcast %[[VAL]] : vector<32xi4> to vector<16xi8>
+// CHECK: %[[BC_DST:.*]] = vector.bitcast %[[DST]] : vector<8x32xi4> to vector<8x16xi8>
+// CHECK: vector.insert %[[BC_VAL]], %[[BC_DST]] [4] : vector<16xi8> into vector<8x16xi8>
+ %0 = vector.insert %val, %src[4] : vector<32xi4> into vector<8x32xi4>
+ %1 = vector.bitcast %0 : vector<8x32xi4> to vector<8x16xi8>
+ return %1 : vector<8x16xi8>
+}
+
+// CHECK-LABEL: func.func @bubble_up_bitcast_in_insert_i8_i4(
+// CHECK-SAME: %[[VAL:.*]]: vector<16xi8>,
+// CHECK-SAME: %[[DST:.*]]: vector<8x16xi8>) -> vector<8x32xi4> {
+func.func @bubble_up_bitcast_in_insert_i8_i4(%val: vector<16xi8>, %src: vector<8x16xi8>) -> vector<8x32xi4> {
+// CHECK: %[[BC_VAL:.*]] = vector.bitcast %[[VAL]] : vector<16xi8> to vector<32xi4>
+// CHECK: %[[BC_DST:.*]] = vector.bitcast %[[DST]] : vector<8x16xi8> to vector<8x32xi4>
+// CHECK: vector.insert %[[BC_VAL]], %[[BC_DST]] [4] : vector<32xi4> into vector<8x32xi4>
+ %0 = vector.insert %val, %src[4] : vector<16xi8> into vector<8x16xi8>
+ %1 = vector.bitcast %0 : vector<8x16xi8> to vector<8x32xi4>
+ return %1 : vector<8x32xi4>
+}
+
+// CHECK-LABEL: func.func @bubble_up_bitcast_in_insert_i32_f32(
+// CHECK-SAME: %[[VAL:.*]]: vector<16xi32>,
+// CHECK-SAME: %[[DST:.*]]: vector<8x16xi32>) -> vector<8x16xf32> {
+func.func @bubble_up_bitcast_in_insert_i32_f32(%val: vector<16xi32>, %src: vector<8x16xi32>) -> vector<8x16xf32> {
+// CHECK: %[[BC_VAL:.*]] = vector.bitcast %[[VAL]] : vector<16xi32> to vector<16xf32>
+// CHECK: %[[BC_DST:.*]] = vector.bitcast %[[DST]] : vector<8x16xi32> to vector<8x16xf32>
+// CHECK: vector.insert %[[BC_VAL]], %[[BC_DST]] [4] : vector<16xf32> into vector<8x16xf32>
+ %0 = vector.insert %val, %src[4] : vector<16xi32> into vector<8x16xi32>
+ %1 = vector.bitcast %0 : vector<8x16xi32> to vector<8x16xf32>
+ return %1 : vector<8x16xf32>
+}
+
+// CHECK-LABEL: func.func @bubble_up_bitcast_in_insert_scalar(
+func.func @bubble_up_bitcast_in_insert_scalar(%val: i8, %src: vector<8x16xi8>) -> vector<8x32xi4> {
+// CHECK: vector.insert
+// CHECK-NEXT: vector.bitcast
+ %0 = vector.insert %val, %src[4, 8] : i8 into vector<8x16xi8>
+ %1 = vector.bitcast %0 : vector<8x16xi8> to vector<8x32xi4>
+ return %1 : vector<8x32xi4>
+}
+
// CHECK-LABEL: func @bubble_up_bitcast_in_strided_slice_insert
// CHECK-SAME: (%[[DST:.+]]: vector<8xf16>, %[[SRC1:.+]]: vector<4xf16>, %[[SRC2:.+]]: vector<4xf16>)
func.func @bubble_up_bitcast_in_strided_slice_insert(%dst: vector<8xf16>, %src1: vector<4xf16>, %src2: vector<4xf16>) -> vector<4xf32> {
diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
index 907260373487..bf90c4a6ebb3 100644
--- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
+++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
@@ -1559,3 +1559,28 @@ func.func @warp_propagate_multi_dim_create_mask(%laneid: index, %m0: index, %m1:
// CHECK-PROP: %[[DISTM0:.+]] = affine.apply #[[$SUBM0]]()[%[[M0]], %[[LANEID]]]
// CHECK-PROP: %[[DISTM1:.+]] = affine.apply #[[$SUBM1]]()[%[[M1]], %[[LANEID]]]
// CHECK-PROP: vector.create_mask %[[DISTM0]], %[[DISTM1]], %[[M2]] : vector<1x2x4xi1>
+
+// -----
+
+func.func @warp_propagate_nd_write(%laneid: index, %dest: memref<4x1024xf32>) {
+ %c0 = arith.constant 0 : index
+ vector.warp_execute_on_lane_0(%laneid)[32] -> () {
+ %0 = "some_def"() : () -> (vector<4x1024xf32>)
+ vector.transfer_write %0, %dest[%c0, %c0] : vector<4x1024xf32>, memref<4x1024xf32>
+ vector.yield
+ }
+ return
+}
+
+// CHECK-DIST-AND-PROP: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 128)>
+
+// CHECK-DIST-AND-PROP-LABEL: func.func @warp_propagate_nd_write(
+// CHECK-DIST-AND-PROP: %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1x128xf32>) {
+// CHECK-DIST-AND-PROP: %[[V0:.*]] = "some_def"
+// CHECK-DIST-AND-PROP: vector.yield %[[V0]]
+// CHECK-DIST-AND-PROP-SAME: vector<4x1024xf32>
+// CHECK-DIST-AND-PROP: }
+
+// CHECK-DIST-AND-PROP: %[[IDS:.+]]:2 = affine.delinearize_index %{{.*}} into (%c4, %c8) : index, index
+// CHECK-DIST-AND-PROP: %[[INNER_ID:.+]] = affine.apply #map()[%[[IDS]]#1]
+// CHECK-DIST-AND-PROP: vector.transfer_write %[[W]], %{{.*}}[%[[IDS]]#0, %[[INNER_ID]]] {{.*}} : vector<1x128xf32>
diff --git a/mlir/test/Examples/transform-opt/empty.mlir b/mlir/test/Examples/transform-opt/empty.mlir
new file mode 100644
index 000000000000..b525769db688
--- /dev/null
+++ b/mlir/test/Examples/transform-opt/empty.mlir
@@ -0,0 +1,12 @@
+// RUN: mlir-transform-opt %s --transform=%p/self-contained.mlir | FileCheck %s
+// RUN: mlir-transform-opt %s --transform=%p/external-decl.mlir --verify-diagnostics
+// RUN: mlir-transform-opt %s --transform=%p/external-def.mlir --transform-entry-point=external_def | FileCheck %s --check-prefix=EXTERNAL
+// RUN: mlir-transform-opt %s --transform=%p/external-decl.mlir --transform-library=%p/external-def.mlir | FileCheck %s --check-prefix=EXTERNAL
+// RUN: mlir-transform-opt %s --transform=%p/syntax-error.mlir --verify-diagnostics
+// RUN: mlir-transform-opt %s --transform=%p/self-contained.mlir --transform-library=%p/syntax-error.mlir --verify-diagnostics
+// RUN: mlir-transform-opt %s --transform=%p/self-contained.mlir --transform-library=%p/external-def.mlir --transform-library=%p/syntax-error.mlir --verify-diagnostics
+
+// CHECK: IR printer: in self-contained
+// EXTERNAL: IR printer: external_def
+// CHECK-NOT: @__transform_main
+module {}
diff --git a/mlir/test/Examples/transform-opt/external-decl.mlir b/mlir/test/Examples/transform-opt/external-decl.mlir
new file mode 100644
index 000000000000..5a7373589242
--- /dev/null
+++ b/mlir/test/Examples/transform-opt/external-decl.mlir
@@ -0,0 +1,18 @@
+// This test just needs to parse. Note that the diagnostic message below will
+// be produced in *another* multi-file test, do *not* -verify-diagnostics here.
+// RUN: mlir-opt %s
+
+// RUN: mlir-transform-opt %s --transform-library=%p/external-def.mlir | FileCheck %s
+
+module attributes {transform.with_named_sequence} {
+ // The definition should not be printed here.
+ // CHECK: @external_def
+ // CHECK-NOT: transform.print
+ transform.named_sequence private @external_def(%root: !transform.any_op {transform.readonly})
+
+ transform.named_sequence private @__transform_main(%root: !transform.any_op) {
+ // expected-error @below {{unresolved external named sequence}}
+ transform.include @external_def failures(propagate) (%root) : (!transform.any_op) -> ()
+ transform.yield
+ }
+}
diff --git a/mlir/test/Examples/transform-opt/external-def.mlir b/mlir/test/Examples/transform-opt/external-def.mlir
new file mode 100644
index 000000000000..9dc4fbbdd6b6
--- /dev/null
+++ b/mlir/test/Examples/transform-opt/external-def.mlir
@@ -0,0 +1,8 @@
+// RUN: mlir-opt %s
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @external_def(%root: !transform.any_op {transform.readonly}) {
+ transform.print %root { name = "external_def" } : !transform.any_op
+ transform.yield
+ }
+}
diff --git a/mlir/test/Examples/transform-opt/pass.mlir b/mlir/test/Examples/transform-opt/pass.mlir
new file mode 100644
index 000000000000..5c7c8bf1e256
--- /dev/null
+++ b/mlir/test/Examples/transform-opt/pass.mlir
@@ -0,0 +1,19 @@
+// RUN: mlir-transform-opt %s | FileCheck %s
+
+module attributes {transform.with_named_sequence} {
+ // CHECK-LABEL: @return_42
+ // CHECK: %[[C42:.+]] = arith.constant 42
+ // CHECK: return %[[C42]]
+ func.func @return_42() -> i32 {
+ %0 = arith.constant 21 : i32
+ %1 = arith.constant 2 : i32
+ %2 = arith.muli %0, %1 : i32
+ return %2 : i32
+ }
+
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
+ %arg1 = transform.apply_registered_pass "canonicalize" to %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.print %arg1 : !transform.any_op
+ transform.yield
+ }
+}
diff --git a/mlir/test/Examples/transform-opt/self-contained.mlir b/mlir/test/Examples/transform-opt/self-contained.mlir
new file mode 100644
index 000000000000..b9a93af61b8b
--- /dev/null
+++ b/mlir/test/Examples/transform-opt/self-contained.mlir
@@ -0,0 +1,21 @@
+// RUN: mlir-transform-opt %s | FileCheck %s
+// RUN: mlir-transform-opt %s --transform=%s | FileCheck %s
+// RUN: mlir-transform-opt %s --transform=%p/external-decl.mlir --verify-diagnostics
+// RUN: mlir-transform-opt %s --transform=%p/external-def.mlir --transform-entry-point=external_def | FileCheck %s --check-prefix=EXTERNAL
+// RUN: mlir-transform-opt %s --transform=%p/external-decl.mlir --transform-library=%p/external-def.mlir | FileCheck %s --check-prefix=EXTERNAL
+// RUN: mlir-transform-opt %s --transform=%p/syntax-error.mlir --verify-diagnostics
+
+// CHECK: IR printer: in self-contained
+// EXTERNAL: IR printer: external_def
+
+// The first occurrence comes from the print operation and the second is the
+// roundtrip output. However, we shouldn't have the symbol duplicated because
+// of library merging.
+// CHECK-COUNT-2: @__transform_main
+// CHECK-NOT: @__transform_main
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence private @__transform_main(%root: !transform.any_op) {
+ transform.print %root { name = "in self-contained" } : !transform.any_op
+ transform.yield
+ }
+}
diff --git a/mlir/test/Examples/transform-opt/syntax-error.mlir b/mlir/test/Examples/transform-opt/syntax-error.mlir
new file mode 100644
index 000000000000..89f1d472fe89
--- /dev/null
+++ b/mlir/test/Examples/transform-opt/syntax-error.mlir
@@ -0,0 +1,5 @@
+// RUN: mlir-opt %s --verify-diagnostics
+// This file is used as additional input.
+
+// expected-error @below {{expected operation name in quotes}}
+module {
diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-compare-results-i16.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-compare-results-i16.mlir
index 15bafeda6740..437e49a6b814 100644
--- a/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-compare-results-i16.mlir
+++ b/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-compare-results-i16.mlir
@@ -26,7 +26,7 @@ func.func @check_results(%lhs : i16, %rhs : i16, %res0 : i16, %res1 : i16) -> ()
%mismatch = arith.cmpi ne, %res0, %res1 : i16
scf.if %mismatch -> () {
vector.print %res1 : i16
- vector.print str "Mismatch"
+ vector.print str "Mismatch\n"
}
return
}
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir
index 12f13e8dbc4a..881e2799b5b0 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir
@@ -88,7 +88,7 @@ func.func @entry() {
}
// CHECK: SME: END OF TEST OUTPUT
- vector.print str "SME: END OF TEST OUTPUT"
+ vector.print str "SME: END OF TEST OUTPUT\n"
return
}
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir
index ee3866de303e..588b44a36c29 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir
@@ -24,23 +24,23 @@ func.func @use_too_many_tiles(%a: memref<?x?xi16>, %b: memref<?x?xi16>, %c: mem
// CHECK-LABEL: tile_a:
// CHECK-COUNT-8: ( 0, 0, 0, 0, 0, 0, 0, 0
- vector.print str "tile_a:"
+ vector.print str "tile_a:\n"
vector.print %tile_a : vector<[8]x[8]xi16>
// CHECK-LABEL: tile_b:
// CHECK-COUNT-8: ( 1, 1, 1, 1, 1, 1, 1, 1
- vector.print str "tile_b:"
+ vector.print str "tile_b:\n"
vector.print %tile_b : vector<[8]x[8]xi16>
// CHECK-LABEL: tile_c:
// CHECK-COUNT-8: ( 2, 2, 2, 2, 2, 2, 2, 2
- vector.print str "tile_c:"
+ vector.print str "tile_c:\n"
vector.print %tile_c : vector<[8]x[8]xi16>
// CHECK-LABEL: tile_d:
// CHECK-COUNT-8: ( 3, 3, 3, 3, 3, 3, 3, 3
- vector.print str "tile_d:"
+ vector.print str "tile_d:\n"
vector.print %tile_d : vector<[8]x[8]xi16>
// CHECK-LABEL: tile_e:
// CHECK-COUNT-8: ( 4, 4, 4, 4, 4, 4, 4, 4
- vector.print str "tile_e:"
+ vector.print str "tile_e:\n"
vector.print %tile_e : vector<[8]x[8]xi16>
return
}
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/Emulated/test-setArmSVLBits.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/Emulated/test-setArmSVLBits.mlir
index 415181171e27..1794564a6a72 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/Emulated/test-setArmSVLBits.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/Emulated/test-setArmSVLBits.mlir
@@ -12,13 +12,13 @@ func.func @checkSVL() {
%svl_h = arm_sme.streaming_vl <half>
%svl_w = arm_sme.streaming_vl <word>
%svl_d = arm_sme.streaming_vl <double>
- vector.print str "SVL.b"
+ vector.print str "SVL.b\n"
vector.print %svl_b : index
- vector.print str "SVL.h"
+ vector.print str "SVL.h\n"
vector.print %svl_h : index
- vector.print str "SVL.w"
+ vector.print str "SVL.w\n"
vector.print %svl_w : index
- vector.print str "SVL.d"
+ vector.print str "SVL.d\n"
vector.print %svl_d : index
return
}
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir
index 06b1c107cb2c..41e724844fe4 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir
@@ -7,14 +7,6 @@
// RUN: %{compile} | %{run} | FileCheck %s
-/// Note: The SME ST1Q/LD1Q instructions are currently broken in QEMU
-/// see: https://gitlab.com/qemu-project/qemu/-/issues/1833
-/// This test is expected to fail until a fixed version of QEMU can be used.
-
-/// FIXME: Remove the 'XFAIL' below once a fixed QEMU version is available
-/// (and installed on CI buildbot).
-// XFAIL: {{.*}}
-
func.func @print_i8s(%bytes: memref<?xi8>, %len: index) {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
@@ -61,13 +53,13 @@ func.func @test_load_store_zaq0() {
// CHECK-LABEL: INITIAL TILE A:
// CHECK: ( 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 )
- vector.print str "INITIAL TILE A:"
+ vector.print str "INITIAL TILE A:\n"
func.call @print_i8s(%tile_a_bytes, %zaq_size_bytes) : (memref<?xi8>, index) -> ()
vector.print punctuation <newline>
// CHECK-LABEL: INITIAL TILE B:
// CHECK: ( 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 )
- vector.print str "INITIAL TILE B:"
+ vector.print str "INITIAL TILE B:\n"
func.call @print_i8s(%tile_b_bytes, %zaq_size_bytes) : (memref<?xi8>, index) -> ()
vector.print punctuation <newline>
@@ -76,13 +68,13 @@ func.func @test_load_store_zaq0() {
// CHECK-LABEL: FINAL TILE A:
// CHECK: ( 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 )
- vector.print str "FINAL TILE A:"
+ vector.print str "FINAL TILE A:\n"
func.call @print_i8s(%tile_a_bytes, %zaq_size_bytes) : (memref<?xi8>, index) -> ()
vector.print punctuation <newline>
// CHECK-LABEL: FINAL TILE B:
// CHECK: ( 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 )
- vector.print str "FINAL TILE B:"
+ vector.print str "FINAL TILE B:\n"
func.call @print_i8s(%tile_b_bytes, %zaq_size_bytes) : (memref<?xi8>, index) -> ()
return
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-load-vertical.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-load-vertical.mlir
index 27be801252b8..68c31ac1dd8e 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-load-vertical.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-load-vertical.mlir
@@ -49,12 +49,12 @@ func.func @entry() {
// CHECK-NEXT: ( 2, 2, 2, 2
// CHECK-NEXT: ( 3, 3, 3, 3
// CHECK: TILE END
- vector.print str "TILE BEGIN"
+ vector.print str "TILE BEGIN\n"
scf.for %i = %c0 to %za_s_size step %svl_s {
%tileslice = vector.load %mem1[%i] : memref<?xi32>, vector<[4]xi32>
vector.print %tileslice : vector<[4]xi32>
}
- vector.print str "TILE END"
+ vector.print str "TILE END\n"
// 2. VERTICAL LAYOUT
// Dump "mem2". The smallest SVL is 128-bits so the tile will be at least
@@ -66,9 +66,9 @@ func.func @entry() {
// CHECK-NEXT: ( 0, 1, 2, 3
// CHECK-NEXT: ( 0, 1, 2, 3
// CHECK: TILE END
- vector.print str "TILE BEGIN"
+ vector.print str "TILE BEGIN\n"
vector.print %0 : vector<[4]x[4]xi32>
- vector.print str "TILE END"
+ vector.print str "TILE END\n"
return
}
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-multi-tile-transpose.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-multi-tile-transpose.mlir
index 9d836d93c85b..cd48f2a9ebfd 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-multi-tile-transpose.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-multi-tile-transpose.mlir
@@ -46,12 +46,12 @@ func.func @testTransposedReadWithMask(%maskRows: index, %maskCols: index) {
vector.transfer_write %readTransposed, %outDyn[%c0, %c0] {in_bounds = [true, true]} : vector<[16]x[4]xf32>, memref<?x?xf32>
/// Print the input memref.
- vector.print str "Input memref:"
+ vector.print str "Input memref:\n"
%inUnranked = memref.cast %inDyn : memref<?x?xf32> to memref<*xf32>
call @printMemrefF32(%inUnranked) : (memref<*xf32>) -> ()
/// Print the result memref.
- vector.print str "Masked transposed result:"
+ vector.print str "Masked transposed result:\n"
%outUnranked = memref.cast %outDyn : memref<?x?xf32> to memref<*xf32>
call @printMemrefF32(%outUnranked) : (memref<*xf32>) -> ()
@@ -84,12 +84,12 @@ func.func @testTransposedWriteWithMask(%maskRows: index, %maskCols: index) {
: vector<[16]x[4]xf32>, memref<?x?xf32>
/// Print the input memref.
- vector.print str "Input memref:"
+ vector.print str "Input memref:\n"
%inUnranked = memref.cast %inDyn : memref<?x?xf32> to memref<*xf32>
call @printMemrefF32(%inUnranked) : (memref<*xf32>) -> ()
/// Print the result memref.
- vector.print str "Masked transposed result:"
+ vector.print str "Masked transposed result:\n"
%outUnranked = memref.cast %outDyn : memref<?x?xf32> to memref<*xf32>
call @printMemrefF32(%outUnranked) : (memref<*xf32>) -> ()
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir
index 7e7869d1c957..fb6c06cfd699 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir
@@ -35,9 +35,9 @@ func.func @test_outerproduct_no_accumulator_4x4xf32() {
// WITHOUT-ACC-NEXT: ( 0, 2, 4, 6
// WITHOUT-ACC-NEXT: ( 0, 3, 6, 9
// WITHOUT-ACC: TILE END
- vector.print str "TILE BEGIN"
+ vector.print str "TILE BEGIN\n"
vector.print %tile : vector<[4]x[4]xf32>
- vector.print str "TILE END"
+ vector.print str "TILE END\n"
return
}
@@ -60,9 +60,9 @@ func.func @test_outerproduct_with_accumulator_4x4xf32() {
// WITH-ACC-NEXT: ( 10, 12, 14, 16
// WITH-ACC-NEXT: ( 10, 13, 16, 19
// WITH-ACC: TILE END
- vector.print str "TILE BEGIN"
+ vector.print str "TILE BEGIN\n"
vector.print %tile : vector<[4]x[4]xf32>
- vector.print str "TILE END"
+ vector.print str "TILE END\n"
return
}
@@ -91,9 +91,9 @@ func.func @test_masked_outerproduct_no_accumulator_4x4xf32() {
// WITH-MASK-NEXT: ( 3, 6, 0, 0
// WITH-MASK-NEXT: ( 0, 0, 0, 0
// WITH-MASK: TILE END
- vector.print str "TILE BEGIN"
+ vector.print str "TILE BEGIN\n"
vector.print %tile : vector<[4]x[4]xf32>
- vector.print str "TILE END"
+ vector.print str "TILE END\n"
return
}
@@ -124,9 +124,9 @@ func.func @test_masked_outerproduct_with_accumulator_4x4xf32() {
// WITH-MASK-AND-ACC-NEXT: ( 10, 10, 10, 10
// WITH-MASK-AND-ACC-NEXT: ( 10, 10, 10, 10
// WITH-MASK-AND-ACC: TILE END
- vector.print str "TILE BEGIN"
+ vector.print str "TILE BEGIN\n"
vector.print %tile : vector<[4]x[4]xf32>
- vector.print str "TILE END"
+ vector.print str "TILE END\n"
return
}
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir
index 46bf799232ae..b8458606d3f3 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir
@@ -40,9 +40,9 @@ func.func @test_outerproduct_no_accumulator_2x2xf64() {
// CHECK-NEXT: ( 1, 2
// CHECK-NEXT: ( 2, 4
// CHECK: TILE END
- vector.print str "TILE BEGIN"
+ vector.print str "TILE BEGIN\n"
vector.print %tile : vector<[2]x[2]xf64>
- vector.print str "TILE END"
+ vector.print str "TILE END\n"
return
}
@@ -66,9 +66,9 @@ func.func @test_outerproduct_with_accumulator_2x2xf64() {
// WITH-ACC-NEXT: ( 11, 12
// WITH-ACC-NEXT: ( 12, 14
// WITH-ACC: TILE END
- vector.print str "TILE BEGIN"
+ vector.print str "TILE BEGIN\n"
vector.print %tile : vector<[2]x[2]xf64>
- vector.print str "TILE END"
+ vector.print str "TILE END\n"
return
}
@@ -96,9 +96,9 @@ func.func @test_masked_outerproduct_no_accumulator_2x2xf64() {
// WITH-MASK-NEXT: ( 1, 0
// WITH-MASK-NEXT: ( 2, 0
// WITH-MASK: TILE END
- vector.print str "TILE BEGIN"
+ vector.print str "TILE BEGIN\n"
vector.print %tile : vector<[2]x[2]xf64>
- vector.print str "TILE END"
+ vector.print str "TILE END\n"
return
}
@@ -127,9 +127,9 @@ func.func @test_masked_outerproduct_with_accumulator_2x2xf64() {
// WITH-MASK-AND-ACC-NEXT: ( 11, 12
// WITH-MASK-AND-ACC-NEXT: ( 10, 10
// WITH-MASK-AND-ACC: TILE END
- vector.print str "TILE BEGIN"
+ vector.print str "TILE BEGIN\n"
vector.print %tile : vector<[2]x[2]xf64>
- vector.print str "TILE END"
+ vector.print str "TILE END\n"
return
}
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir
index 52f56883cad9..7421521b96bf 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir
@@ -14,7 +14,7 @@ func.func @transfer_read_2d(%A : memref<?x?xf32>, %base1: index, %base2: index)
%0 = vector.transfer_read %A[%base1, %base2], %pad {in_bounds=[true, true]} :
memref<?x?xf32>, vector<[4]x[4]xf32>
- vector.print str "TILE BEGIN:"
+ vector.print str "TILE BEGIN:\n"
vector.print %0: vector<[4]x[4]xf32>
return
@@ -27,7 +27,7 @@ func.func @transfer_read_2d_transposed(%A : memref<?x?xf32>, %base1: index, %bas
{permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds=[true, true]}
: memref<?x?xf32>, vector<[4]x[4]xf32>
- vector.print str "TILE BEGIN:"
+ vector.print str "TILE BEGIN:\n"
vector.print %0 : vector<[4]x[4]xf32>
return
@@ -42,7 +42,7 @@ func.func @transfer_read_2d_mask(%A : memref<?x?xf32>, %base1: index, %base2: in
%0 = vector.transfer_read %A[%base1, %base2], %pad, %mask
{in_bounds = [true, true]} : memref<?x?xf32>, vector<[4]x[4]xf32>
- vector.print str "TILE BEGIN:"
+ vector.print str "TILE BEGIN:\n"
vector.print %0: vector<[4]x[4]xf32>
return
@@ -58,7 +58,7 @@ func.func @transfer_read_2d_mask_transposed(%A : memref<?x?xf32>, %base1: index,
{permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds=[true, true]}
: memref<?x?xf32>, vector<[4]x[4]xf32>
- vector.print str "TILE BEGIN:"
+ vector.print str "TILE BEGIN:\n"
vector.print %0: vector<[4]x[4]xf32>
return
@@ -73,7 +73,7 @@ func.func @transfer_read_2d_mask_non_zero_pad(%A : memref<?x?xf32>, %base1: inde
%0 = vector.transfer_read %A[%base1, %base2], %pad, %mask
{in_bounds = [true, true]} : memref<?x?xf32>, vector<[4]x[4]xf32>
- vector.print str "TILE BEGIN:"
+ vector.print str "TILE BEGIN:\n"
vector.print %0: vector<[4]x[4]xf32>
return
@@ -89,7 +89,7 @@ func.func @transfer_read_2d_mask_non_zero_pad_transposed(%A : memref<?x?xf32>, %
{permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds=[true, true]}
: memref<?x?xf32>, vector<[4]x[4]xf32>
- vector.print str "TILE BEGIN:"
+ vector.print str "TILE BEGIN:\n"
vector.print %0: vector<[4]x[4]xf32>
return
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir
index 710cc6672f00..2fef705861f2 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir
@@ -51,7 +51,7 @@ func.func @transfer_write_2d_mask_transposed(%A : memref<?x?xf32>, %base1: index
func.func @load_and_print(%A : memref<?x?xf32>, %base1: index, %base2: index) {
%0 = vector.load %A[%base1, %base2] : memref<?x?xf32>, vector<[4]x[4]xf32>
- vector.print str "TILE BEGIN:"
+ vector.print str "TILE BEGIN:\n"
vector.print %0: vector<[4]x[4]xf32>
return
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transpose.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transpose.mlir
index 88bc0d0709d4..177c96f1d8aa 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transpose.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transpose.mlir
@@ -51,9 +51,9 @@ func.func @entry() {
// CHECK-NEXT: ( 2, 2, 2, 2
// CHECK-NEXT: ( 3, 3, 3, 3
// CHECK: TILE END
- vector.print str "TILE BEGIN"
+ vector.print str "TILE BEGIN\n"
vector.print %tile : vector<[4]x[4]xi32>
- vector.print str "TILE END"
+ vector.print str "TILE END\n"
// Dump the transposed tile. The smallest SVL is 128-bits so the tile will be
// at least 4x4xi32.
@@ -64,9 +64,9 @@ func.func @entry() {
// CHECK-NEXT: ( 0, 1, 2, 3
// CHECK-NEXT: ( 0, 1, 2, 3
// CHECK: TILE END
- vector.print str "TILE BEGIN"
+ vector.print str "TILE BEGIN\n"
vector.print %transposed_tile : vector<[4]x[4]xi32>
- vector.print str "TILE END"
+ vector.print str "TILE END\n"
return
}
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir
index e14917486d84..3d74508cd23b 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir
@@ -23,9 +23,9 @@ func.func @entry() -> i32 {
// CHECK-NEXT: ( 123, 123, 123, 123
// CHECK-NEXT: ( 123, 123, 123, 123
// CHECK: TILE END
- vector.print str "TILE BEGIN"
+ vector.print str "TILE BEGIN\n"
vector.print %tile : vector<[4]x[4]xi32>
- vector.print str "TILE END"
+ vector.print str "TILE END\n"
%c0_i32 = arith.constant 0 : i32
return %c0_i32 : i32
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir
index b29790db14dd..48080fd0a26a 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir
@@ -255,7 +255,7 @@ func.func @load_store_two_za_s_tiles() -> i32 {
// CHECK-NEXT: ( 1, 1, 1, 1
// CHECK-NEXT: ( 1, 1, 1, 1
// CHECK: TILE END
- vector.print str "TILE BEGIN"
+ vector.print str "TILE BEGIN\n"
scf.for %i = %c0 to %size_of_two_tiles step %svl_s {
%av = vector.load %mem2[%i] : memref<?xi32>, vector<[4]xi32>
vector.print %av : vector<[4]xi32>
@@ -263,11 +263,11 @@ func.func @load_store_two_za_s_tiles() -> i32 {
%tileSizeMinusStep = arith.subi %size_of_tile, %svl_s : index
%isNextTile = arith.cmpi eq, %i, %tileSizeMinusStep : index
scf.if %isNextTile {
- vector.print str "TILE END"
- vector.print str "TILE BEGIN"
+ vector.print str "TILE END\n"
+ vector.print str "TILE BEGIN\n"
}
}
- vector.print str "TILE END"
+ vector.print str "TILE END\n"
return %c0_i32 : i32
}
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir
index c486bf0de5d3..afb23e8e5206 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir
@@ -24,7 +24,7 @@ func.func @read_and_print_2d_vector(%memref: memref<3x?xf32>) {
/// Print each of the vectors.
/// vscale is >= 1, so at least 8 elements will be printed.
- vector.print str "read_and_print_2d_vector()"
+ vector.print str "read_and_print_2d_vector()\n"
// CHECK-LABEL: read_and_print_2d_vector()
// CHECK: ( 8, 8, 8, 8, 8, 8, 8, 8
vector.print %row0 : vector<[8]xf32>
@@ -62,21 +62,21 @@ func.func @add_arrays_of_scalable_vectors(%a: memref<1x2x?xf32>, %b: memref<1x2x
// CHECK-LABEL: Vector A
// CHECK-NEXT: ( 5, 5, 5, 5
// CHECK-NEXT: ( 5, 5, 5, 5
- vector.print str "\nVector A"
+ vector.print str "\nVector A\n"
%vector_a = vector.transfer_read %a[%c0, %c0, %c0], %cst, %mask_a {in_bounds = [true, true, true]} : memref<1x2x?xf32>, vector<1x2x[4]xf32>
func.call @print_1x2xVSCALExf32(%vector_a) : (vector<1x2x[4]xf32>) -> ()
// CHECK-LABEL: Vector B
// CHECK-NEXT: ( 4, 4, 4, 4
// CHECK-NEXT: ( 4, 4, 4, 4
- vector.print str "\nVector B"
+ vector.print str "\nVector B\n"
%vector_b = vector.transfer_read %b[%c0, %c0, %c0], %cst, %mask_b {in_bounds = [true, true, true]} : memref<1x2x?xf32>, vector<1x2x[4]xf32>
func.call @print_1x2xVSCALExf32(%vector_b) : (vector<1x2x[4]xf32>) -> ()
// CHECK-LABEL: Sum
// CHECK-NEXT: ( 9, 9, 9, 9
// CHECK-NEXT: ( 9, 9, 9, 9
- vector.print str "\nSum"
+ vector.print str "\nSum\n"
%sum = arith.addf %vector_a, %vector_b : vector<1x2x[4]xf32>
func.call @print_1x2xVSCALExf32(%sum) : (vector<1x2x[4]xf32>) -> ()
@@ -97,7 +97,7 @@ func.func @entry() {
linalg.fill ins(%f32_8 : f32) outs(%test_1_memref :memref<3x?xf32>)
- vector.print str "=> Print and read 2D arrays of scalable vectors:"
+ vector.print str "=> Print and read 2D arrays of scalable vectors:\n"
func.call @read_and_print_2d_vector(%test_1_memref) : (memref<3x?xf32>) -> ()
vector.print str "\n====================\n"
@@ -109,7 +109,7 @@ func.func @entry() {
linalg.fill ins(%f32_5 : f32) outs(%test_2_memref_a :memref<1x2x?xf32>)
linalg.fill ins(%f32_4 : f32) outs(%test_2_memref_b :memref<1x2x?xf32>)
- vector.print str "=> Reading and adding two 3D arrays of scalable vectors:"
+ vector.print str "=> Reading and adding two 3D arrays of scalable vectors:\n"
func.call @add_arrays_of_scalable_vectors(
%test_2_memref_a, %test_2_memref_b) : (memref<1x2x?xf32>, memref<1x2x?xf32>) -> ()
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-contraction.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-contraction.mlir
index d86ff56d79e3..79121bf31c26 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-contraction.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-contraction.mlir
@@ -1,4 +1,4 @@
-// DEFINE: %{compile} = mlir-opt %s -test-transform-dialect-interpreter -test-transform-dialect-erase-schedule\
+// DEFINE: %{compile} = mlir-opt %s -transform-interpreter -test-transform-dialect-erase-schedule\
// DEFINE: -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage\
// DEFINE: -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm -o %t
// DEFINE: %{entry} =
@@ -188,12 +188,14 @@ func.func @matmul_f32() {
return
}
-transform.sequence failures(propagate) {
-^bb1(%module_op: !transform.any_op):
- %f = transform.structured.match ops{["func.func"]} in %module_op
- : (!transform.any_op) -> !transform.any_op
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%module_op: !transform.any_op) {
+ %f = transform.structured.match ops{["func.func"]} in %module_op
+ : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %f {
- transform.apply_patterns.vector.lower_contraction lowering_strategy = "outerproduct"
- } : !transform.any_op
+ transform.apply_patterns to %f {
+ transform.apply_patterns.vector.lower_contraction lowering_strategy = "outerproduct"
+ } : !transform.any_op
+ transform.yield
+ }
}
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-print-str.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-print-str.mlir
index 78d6609ccaf9..25a44f22c2dc 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-print-str.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-print-str.mlir
@@ -7,8 +7,8 @@
func.func @entry() {
// CHECK: Hello, World!
- vector.print str "Hello, World!"
+ vector.print str "Hello, World!\n"
// CHECK-NEXT: Bye!
- vector.print str "Bye!"
+ vector.print str "Bye!\n"
return
}
diff --git a/mlir/test/Target/LLVMIR/attribute-alias-scopes.mlir b/mlir/test/Target/LLVMIR/attribute-alias-scopes.mlir
index 4434aea4ec96..fa3395533af2 100644
--- a/mlir/test/Target/LLVMIR/attribute-alias-scopes.mlir
+++ b/mlir/test/Target/LLVMIR/attribute-alias-scopes.mlir
@@ -59,14 +59,48 @@ llvm.func @alias_scopes(%arg1 : !llvm.ptr) {
#alias_scope1 = #llvm.alias_scope<id = distinct[1]<>, domain = #alias_scope_domain>
// CHECK-LABEL: @noalias_intr_only
-llvm.func @noalias_intr_only(%arg1 : !llvm.ptr) {
- %0 = llvm.mlir.constant(0 : i32) : i32
- // CHECK: call void @llvm.experimental.noalias.scope.decl(metadata ![[SCOPES1:[0-9]+]])
+llvm.func @noalias_intr_only() {
+ // CHECK: call void @llvm.experimental.noalias.scope.decl(metadata ![[SCOPES:[0-9]+]])
llvm.intr.experimental.noalias.scope.decl #alias_scope1
llvm.return
}
// Check the translated metadata.
// CHECK-DAG: ![[DOMAIN:[0-9]+]] = distinct !{![[DOMAIN]], !"The domain"}
-// CHECK-DAG: ![[SCOPE1:[0-9]+]] = distinct !{![[SCOPE1]], ![[DOMAIN]]}
-// CHECK-DAG: ![[SCOPES1]] = !{![[SCOPE1]]}
+// CHECK-DAG: ![[SCOPE:[0-9]+]] = distinct !{![[SCOPE]], ![[DOMAIN]]}
+// CHECK-DAG: ![[SCOPES]] = !{![[SCOPE]]}
+
+// -----
+
+// This test ensures the alias scope translation creates a temporary metadata
+// node as a placeholder for self-references. Without this, the debug info
+// translation of a type list with a null entry could inadvertently reference
+// access group metadata. This occurs when both translations generate a metadata
+// list with a null entry, which are then uniqued to the same metadata node.
+// The access group translation subsequently updates the null entry to a
+// self-reference, which causes the type list to reference the access
+// group node as well. The use of a temporary placeholder node avoids the issue.
+
+#alias_scope_domain = #llvm.alias_scope_domain<id = distinct[0]<>>
+#alias_scope = #llvm.alias_scope<id = distinct[1]<>, domain = #alias_scope_domain>
+
+#di_null_type = #llvm.di_null_type
+#di_subroutine_type = #llvm.di_subroutine_type<types = #di_null_type>
+#di_file = #llvm.di_file<"attribute-alias-scope.mlir" in "">
+#di_compile_unit = #llvm.di_compile_unit<id = distinct[3]<>, sourceLanguage = DW_LANG_C11, file = #di_file, isOptimized = true, emissionKind = Full>
+#di_subprogram = #llvm.di_subprogram<id = distinct[2]<>, compileUnit = #di_compile_unit, scope = #di_file, file = #di_file, subprogramFlags = "Definition", type = #di_subroutine_type>
+
+// CHECK-LABEL: @self_reference
+llvm.func @self_reference() {
+ // CHECK: call void @llvm.experimental.noalias.scope.decl(metadata ![[SCOPES:[0-9]+]])
+ llvm.intr.experimental.noalias.scope.decl #alias_scope
+ llvm.return
+} loc(fused<#di_subprogram>[unknown])
+
+// Check that the translated subroutine types do not reference the access group
+// domain since both of them are created as metadata list with a null entry.
+// CHECK-DAG: ![[DOMAIN:[0-9]+]] = distinct !{![[DOMAIN]]}
+// CHECK-DAG: ![[SCOPE:[0-9]+]] = distinct !{![[SCOPE]], ![[DOMAIN]]}
+// CHECK-DAG: ![[SCOPES]] = !{![[SCOPE]]}
+// CHECK-DAG: = !DISubroutineType(types: ![[TYPES:[0-9]+]])
+// CHECK-DAG: ![[TYPES]] = !{null}
diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
index 39a1e036e85c..12bd108ba86c 100644
--- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
@@ -2186,6 +2186,43 @@ llvm.func @single_nowait(%x: i32, %y: i32, %zaddr: !llvm.ptr) {
// -----
+llvm.func @copy_i32(!llvm.ptr, !llvm.ptr)
+llvm.func @copy_f32(!llvm.ptr, !llvm.ptr)
+
+// CHECK-LABEL: @single_copyprivate
+// CHECK-SAME: (ptr %[[ip:.*]], ptr %[[fp:.*]])
+llvm.func @single_copyprivate(%ip: !llvm.ptr, %fp: !llvm.ptr) {
+ // CHECK: %[[didit_addr:.*]] = alloca i32
+ // CHECK: store i32 0, ptr %[[didit_addr]]
+ // CHECK: call i32 @__kmpc_single
+ omp.single copyprivate(%ip -> @copy_i32 : !llvm.ptr, %fp -> @copy_f32 : !llvm.ptr) {
+ // CHECK: %[[i:.*]] = load i32, ptr %[[ip]]
+ %i = llvm.load %ip : !llvm.ptr -> i32
+ // CHECK: %[[i2:.*]] = add i32 %[[i]], %[[i]]
+ %i2 = llvm.add %i, %i : i32
+ // CHECK: store i32 %[[i2]], ptr %[[ip]]
+ llvm.store %i2, %ip : i32, !llvm.ptr
+ // CHECK: %[[f:.*]] = load float, ptr %[[fp]]
+ %f = llvm.load %fp : !llvm.ptr -> f32
+ // CHECK: %[[f2:.*]] = fadd float %[[f]], %[[f]]
+ %f2 = llvm.fadd %f, %f : f32
+ // CHECK: store float %[[f2]], ptr %[[fp]]
+ llvm.store %f2, %fp : f32, !llvm.ptr
+ // CHECK: store i32 1, ptr %[[didit_addr]]
+ // CHECK: call void @__kmpc_end_single
+ // CHECK: %[[didit:.*]] = load i32, ptr %[[didit_addr]]
+ // CHECK: call void @__kmpc_copyprivate({{.*}}, ptr %[[ip]], ptr @copy_i32, i32 %[[didit]])
+ // CHECK: %[[didit2:.*]] = load i32, ptr %[[didit_addr]]
+ // CHECK: call void @__kmpc_copyprivate({{.*}}, ptr %[[fp]], ptr @copy_f32, i32 %[[didit2]])
+ // CHECK-NOT: call void @__kmpc_barrier
+ omp.terminator
+ }
+ // CHECK: ret void
+ llvm.return
+}
+
+// -----
+
// CHECK: @_QFsubEx = internal global i32 undef
// CHECK: @_QFsubEx.cache = common global ptr null
diff --git a/mlir/test/Target/LLVMIR/openmp-private.mlir b/mlir/test/Target/LLVMIR/openmp-private.mlir
new file mode 100644
index 000000000000..58bda87c3b7b
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-private.mlir
@@ -0,0 +1,142 @@
+// Test code-gen for `omp.parallel` ops with delayed privatizers (i.e. using
+// `omp.private` ops).
+
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+llvm.func @parallel_op_1_private(%arg0: !llvm.ptr) {
+ omp.parallel private(@x.privatizer %arg0 -> %arg2 : !llvm.ptr) {
+ %0 = llvm.load %arg2 : !llvm.ptr -> f32
+ omp.terminator
+ }
+ llvm.return
+}
+
+// CHECK-LABEL: @parallel_op_1_private
+// CHECK-SAME: (ptr %[[ORIG:.*]]) {
+// CHECK: %[[OMP_PAR_ARG:.*]] = alloca { ptr }, align 8
+// CHECK: %[[ORIG_GEP:.*]] = getelementptr { ptr }, ptr %[[OMP_PAR_ARG]], i32 0, i32 0
+// CHECK: store ptr %[[ORIG]], ptr %[[ORIG_GEP]], align 8
+// CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @1, i32 1, ptr @parallel_op_1_private..omp_par, ptr %[[OMP_PAR_ARG]])
+// CHECK: }
+
+// CHECK-LABEL: void @parallel_op_1_private..omp_par
+// CHECK-SAME: (ptr noalias %{{.*}}, ptr noalias %{{.*}}, ptr %[[ARG:.*]])
+// CHECK: %[[ORIG_PTR_PTR:.*]] = getelementptr { ptr }, ptr %[[ARG]], i32 0, i32 0
+// CHECK: %[[ORIG_PTR:.*]] = load ptr, ptr %[[ORIG_PTR_PTR]], align 8
+
+// Check that the privatizer alloc region was inlined properly.
+// CHECK: %[[PRIV_ALLOC:.*]] = alloca float, align 4
+// CHECK: %[[ORIG_VAL:.*]] = load float, ptr %[[ORIG_PTR]], align 4
+// CHECK: store float %[[ORIG_VAL]], ptr %[[PRIV_ALLOC]], align 4
+// CHECK-NEXT: br
+
+// Check that the privatized value is used (rather than the original one).
+// CHECK: load float, ptr %[[PRIV_ALLOC]], align 4
+// CHECK: }
+
+llvm.func @parallel_op_2_privates(%arg0: !llvm.ptr, %arg1: !llvm.ptr) {
+ omp.parallel private(@x.privatizer %arg0 -> %arg2 : !llvm.ptr, @y.privatizer %arg1 -> %arg3 : !llvm.ptr) {
+ %0 = llvm.load %arg2 : !llvm.ptr -> f32
+ %1 = llvm.load %arg3 : !llvm.ptr -> i32
+ omp.terminator
+ }
+ llvm.return
+}
+
+// CHECK-LABEL: @parallel_op_2_privates
+// CHECK-SAME: (ptr %[[ORIG1:.*]], ptr %[[ORIG2:.*]]) {
+// CHECK: %[[OMP_PAR_ARG:.*]] = alloca { ptr, ptr }, align 8
+// CHECK: %[[ORIG1_GEP:.*]] = getelementptr { ptr, ptr }, ptr %[[OMP_PAR_ARG]], i32 0, i32 0
+// CHECK: store ptr %[[ORIG1]], ptr %[[ORIG1_GEP]], align 8
+// CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @1, i32 1, ptr @parallel_op_2_privates..omp_par, ptr %[[OMP_PAR_ARG]])
+// CHECK: }
+
+// CHECK-LABEL: void @parallel_op_2_privates..omp_par
+// CHECK-SAME: (ptr noalias %{{.*}}, ptr noalias %{{.*}}, ptr %[[ARG:.*]])
+// CHECK: %[[ORIG1_PTR_PTR:.*]] = getelementptr { ptr, ptr }, ptr %[[ARG]], i32 0, i32 0
+// CHECK: %[[ORIG1_PTR:.*]] = load ptr, ptr %[[ORIG1_PTR_PTR]], align 8
+// CHECK: %[[ORIG2_PTR_PTR:.*]] = getelementptr { ptr, ptr }, ptr %[[ARG]], i32 0, i32 1
+// CHECK: %[[ORIG2_PTR:.*]] = load ptr, ptr %[[ORIG2_PTR_PTR]], align 8
+
+// Check that the privatizer alloc region was inlined properly.
+// CHECK: %[[PRIV1_ALLOC:.*]] = alloca float, align 4
+// CHECK: %[[ORIG1_VAL:.*]] = load float, ptr %[[ORIG1_PTR]], align 4
+// CHECK: store float %[[ORIG1_VAL]], ptr %[[PRIV1_ALLOC]], align 4
+// CHECK: %[[PRIV2_ALLOC:.*]] = alloca i32, align 4
+// CHECK: %[[ORIG2_VAL:.*]] = load i32, ptr %[[ORIG2_PTR]], align 4
+// CHECK: store i32 %[[ORIG2_VAL]], ptr %[[PRIV2_ALLOC]], align 4
+// CHECK-NEXT: br
+
+// Check that the privatized value is used (rather than the original one).
+// CHECK: load float, ptr %[[PRIV1_ALLOC]], align 4
+// CHECK: load i32, ptr %[[PRIV2_ALLOC]], align 4
+// CHECK: }
+
+omp.private {type = private} @x.privatizer : !llvm.ptr alloc {
+^bb0(%arg0: !llvm.ptr):
+ %c1 = llvm.mlir.constant(1 : i32) : i32
+ %0 = llvm.alloca %c1 x f32 : (i32) -> !llvm.ptr
+ %1 = llvm.load %arg0 : !llvm.ptr -> f32
+ llvm.store %1, %0 : f32, !llvm.ptr
+ omp.yield(%0 : !llvm.ptr)
+}
+
+omp.private {type = private} @y.privatizer : !llvm.ptr alloc {
+^bb0(%arg0: !llvm.ptr):
+ %c1 = llvm.mlir.constant(1 : i32) : i32
+ %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr
+ %1 = llvm.load %arg0 : !llvm.ptr -> i32
+ llvm.store %1, %0 : i32, !llvm.ptr
+ omp.yield(%0 : !llvm.ptr)
+}
+
+// -----
+
+llvm.func @parallel_op_private_multi_block(%arg0: !llvm.ptr) {
+ omp.parallel private(@multi_block.privatizer %arg0 -> %arg2 : !llvm.ptr) {
+ %0 = llvm.load %arg2 : !llvm.ptr -> f32
+ omp.terminator
+ }
+ llvm.return
+}
+
+// CHECK-LABEL: define internal void @parallel_op_private_multi_block..omp_par
+// CHECK: omp.par.entry:
+// CHECK: %[[ORIG_PTR_PTR:.*]] = getelementptr { ptr }, ptr %{{.*}}, i32 0, i32 0
+// CHECK: %[[ORIG_PTR:.*]] = load ptr, ptr %[[ORIG_PTR_PTR]], align 8
+// CHECK: br label %[[PRIV_BB1:.*]]
+
+// Check contents of the first block in the `alloc` region.
+// CHECK: [[PRIV_BB1]]:
+// CHECK-NEXT: %[[PRIV_ALLOC:.*]] = alloca float, align 4
+// CHECK-NEXT: br label %[[PRIV_BB2:.*]]
+
+// Check contents of the second block in the `alloc` region.
+// CHECK: [[PRIV_BB2]]:
+// CHECK-NEXT: %[[ORIG_PTR2:.*]] = phi ptr [ %[[ORIG_PTR]], %[[PRIV_BB1]] ]
+// CHECK-NEXT: %[[PRIV_ALLOC2:.*]] = phi ptr [ %[[PRIV_ALLOC]], %[[PRIV_BB1]] ]
+// CHECK-NEXT: %[[ORIG_VAL:.*]] = load float, ptr %[[ORIG_PTR2]], align 4
+// CHECK-NEXT: store float %[[ORIG_VAL]], ptr %[[PRIV_ALLOC2]], align 4
+// CHECK-NEXT: br label %[[PRIV_CONT:.*]]
+
+// Check that the privatizer's continuation block yileds the private clone's
+// address.
+// CHECK: [[PRIV_CONT]]:
+// CHECK-NEXT: %[[PRIV_ALLOC3:.*]] = phi ptr [ %[[PRIV_ALLOC2]], %[[PRIV_BB2]] ]
+// CHECK-NEXT: br label %[[PAR_REG:.*]]
+
+// Check that the body of the parallel region loads from the private clone.
+// CHECK: [[PAR_REG]]:
+// CHECK: %{{.*}} = load float, ptr %[[PRIV_ALLOC3]], align 4
+
+omp.private {type = private} @multi_block.privatizer : !llvm.ptr alloc {
+^bb0(%arg0: !llvm.ptr):
+ %c1 = llvm.mlir.constant(1 : i32) : i32
+ %0 = llvm.alloca %c1 x f32 : (i32) -> !llvm.ptr
+ llvm.br ^bb1(%arg0, %0 : !llvm.ptr, !llvm.ptr)
+
+^bb1(%arg1: !llvm.ptr, %arg2: !llvm.ptr):
+ %1 = llvm.load %arg1 : !llvm.ptr -> f32
+ llvm.store %1, %arg2 : f32, !llvm.ptr
+ omp.yield(%arg2 : !llvm.ptr)
+}
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 06b78650c8d0..3ea6292c679d 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -56,6 +56,12 @@ llvm.func @known_block_sizes()
llvm.return
}
+llvm.func @kernel_func_no_uniform_work_groups() attributes {rocdl.kernel, rocdl.uniform_work_group_size = false} {
+ // CHECK-LABEL: amdgpu_kernel void @kernel_func_no_uniform_work_groups()
+ // CHECK: #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS:[0-9]+]]
+ llvm.return
+}
+
llvm.func @rocdl.lane_id() -> i32 {
// CHECK: [[mbcntlo:%.+]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
// CHECK-NEXT: call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[mbcntlo]])
@@ -505,8 +511,9 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 {
llvm.return %source5 : i32
}
-// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" }
+// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="true" }
// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
// CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"
+// CHECK-DAG: attributes #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="false" }
// CHECK-DAG: ![[$RANGE]] = !{i32 0, i32 64}
// CHECK-DAG: ![[$REQD_WORK_GROUP_SIZE]] = !{i32 16, i32 4, i32 2}
diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
index bde4255ee4b3..abc0e43c7b7f 100644
--- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp
+++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
@@ -1768,7 +1768,6 @@ struct TestMergeSingleBlockOps
rewriter.inlineBlockBefore(&innerBlock, op);
rewriter.eraseOp(innerTerminator);
rewriter.eraseOp(op);
- rewriter.modifyOpInPlace(op, [] {});
return success();
}
};
diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
index 178a58e796b2..915f713f7047 100644
--- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
@@ -630,15 +630,13 @@ struct TestVectorDistribution
});
MLIRContext *ctx = &getContext();
auto distributionFn = [](Value val) {
- // Create a map (d0, d1) -> (d1) to distribute along the inner
- // dimension. Once we support n-d distribution we can add more
- // complex cases.
+ // Create an identity dim map of the same rank as the vector.
VectorType vecType = dyn_cast<VectorType>(val.getType());
int64_t vecRank = vecType ? vecType.getRank() : 0;
OpBuilder builder(val.getContext());
if (vecRank == 0)
return AffineMap::get(val.getContext());
- return AffineMap::get(vecRank, 0, builder.getAffineDimExpr(vecRank - 1));
+ return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext());
};
auto shuffleFn = [](Location loc, OpBuilder &builder, Value val,
Value srcIdx, int64_t warpSz) {
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index 904dfb680a04..7636ef30c2d3 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -161,6 +161,7 @@ tools.extend(
ToolSubst("transform-opt-ch2", unresolved="ignore"),
ToolSubst("transform-opt-ch3", unresolved="ignore"),
ToolSubst("transform-opt-ch4", unresolved="ignore"),
+ ToolSubst("mlir-transform-opt", unresolved="ignore"),
ToolSubst("%mlir_lib_dir", config.mlir_lib_dir, unresolved="ignore"),
ToolSubst("%mlir_src_dir", config.mlir_src_root, unresolved="ignore"),
]
diff --git a/mlir/test/mlir-cpu-runner/expand-arith-ops.mlir b/mlir/test/mlir-cpu-runner/expand-arith-ops.mlir
index 44141cc4eeaf..0bf6523c5e5d 100644
--- a/mlir/test/mlir-cpu-runner/expand-arith-ops.mlir
+++ b/mlir/test/mlir-cpu-runner/expand-arith-ops.mlir
@@ -13,10 +13,21 @@ func.func @trunc_bf16(%a : f32) {
}
func.func @main() {
- // CHECK: 1.00781
- %roundOneI = arith.constant 0x3f808000 : i32
- %roundOneF = arith.bitcast %roundOneI : i32 to f32
- call @trunc_bf16(%roundOneF): (f32) -> ()
+ // Note: this is a tie (low 16 bits are 0x8000). We expect the rounding behavior
+ // to break ties "to nearest-even", which in this case means downwards,
+ // since bit 16 is not set.
+ // CHECK: 1
+ %value_1_00391_I = arith.constant 0x3f808000 : i32
+ %value_1_00391_F = arith.bitcast %value_1_00391_I : i32 to f32
+ call @trunc_bf16(%value_1_00391_F): (f32) -> ()
+
+ // Note: this is a tie (low 16 bits are 0x8000). We expect the rounding behavior
+ // to break ties "to nearest-even", which in this case means upwards,
+ // since bit 16 is set.
+ // CHECK-NEXT: 1.01562
+ %value_1_01172_I = arith.constant 0x3f818000 : i32
+ %value_1_01172_F = arith.bitcast %value_1_01172_I : i32 to f32
+ call @trunc_bf16(%value_1_01172_F): (f32) -> ()
// CHECK-NEXT: -1
%noRoundNegOneI = arith.constant 0xbf808000 : i32
@@ -38,15 +49,27 @@ func.func @main() {
%neginff = arith.bitcast %neginfi : i32 to f32
call @trunc_bf16(%neginff): (f32) -> ()
+ // Note: this rounds upwards. As the mantissa was already saturated, this rounding
+ // causes the exponent to be incremented. As the exponent was already the
+ // maximum exponent value for finite values, this increment of the exponent
+ // causes this to overflow to +inf.
+ // CHECK-NEXT: inf
+ %big_overflowing_i = arith.constant 0x7f7fffff : i32
+ %big_overflowing_f = arith.bitcast %big_overflowing_i : i32 to f32
+ call @trunc_bf16(%big_overflowing_f): (f32) -> ()
+
+ // Same as the previous testcase but negative.
+ // CHECK-NEXT: -inf
+ %negbig_overflowing_i = arith.constant 0xff7fffff : i32
+ %negbig_overflowing_f = arith.bitcast %negbig_overflowing_i : i32 to f32
+ call @trunc_bf16(%negbig_overflowing_f): (f32) -> ()
+
+ // In contrast to the previous two testcases, the upwards-rounding here
+ // does not cause overflow.
// CHECK-NEXT: 3.38953e+38
- %bigi = arith.constant 0x7f7fffff : i32
- %bigf = arith.bitcast %bigi : i32 to f32
- call @trunc_bf16(%bigf): (f32) -> ()
-
- // CHECK-NEXT: -3.38953e+38
- %negbigi = arith.constant 0xff7fffff : i32
- %negbigf = arith.bitcast %negbigi : i32 to f32
- call @trunc_bf16(%negbigf): (f32) -> ()
+ %big_nonoverflowing_i = arith.constant 0x7f7effff : i32
+ %big_nonoverflowing_f = arith.bitcast %big_nonoverflowing_i : i32 to f32
+ call @trunc_bf16(%big_nonoverflowing_f): (f32) -> ()
// CHECK-NEXT: 1.625
%exprolli = arith.constant 0x3fcfffff : i32
diff --git a/mlir/test/python/dialects/sparse_tensor/dialect.py b/mlir/test/python/dialects/sparse_tensor/dialect.py
index 2c0603216ef2..5666d090c3d5 100644
--- a/mlir/test/python/dialects/sparse_tensor/dialect.py
+++ b/mlir/test/python/dialects/sparse_tensor/dialect.py
@@ -28,7 +28,7 @@ def testEncodingAttr1D():
# CHECK: equal: True
print(f"equal: {casted == parsed}")
- # CHECK: lvl_types: [131072]
+ # CHECK: lvl_types: [262144]
print(f"lvl_types: {casted.lvl_types}")
# CHECK: dim_to_lvl: (d0) -> (d0)
print(f"dim_to_lvl: {casted.dim_to_lvl}")
@@ -71,9 +71,9 @@ def testEncodingAttrStructure():
# CHECK: equal: True
print(f"equal: {casted == parsed}")
- # CHECK: lvl_types: [65536, 65536, 4406637494272]
+ # CHECK: lvl_types: [65536, 65536, 4406638542848]
print(f"lvl_types: {casted.lvl_types}")
- # CHECK: lvl_formats_enum: [<LevelFormat.dense: 65536>, <LevelFormat.dense: 65536>, <LevelFormat.n_out_of_m: 1048576>]
+ # CHECK: lvl_formats_enum: [<LevelFormat.dense: 65536>, <LevelFormat.dense: 65536>, <LevelFormat.n_out_of_m: 2097152>]
print(f"lvl_formats_enum: {casted.lvl_formats_enum}")
# CHECK: structured_n: 2
print(f"structured_n: {casted.structured_n}")
@@ -157,7 +157,7 @@ def testEncodingAttr2D():
# CHECK: equal: True
print(f"equal: {casted == parsed}")
- # CHECK: lvl_types: [65536, 131072]
+ # CHECK: lvl_types: [65536, 262144]
print(f"lvl_types: {casted.lvl_types}")
# CHECK: dim_to_lvl: (d0, d1) -> (d1, d0)
print(f"dim_to_lvl: {casted.dim_to_lvl}")
diff --git a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
index 62a19c084cac..943e7d5c120b 100644
--- a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
+++ b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
@@ -120,7 +120,8 @@ static Match synZeroMatch() { return Match(); }
FOREVERY_BINOP(IMPL_BINOP_PATTERN)
#undef IMPL_BINOP_PATTERN
-class MergerTestBase : public ::testing::Test {
+// Parameterize LevelFormat to test both Dense and Batch LevelFormat.
+class MergerTestBase : public ::testing::TestWithParam<LevelFormat> {
protected:
MergerTestBase(unsigned numTensors, unsigned numLoops)
: merger(numTensors, numLoops, /*maxRank=*/numLoops) {
@@ -317,10 +318,14 @@ protected:
// Tensor 1: sparse input vector.
merger.setLevelAndType(tid(1), lid(0), 0, LevelFormat::Compressed);
// Tensor 2: dense output vector.
- merger.setLevelAndType(tid(2), lid(0), 0, LevelFormat::Dense);
+ merger.setLevelAndType(tid(2), lid(0), 0, GetParam());
}
};
+INSTANTIATE_TEST_SUITE_P(Test3T1L, MergerTest3T1L,
+ ::testing::Values(LevelFormat::Dense,
+ LevelFormat::Batch));
+
/// Four tensors (three inputs, one output); and a single loop.
class MergerTest4T1L : public MergerTestBase {
protected:
@@ -333,10 +338,14 @@ protected:
// Tensor 2: sparse input vector
merger.setLevelAndType(tid(2), lid(0), 0, LevelFormat::Compressed);
// Tensor 3: dense output vector
- merger.setLevelAndType(tid(3), lid(0), 0, LevelFormat::Dense);
+ merger.setLevelAndType(tid(3), lid(0), 0, GetParam());
}
};
+INSTANTIATE_TEST_SUITE_P(Test4T1L, MergerTest4T1L,
+ ::testing::Values(LevelFormat::Dense,
+ LevelFormat::Batch));
+
///
/// Tests with both sparse and dense input.
///
@@ -349,12 +358,16 @@ protected:
// Tensor 0: sparse input vector.
merger.setLevelAndType(tid(0), lid(0), 0, LevelFormat::Compressed);
// Tensor 1: dense input vector.
- merger.setLevelAndType(tid(1), lid(0), 0, LevelFormat::Dense);
+ merger.setLevelAndType(tid(1), lid(0), 0, GetParam());
// Tensor 2: dense output vector.
- merger.setLevelAndType(tid(2), lid(0), 0, LevelFormat::Dense);
+ merger.setLevelAndType(tid(2), lid(0), 0, GetParam());
}
};
+INSTANTIATE_TEST_SUITE_P(Test3T1LD, MergerTest3T1LD,
+ ::testing::Values(LevelFormat::Dense,
+ LevelFormat::Batch));
+
///
/// Tests with both undef and dense input.
///
@@ -367,14 +380,18 @@ protected:
// Tensor 0: undef input vector.
merger.setLevelAndType(tid(0), lid(0), 0, LevelFormat::Undef);
// Tensor 1: dense input vector.
- merger.setLevelAndType(tid(1), lid(0), 0, LevelFormat::Dense);
+ merger.setLevelAndType(tid(1), lid(0), 0, GetParam());
// Tensor 2: undef input vector.
merger.setLevelAndType(tid(2), lid(0), 0, LevelFormat::Undef);
// Tensor 3: dense output vector.
- merger.setLevelAndType(tid(3), lid(0), 0, LevelFormat::Dense);
+ merger.setLevelAndType(tid(3), lid(0), 0, GetParam());
}
};
+INSTANTIATE_TEST_SUITE_P(Test4T1LU, MergerTest4T1LU,
+ ::testing::Values(LevelFormat::Dense,
+ LevelFormat::Batch));
+
///
/// Tests with operation on sparse output.
///
@@ -395,6 +412,11 @@ protected:
}
};
+// This testsuite does not use any dense-like format, just one of {Dense, Batch}
+// is enough.
+INSTANTIATE_TEST_SUITE_P(Test3T1LSo, MergerTest3T1LSo,
+ ::testing::Values(LevelFormat::Dense));
+
} // namespace
/// Vector multiplication (conjunction) of 3 vectors, i.e.;
@@ -409,7 +431,7 @@ protected:
/// lat( i_01_D / (tensor_0 * tensor_1 * tensor2) )
/// }
#define IMPL_MERGER_TEST_CONJ_CONJ_UNDEF(CONJ1, CONJ2) \
- TEST_F(MergerTest4T1LU, vector_##CONJ1##_##CONJ2) { \
+ TEST_P(MergerTest4T1LU, vector_##CONJ1##_##CONJ2) { \
const auto em = CONJ1##Expr(tensor(0), tensor(1)); \
const auto e = CONJ2##Expr(em, tensor(2)); \
const auto l0 = lid(0); \
@@ -443,7 +465,7 @@ FOREVERY_PAIR_OF_COMMON_CONJ_CONJ_BINOP(IMPL_MERGER_TEST_CONJ_CONJ_UNDEF)
/// lat( i_03_U / (tensor_0 * tensor_1 * output_tensor2) )
/// }
#define IMPL_MERGER_TEST_CONJ_CONJ_SPARSE_OUT(CONJ1, CONJ2) \
- TEST_F(MergerTest3T1LSo, vector_##CONJ1##_##CONJ2) { \
+ TEST_P(MergerTest3T1LSo, vector_##CONJ1##_##CONJ2) { \
const auto em = CONJ1##Expr(tensor(0), tensor(1)); \
const auto e = CONJ2##Expr(em, tensor(2)); \
const auto l0 = lid(0); \
@@ -482,7 +504,7 @@ FOREVERY_PAIR_OF_COMMON_CONJ_CONJ_BINOP(IMPL_MERGER_TEST_CONJ_CONJ_SPARSE_OUT)
/// lat( i_01 / tensor_1 )
/// }
#define IMPL_MERGER_TEST_DISJ(OP, UNUSED) \
- TEST_F(MergerTest3T1L, vector_##OP) { \
+ TEST_P(MergerTest3T1L, vector_##OP) { \
const auto e = OP##Expr(tensor(0), tensor(1)); \
const auto l0 = lid(0); \
const auto t0 = tid(0); \
@@ -514,7 +536,7 @@ FOREVERY_COMMON_DISJ_BINOP(IMPL_MERGER_TEST_DISJ)
/// lat( i_00 i_01 / (tensor_0 * tensor_1) )
/// }
#define IMPL_MERGER_TEST_CONJ(OP, UNUSED) \
- TEST_F(MergerTest3T1L, vector_##OP) { \
+ TEST_P(MergerTest3T1L, vector_##OP) { \
const auto e = OP##Expr(tensor(0), tensor(1)); \
const auto l0 = lid(0); \
const auto t0 = tid(0); \
@@ -544,7 +566,7 @@ FOREVERY_COMMON_CONJ_BINOP(IMPL_MERGER_TEST_CONJ)
/// lat( i_02 / tensor_2 )
/// }
#define IMPL_MERGER_TEST_CONJ_DISJ(CONJ, DISJ) \
- TEST_F(MergerTest4T1L, vector_##CONJ##_##DISJ) { \
+ TEST_P(MergerTest4T1L, vector_##CONJ##_##DISJ) { \
const auto em = CONJ##Expr(tensor(0), tensor(1)); \
const auto e = DISJ##Expr(em, tensor(2)); \
const auto l0 = lid(0); \
@@ -587,7 +609,7 @@ FOREVERY_PAIR_OF_COMMON_CONJ_DISJ_BINOP(IMPL_MERGER_TEST_CONJ_DISJ)
/// lat( i_00 / tensor_0 )
/// }
#define IMPL_MERGER_TEST_DISJ_DISJ(DISJ1, DISJ2) \
- TEST_F(MergerTest4T1L, Vector_##DISJ1##_##DISJ2) { \
+ TEST_P(MergerTest4T1L, Vector_##DISJ1##_##DISJ2) { \
const auto em = DISJ1##Expr(tensor(0), tensor(1)); \
const auto e = DISJ2##Expr(em, tensor(2)); \
const auto l0 = lid(0); \
@@ -636,7 +658,7 @@ FOREVERY_PAIR_OF_COMMON_DISJ_DISJ_BINOP(IMPL_MERGER_TEST_DISJ_DISJ)
/// lat( i_00 i_01 i_02 / tensor_0 * tensor_1 * tensor_2 )
/// }
#define IMPL_MERGER_TEST_CONJ_CONJ(CONJ1, CONJ2) \
- TEST_F(MergerTest4T1L, vector_##CONJ1##_##CONJ2) { \
+ TEST_P(MergerTest4T1L, vector_##CONJ1##_##CONJ2) { \
const auto em = CONJ1##Expr(tensor(0), tensor(1)); \
const auto e = CONJ2##Expr(em, tensor(2)); \
const auto l0 = lid(0); \
@@ -675,7 +697,7 @@ FOREVERY_PAIR_OF_COMMON_CONJ_CONJ_BINOP(IMPL_MERGER_TEST_CONJ_CONJ)
/// lat( i_00 / sparse_tensor_0 ) should be opted out as it only has dense diff
/// with lat( i_00 i_01 / (sparse_tensor_0 + dense_tensor_1) ).
#define IMPL_MERGER_TEST_OPTIMIZED_DISJ(OP, UNUSED) \
- TEST_F(MergerTest3T1LD, vector_opted_##OP) { \
+ TEST_P(MergerTest3T1LD, vector_opted_##OP) { \
const auto e = OP##Expr(tensor(0), tensor(1)); \
const auto l0 = lid(0); \
const auto t0 = tid(0); \
@@ -711,7 +733,7 @@ FOREVERY_COMMON_DISJ_BINOP(IMPL_MERGER_TEST_OPTIMIZED_DISJ)
/// }
/// since i_01 is a dense dimension.
#define IMPL_MERGER_TEST_OPTIMIZED_CONJ(OP, UNUSED) \
- TEST_F(MergerTest3T1LD, vector_opted_##OP) { \
+ TEST_P(MergerTest3T1LD, vector_opted_##OP) { \
const auto e = OP##Expr(tensor(0), tensor(1)); \
const auto l0 = lid(0); \
const auto t0 = tid(0); \
@@ -746,7 +768,7 @@ FOREVERY_COMMON_CONJ_BINOP(IMPL_MERGER_TEST_OPTIMIZED_CONJ)
/// lat( i_00 / tensor_0 cmp 0 )
/// lat( i_01 / 0 cmp tensor_1 )
/// }
-TEST_F(MergerTest3T1L, vector_cmp) {
+TEST_P(MergerTest3T1L, vector_cmp) {
const auto e = cmpiExpr(tensor(0), tensor(1));
const auto l0 = lid(0);
const auto t0 = tid(0);
@@ -784,7 +806,7 @@ TEST_F(MergerTest3T1L, vector_cmp) {
///
/// lat( i_00 / sparse_tensor_0 ) should be opted out as it only has dense diff
/// with lat( i_00 i_01 / (sparse_tensor_0 cmp dense_tensor_1) ).
-TEST_F(MergerTest3T1LD, vector_cmp) {
+TEST_P(MergerTest3T1LD, vector_cmp) {
const auto e = cmpiExpr(tensor(0), tensor(1));
const auto l0 = lid(0);
const auto t0 = tid(0);
diff --git a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
index 26bfbd5c11e8..cea49356538f 100644
--- a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
+++ b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
+#include "mlir/Config/mlir-config.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/IR/MLIRContext.h"
@@ -29,10 +30,10 @@
using namespace mlir;
// Skip the test if the NVPTX target was not built.
-#if MLIR_CUDA_CONVERSIONS_ENABLED == 0
-#define SKIP_WITHOUT_NVPTX(x) DISABLED_##x
-#else
+#if MLIR_ENABLE_CUDA_CONVERSIONS
#define SKIP_WITHOUT_NVPTX(x) x
+#else
+#define SKIP_WITHOUT_NVPTX(x) DISABLED_##x
#endif
class MLIRTargetLLVMNVVM : public ::testing::Test {
diff --git a/openmp/libompd/src/CMakeLists.txt b/openmp/libompd/src/CMakeLists.txt
index 0402a0177201..ba228d518104 100644
--- a/openmp/libompd/src/CMakeLists.txt
+++ b/openmp/libompd/src/CMakeLists.txt
@@ -13,7 +13,8 @@ cmake_minimum_required(VERSION 3.20.0)
add_library (ompd SHARED TargetValue.cpp omp-debug.cpp omp-state.cpp omp-icv.cpp)
-target_link_libraries(ompd omp) # ensure generated import library is created first
+# libompd must not link against libomp, there is no code dependency.
+add_dependencies(ompd omp) # ensure generated import library is created first
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
diff --git a/openmp/libompd/src/omp-icv.cpp b/openmp/libompd/src/omp-icv.cpp
index 4a2c2b6e8bc6..0288e963a697 100644
--- a/openmp/libompd/src/omp-icv.cpp
+++ b/openmp/libompd/src/omp-icv.cpp
@@ -18,7 +18,9 @@
#include "omp.h"
#include "ompd-private.h"
#include "TargetValue.h"
+#define OMPD_SKIP_HWLOC 1
#include "kmp.h"
+#undef OMPD_SKIP_HWLOC
#include <cstring>
/* The ICVs ompd-final-var and ompd-implicit-var below are for backward
diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp
index 40f99e07b44c..a1e4fa2449d9 100644
--- a/openmp/libomptarget/DeviceRTL/src/State.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -426,6 +426,8 @@ int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); }
int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
int omp_get_initial_device(void) { return -1; }
+
+int omp_is_initial_device(void) { return 0; }
}
extern "C" {
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 259c57b5afbc..121e7e959129 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -103,7 +103,8 @@ class kmp_stats_list;
#define KMP_USE_HIER_SCHED KMP_AFFINITY_SUPPORTED
#endif
-#if KMP_USE_HWLOC && KMP_AFFINITY_SUPPORTED
+// OMPD_SKIP_HWLOC used in libompd/omp-icv.cpp to avoid OMPD depending on hwloc
+#if KMP_USE_HWLOC && KMP_AFFINITY_SUPPORTED && !defined(OMPD_SKIP_HWLOC)
#include "hwloc.h"
#ifndef HWLOC_OBJ_NUMANODE
#define HWLOC_OBJ_NUMANODE HWLOC_OBJ_NODE
@@ -689,7 +690,7 @@ typedef BOOL (*kmp_SetThreadGroupAffinity_t)(HANDLE, const GROUP_AFFINITY *,
extern kmp_SetThreadGroupAffinity_t __kmp_SetThreadGroupAffinity;
#endif /* KMP_OS_WINDOWS */
-#if KMP_USE_HWLOC
+#if KMP_USE_HWLOC && !defined(OMPD_SKIP_HWLOC)
extern hwloc_topology_t __kmp_hwloc_topology;
extern int __kmp_hwloc_error;
#endif
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
index fc5e8405a415..7edb0b440acc 100644
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -5708,9 +5708,8 @@ void __kmp_free_team(kmp_root_t *root,
}
#endif
// first check if thread is sleeping
- kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
- if (fl.is_sleeping())
- fl.resume(__kmp_gtid_from_thread(th));
+ if (th->th.th_sleep_loc)
+ __kmp_null_resume_wrapper(th);
KMP_CPU_PAUSE();
}
}
diff --git a/openmp/runtime/test/barrier/llvm-issue-80664.c b/openmp/runtime/test/barrier/llvm-issue-80664.c
new file mode 100644
index 000000000000..79aa228afa6b
--- /dev/null
+++ b/openmp/runtime/test/barrier/llvm-issue-80664.c
@@ -0,0 +1,37 @@
+// RUN: %libomp-compile
+// RUN: env OMP_WAIT_POLICY=passive \
+// RUN: KMP_FORKJOIN_BARRIER_PATTERN='linear,linear' %libomp-run
+// RUN: env OMP_WAIT_POLICY=passive \
+// RUN: KMP_FORKJOIN_BARRIER_PATTERN='tree,tree' %libomp-run
+// RUN: env OMP_WAIT_POLICY=passive \
+// RUN: KMP_FORKJOIN_BARRIER_PATTERN='hyper,hyper' %libomp-run
+// RUN: env OMP_WAIT_POLICY=passive \
+// RUN: KMP_FORKJOIN_BARRIER_PATTERN='dist,dist' %libomp-run
+//
+// LLVM ISSUE 80664: https://github.com/llvm/llvm-project/issues/80664
+//
+// Distributed barrier + OMP_WAIT_POLICY=passive hangs in library termination
+// Reason: the resume logic in __kmp_free_team() was faulty and, when checking
+// for sleep status, didn't look at correct location for distributed barrier.
+
+#include <stdio.h>
+#include <stdlib.h>
+
+int a = 0;
+
+void test_omp_barrier() {
+#pragma omp parallel
+ {
+#pragma omp task
+ {
+#pragma omp atomic
+ a++;
+ }
+ }
+}
+
+int main() {
+ test_omp_barrier();
+ printf("a = %d\n", a);
+ return EXIT_SUCCESS;
+}
diff --git a/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp b/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp
index 776aee9d8e2c..3f2ceef0c4ad 100644
--- a/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp
+++ b/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp
@@ -1,4 +1,7 @@
// RUN: %libomp-cxx-compile-and-run
+//
+// AIX runs out of resource in 32-bit with 4*omp_get_max_threads() threads.
+// XFAIL: aix && ppc
#include <omp.h>
diff --git a/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp b/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp
index b551318a72f3..f7405d00255c 100644
--- a/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp
+++ b/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp
@@ -1,4 +1,7 @@
// RUN: %libomp-cxx-compile-and-run
+//
+// AIX runs out of resource in 32-bit with 4*omp_get_max_threads() threads.
+// XFAIL: aix && ppc
#include <omp.h>
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 09c53c9e8a13..a1a5b7fe9bf4 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -2982,20 +2982,21 @@ libc_function(
################################ stdio targets #################################
libc_support_library(
- name = "printf_core_structs",
- hdrs = ["src/stdio/printf_core/core_structs.h"],
+ name = "printf_config",
+ hdrs = ["src/stdio/printf_core/printf_config.h"],
defines = PRINTF_COPTS,
deps = [
- ":__support_cpp_string_view",
- ":__support_fputil_fp_bits",
],
)
libc_support_library(
- name = "printf_config",
- hdrs = ["src/stdio/printf_core/printf_config.h"],
+ name = "printf_core_structs",
+ hdrs = ["src/stdio/printf_core/core_structs.h"],
defines = PRINTF_COPTS,
deps = [
+ ":__support_cpp_string_view",
+ ":__support_fputil_fp_bits",
+ ":printf_config",
],
)
@@ -3081,6 +3082,7 @@ libc_support_library(
":__support_libc_assert",
":__support_uint",
":__support_uint128",
+ ":printf_config",
":printf_core_structs",
":printf_writer",
],
diff --git a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl
index 17eb30c8e458..7d815bc4a229 100644
--- a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl
+++ b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl
@@ -17,6 +17,7 @@ def libc_common_copts():
libc_include_path = paths.join(root_label.workspace_root, root_label.package)
return [
"-I" + libc_include_path,
+ "-I" + paths.join(libc_include_path, "include"),
"-DLIBC_NAMESPACE=" + LIBC_NAMESPACE,
]
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 59ee03d9a321..7860ccd0406a 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -6,7 +6,6 @@
# The MLIR "Multi-Level Intermediate Representation" Compiler Infrastructure
load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
-load("@bazel_skylib//rules:write_file.bzl", "write_file")
load(
":build_defs.bzl",
"cc_headers_only",
@@ -36,7 +35,10 @@ expand_template(
"#cmakedefine01 MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS": "#define MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS 0",
"#cmakedefine MLIR_GREEDY_REWRITE_RANDOMIZER_SEED ${MLIR_GREEDY_REWRITE_RANDOMIZER_SEED}": "/* #undef MLIR_GREEDY_REWRITE_RANDOMIZER_SEED */",
"#cmakedefine01 MLIR_ENABLE_PDL_IN_PATTERNMATCH": "#define MLIR_ENABLE_PDL_IN_PATTERNMATCH 1",
- },
+ } | if_cuda_available(
+ {"#cmakedefine01 MLIR_ENABLE_CUDA_CONVERSIONS": "#define MLIR_ENABLE_CUDA_CONVERSIONS 1"},
+ {"#cmakedefine01 MLIR_ENABLE_CUDA_CONVERSIONS": "#define MLIR_ENABLE_CUDA_CONVERSIONS 0"},
+ ),
template = "include/mlir/Config/mlir-config.h.cmake",
)
@@ -4109,6 +4111,7 @@ cc_library(
includes = ["include"],
deps = [
":AffineDialect",
+ ":AffineTransforms",
":AffineUtils",
":ArithDialect",
":ConversionPassIncGen",
@@ -5468,7 +5471,6 @@ cc_library(
srcs = ["lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp"],
hdrs = ["include/mlir/Dialect/GPU/Pipelines/Passes.h"],
includes = ["include"],
- local_defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
deps = [
":AffineToStandard",
":ArithToLLVM",
@@ -5492,6 +5494,7 @@ cc_library(
":Transforms",
":VectorToLLVM",
":VectorToSCF",
+ ":config",
],
)
@@ -5541,6 +5544,7 @@ cc_library(
":Transforms",
":VCIXToLLVMIRTranslation",
":VectorDialect",
+ ":config",
"//llvm:Core",
"//llvm:MC",
"//llvm:Support",
@@ -6176,6 +6180,7 @@ cc_library(
":NVVMToLLVMIRTranslation",
":TargetLLVM",
":ToLLVMIRTranslation",
+ ":config",
"//llvm:NVPTXCodeGen",
"//llvm:Support",
],
@@ -9131,6 +9136,7 @@ cc_library(
":VectorTransforms",
":X86VectorDialect",
":X86VectorTransforms",
+ ":config",
],
)
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index 68d9b23fd564..583411aa60e5 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -552,7 +552,6 @@ cc_library(
cc_library(
name = "TestTransforms",
srcs = glob(["lib/Transforms/*.cpp"]),
- defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
includes = ["lib/Dialect/Test"],
deps = [
":TestDialect",
@@ -579,7 +578,6 @@ cc_library(
cc_library(
name = "TestFuncToLLVM",
srcs = glob(["lib/Conversion/FuncToLLVM/*.cpp"]),
- defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
includes = ["lib/Dialect/Test"],
deps = [
":TestDialect",
@@ -594,7 +592,6 @@ cc_library(
cc_library(
name = "TestOneToNTypeConversion",
srcs = glob(["lib/Conversion/OneToNTypeConversion/*.cpp"]),
- defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
includes = ["lib/Dialect/Test"],
deps = [
":TestDialect",
@@ -653,7 +650,6 @@ cc_library(
cc_library(
name = "TestDLTI",
srcs = glob(["lib/Dialect/DLTI/*.cpp"]),
- defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
includes = ["lib/Dialect/Test"],
deps = [
":TestDialect",
@@ -667,7 +663,7 @@ cc_library(
cc_library(
name = "TestGPU",
srcs = glob(["lib/Dialect/GPU/*.cpp"]),
- defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"] + if_cuda_available([
+ defines = if_cuda_available([
"MLIR_GPU_TO_CUBIN_PASS_ENABLE",
]),
includes = ["lib/Dialect/Test"],
@@ -714,7 +710,6 @@ cc_library(
cc_library(
name = "TestLinalg",
srcs = glob(["lib/Dialect/Linalg/*.cpp"]),
- defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
includes = ["lib/Dialect/Test"],
deps = [
"//llvm:Support",
@@ -748,7 +743,6 @@ cc_library(
cc_library(
name = "TestLLVM",
srcs = glob(["lib/Dialect/LLVM/*.cpp"]),
- defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
includes = ["lib/Dialect/Test"],
deps = [
"//mlir:AffineToStandard",
@@ -773,7 +767,6 @@ cc_library(
cc_library(
name = "TestMath",
srcs = glob(["lib/Dialect/Math/*.cpp"]),
- defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
includes = ["lib/Dialect/Test"],
deps = [
"//mlir:ArithDialect",
@@ -790,7 +783,6 @@ cc_library(
cc_library(
name = "TestMathToVCIX",
srcs = glob(["lib/Conversion/MathToVCIX/*.cpp"]),
- defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
includes = ["lib/Dialect/Test"],
deps = [
"//mlir:ArithDialect",
@@ -807,7 +799,6 @@ cc_library(
cc_library(
name = "TestMemRef",
srcs = glob(["lib/Dialect/MemRef/*.cpp"]),
- defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
includes = ["lib/Dialect/Test"],
deps = [
":TestDialect",
@@ -847,7 +838,6 @@ cc_library(
cc_library(
name = "TestNVGPU",
srcs = glob(["lib/Dialect/NVGPU/*.cpp"]),
- defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
includes = ["lib/Dialect/Test"],
deps = [
"//mlir:AffineDialect",
@@ -871,7 +861,6 @@ cc_library(
cc_library(
name = "TestSCF",
srcs = glob(["lib/Dialect/SCF/*.cpp"]),
- defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
includes = ["lib/Dialect/Test"],
deps = [
"//llvm:Support",
@@ -891,7 +880,6 @@ cc_library(
cc_library(
name = "TestArith",
srcs = glob(["lib/Dialect/Arith/*.cpp"]),
- defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
includes = ["lib/Dialect/Test"],
deps = [
"//mlir:ArithDialect",
@@ -908,7 +896,6 @@ cc_library(
cc_library(
name = "TestArmSME",
srcs = glob(["lib/Dialect/ArmSME/*.cpp"]),
- defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
includes = ["lib/Dialect/Test"],
deps = [
"//mlir:ArithToArmSME",
@@ -927,7 +914,6 @@ cc_library(
cc_library(
name = "TestBufferization",
srcs = glob(["lib/Dialect/Bufferization/*.cpp"]),
- defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
includes = ["lib/Dialect/Test"],
deps = [
"//mlir:BufferizationDialect",
@@ -989,7 +975,6 @@ cc_library(
cc_library(
name = "TestFunc",
srcs = glob(["lib/Dialect/Func/*.cpp"]),
- defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
includes = ["lib/Dialect/Test"],
deps = [
":TestDialect",
@@ -1005,7 +990,6 @@ cc_library(
cc_library(
name = "TestTensor",
srcs = glob(["lib/Dialect/Tensor/*.cpp"]),
- defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
includes = ["lib/Dialect/Test"],
deps = [
"//mlir:ArithDialect",
@@ -1022,7 +1006,6 @@ cc_library(
cc_library(
name = "TestVector",
srcs = glob(["lib/Dialect/Vector/*.cpp"]),
- defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
includes = ["lib/Dialect/Test"],
deps = [
"//mlir:AffineDialect",