author    Amir Ayupov <aaupov@fb.com>    2024-04-29 16:27:24 -0700
committer Amir Ayupov <aaupov@fb.com>    2024-04-29 16:27:24 -0700
commit    423bc644a2ebb0408d389c5818aee52b11fefe9c (patch)
tree      4b44b1acfda8a2189336f9277dac00dcc975fe29
parent    b7457cc79e1dcd44cfe1448b0a5f0abe3c66f398 (diff)
parent    c4c4e17c99f8d4f79bb9e1e3819d1d76e5e09ed1 (diff)
Created using spr 1.3.4 [skip ci]
-rw-r--r-- bolt/docs/BAT.md | 9
-rw-r--r-- bolt/include/bolt/Profile/BoltAddressTranslation.h | 6
-rw-r--r-- bolt/include/bolt/Rewrite/RewriteInstance.h | 4
-rw-r--r-- bolt/lib/Profile/BoltAddressTranslation.cpp | 86
-rw-r--r-- bolt/lib/Rewrite/RewriteInstance.cpp | 103
-rw-r--r-- bolt/test/X86/bolt-address-translation.test | 2
-rw-r--r-- bolt/test/X86/cdsplit-symbol-names.s | 15
-rw-r--r-- bolt/test/X86/register-fragments-bolt-symbols.s | 32
-rw-r--r-- clang/docs/LanguageExtensions.rst | 10
-rw-r--r-- clang/docs/LibTooling.rst | 22
-rw-r--r-- clang/docs/ReleaseNotes.rst | 22
-rw-r--r-- clang/docs/UsersManual.rst | 86
-rw-r--r-- clang/include/clang/AST/ASTNodeTraverser.h | 6
-rw-r--r-- clang/include/clang/AST/NestedNameSpecifier.h | 2
-rw-r--r-- clang/include/clang/AST/Type.h | 8
-rw-r--r-- clang/include/clang/Basic/DiagnosticSemaKinds.td | 9
-rw-r--r-- clang/include/clang/Basic/arm_neon.td | 2
-rw-r--r-- clang/include/clang/Driver/Options.td | 17
-rw-r--r-- clang/include/clang/Frontend/FrontendOptions.h | 9
-rw-r--r-- clang/include/clang/Sema/DeclSpec.h | 8
-rw-r--r-- clang/lib/AST/DeclBase.cpp | 2
-rw-r--r-- clang/lib/AST/ExprConstant.cpp | 134
-rw-r--r-- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 359
-rw-r--r-- clang/lib/AST/Interp/ByteCodeExprGen.h | 4
-rw-r--r-- clang/lib/AST/Interp/ByteCodeStmtGen.cpp | 3
-rw-r--r-- clang/lib/AST/Interp/Disasm.cpp | 2
-rw-r--r-- clang/lib/AST/Interp/Program.cpp | 3
-rw-r--r-- clang/lib/AST/Type.cpp | 12
-rw-r--r-- clang/lib/Basic/Targets/WebAssembly.cpp | 20
-rw-r--r-- clang/lib/CodeGen/CGBuiltin.cpp | 10
-rw-r--r-- clang/lib/CodeGen/CGCall.cpp | 13
-rw-r--r-- clang/lib/CodeGen/CGCleanup.cpp | 76
-rw-r--r-- clang/lib/CodeGen/CGCleanup.h | 57
-rw-r--r-- clang/lib/CodeGen/CGDecl.cpp | 87
-rw-r--r-- clang/lib/CodeGen/CGExpr.cpp | 12
-rw-r--r-- clang/lib/CodeGen/CGExprAgg.cpp | 87
-rw-r--r-- clang/lib/CodeGen/CGExprCXX.cpp | 38
-rw-r--r-- clang/lib/CodeGen/CGExprScalar.cpp | 2
-rw-r--r-- clang/lib/CodeGen/CGLoopInfo.cpp | 2
-rw-r--r-- clang/lib/CodeGen/CodeGenFunction.cpp | 6
-rw-r--r-- clang/lib/CodeGen/CodeGenFunction.h | 99
-rw-r--r-- clang/lib/CodeGen/CodeGenModule.cpp | 15
-rw-r--r-- clang/lib/CodeGen/MicrosoftCXXABI.cpp | 12
-rw-r--r-- clang/lib/Driver/Driver.cpp | 3
-rw-r--r-- clang/lib/Driver/ToolChain.cpp | 13
-rw-r--r-- clang/lib/Driver/ToolChains/Clang.cpp | 20
-rw-r--r-- clang/lib/Driver/ToolChains/CommonArgs.cpp | 110
-rw-r--r-- clang/lib/Driver/ToolChains/Flang.cpp | 9
-rw-r--r-- clang/lib/Frontend/CompilerInvocation.cpp | 4
-rw-r--r-- clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp | 2
-rw-r--r-- clang/lib/Lex/Pragma.cpp | 3
-rw-r--r-- clang/lib/Sema/SemaChecking.cpp | 34
-rw-r--r-- clang/lib/Sema/SemaDeclCXX.cpp | 16
-rw-r--r-- clang/lib/Sema/SemaInit.cpp | 15
-rw-r--r-- clang/lib/Sema/SemaStmtAttr.cpp | 18
-rw-r--r-- clang/lib/Sema/SemaTemplateInstantiate.cpp | 18
-rw-r--r-- clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp | 7
-rw-r--r-- clang/test/AST/Interp/c.c | 7
-rw-r--r-- clang/test/AST/Interp/functions.cpp | 16
-rw-r--r-- clang/test/AST/Interp/opencl.cl | 32
-rw-r--r-- clang/test/AST/Interp/records.cpp | 35
-rw-r--r-- clang/test/AST/ast-dump-pragma-unroll.cpp | 31
-rw-r--r-- clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp | 88
-rw-r--r-- clang/test/AST/ast-dump-templates.cpp | 14
-rw-r--r-- clang/test/Analysis/Checkers/WebKit/call-args-regression-traverse-decl-crash.cpp | 7
-rw-r--r-- clang/test/CXX/drs/cwg2149.cpp | 2
-rw-r--r-- clang/test/CXX/drs/cwg650.cpp | 2
-rw-r--r-- clang/test/CXX/drs/dr20xx.cpp | 2
-rw-r--r-- clang/test/CXX/drs/dr21xx.cpp | 2
-rw-r--r-- clang/test/CXX/drs/dr24xx.cpp | 4
-rw-r--r-- clang/test/CXX/drs/dr25xx.cpp | 4
-rw-r--r-- clang/test/CXX/drs/dr28xx.cpp | 12
-rw-r--r-- clang/test/CXX/expr/expr.const/p5-26.cpp | 7
-rw-r--r-- clang/test/CXX/stmt.stmt/stmt.return/p6.cpp | 25
-rw-r--r-- clang/test/CodeGen/arm-bf16-convert-intrinsics.c | 9
-rw-r--r-- clang/test/CodeGen/arm64-microsoft-arguments.cpp | 15
-rw-r--r-- clang/test/CodeGen/builtins-reduction-math.c | 53
-rw-r--r-- clang/test/CodeGen/regparm-flag.c | 18
-rw-r--r-- clang/test/CodeGenCXX/blocks.cpp | 4
-rw-r--r-- clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp | 522
-rw-r--r-- clang/test/CodeGenCXX/pragma-gcc-unroll.cpp | 30
-rw-r--r-- clang/test/CodeGenCXX/pragma-unroll.cpp | 52
-rw-r--r-- clang/test/CodeGenCoroutines/coro-elide-thinlto.cpp | 77
-rw-r--r-- clang/test/CodeGenCoroutines/coro-suspend-cleanups.cpp | 93
-rw-r--r-- clang/test/CodeGenObjC/arc-blocks-exceptions.m | 15
-rw-r--r-- clang/test/CodeGenObjC/arc-blocks.m | 4
-rw-r--r-- clang/test/Driver/fp-model.c | 4
-rw-r--r-- clang/test/Driver/linux-ld.c | 3
-rw-r--r-- clang/test/Driver/riscv-arch.c | 6
-rw-r--r-- clang/test/Driver/solaris-ld.c | 3
-rw-r--r-- clang/test/Parser/pragma-unroll.cpp | 29
-rw-r--r-- clang/test/Preprocessor/wasm-target-features.c | 6
-rw-r--r-- clang/test/Sema/constant_builtins_vector.cpp | 725
-rw-r--r-- clang/test/Sema/convertvector.c | 3
-rw-r--r-- clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp | 11
-rw-r--r-- clang/test/SemaCXX/type-traits.cpp | 28
-rw-r--r-- clang/test/SemaObjC/format-strings-oslog.m | 5
-rw-r--r-- clang/test/SemaOpenCL/vec_step.cl | 2
-rw-r--r-- clang/test/Unit/lit.cfg.py | 10
-rwxr-xr-x clang/utils/creduce-clang-crash.py | 15
-rwxr-xr-x clang/www/cxx_dr_status.html | 563
-rwxr-xr-x clang/www/cxx_status.html | 6
-rwxr-xr-x clang/www/make_cxx_dr_status | 8
-rw-r--r-- compiler-rt/lib/scudo/standalone/allocator_config.def | 2
-rw-r--r-- compiler-rt/lib/scudo/standalone/combined.h | 3
-rw-r--r-- compiler-rt/lib/scudo/standalone/flags.inc | 2
-rw-r--r-- compiler-rt/lib/scudo/standalone/primary32.h | 4
-rw-r--r-- compiler-rt/lib/scudo/standalone/primary64.h | 3
-rw-r--r-- compiler-rt/lib/scudo/standalone/secondary.h | 3
-rw-r--r-- flang/cmake/modules/AddFlangOffloadRuntime.cmake | 9
-rw-r--r-- flang/docs/FlangDriver.md | 32
-rwxr-xr-x flang/include/flang/Optimizer/Builder/Runtime/EnvironmentDefaults.h | 3
-rw-r--r-- flang/include/flang/Optimizer/Builder/Runtime/Main.h | 28
-rw-r--r-- flang/include/flang/Tools/CLOptions.inc | 2
-rw-r--r-- flang/lib/Lower/Bridge.cpp | 7
-rw-r--r-- flang/lib/Lower/ConvertVariable.cpp | 4
-rw-r--r-- flang/lib/Optimizer/Builder/CMakeLists.txt | 1
-rwxr-xr-x flang/lib/Optimizer/Builder/Runtime/EnvironmentDefaults.cpp | 7
-rw-r--r-- flang/lib/Optimizer/Builder/Runtime/Main.cpp | 62
-rw-r--r-- flang/lib/Optimizer/CodeGen/CodeGen.cpp | 8
-rw-r--r-- flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp | 3
-rw-r--r-- flang/lib/Optimizer/Dialect/FIROps.cpp | 8
-rw-r--r-- flang/lib/Optimizer/Dialect/FIRType.cpp | 20
-rw-r--r-- flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp | 6
-rw-r--r-- flang/runtime/CMakeLists.txt | 2
-rw-r--r-- flang/runtime/FortranMain/CMakeLists.txt | 23
-rw-r--r-- flang/runtime/FortranMain/Fortran_main.c | 23
-rw-r--r-- flang/runtime/environment.cpp | 2
-rw-r--r-- flang/runtime/file.cpp | 12
-rw-r--r-- flang/runtime/namelist.cpp | 2
-rw-r--r-- flang/runtime/unit.cpp | 2
-rw-r--r-- flang/runtime/utf.cpp | 2
-rw-r--r-- flang/test/CMakeLists.txt | 1
-rw-r--r-- flang/test/Driver/bbc-mlir-pass-pipeline.f90 | 8
-rw-r--r-- flang/test/Driver/driver-help-hidden.f90 | 172
-rw-r--r-- flang/test/Driver/dynamic-linker.f90 | 2
-rw-r--r-- flang/test/Driver/emit-mlir.f90 | 10
-rw-r--r-- flang/test/Driver/linker-flags.f90 | 8
-rw-r--r-- flang/test/Driver/mlir-debug-pass-pipeline.f90 | 12
-rw-r--r-- flang/test/Driver/mlir-pass-pipeline.f90 | 12
-rw-r--r-- flang/test/Driver/msvc-dependent-lib-flags.f90 | 4
-rw-r--r-- flang/test/Driver/no-duplicate-main.f90 | 2
-rw-r--r-- flang/test/Fir/basic-program.fir | 14
-rw-r--r-- flang/test/Lower/OpenMP/cfg-conversion-omp.private.f90 | 54
-rw-r--r-- flang/test/Lower/OpenMP/threadprivate-real-logical-complex-derivedtype.f90 | 1
-rw-r--r-- flang/tools/flang-driver/CMakeLists.txt | 1
-rw-r--r-- libcxx/include/__mutex/unique_lock.h | 20
-rw-r--r-- libcxx/include/__string/constexpr_c_functions.h | 2
-rw-r--r-- libcxx/include/__type_traits/datasizeof.h | 47
-rw-r--r-- libcxx/include/mutex | 16
-rw-r--r-- libcxx/test/libcxx/diagnostics/mutex.nodiscard.verify.cpp | 54
-rw-r--r-- libcxx/test/libcxx/thread/thread.lock/thread.lock.guard/nodiscard.verify.cpp | 30
-rw-r--r-- libcxx/test/libcxx/type_traits/datasizeof.compile.pass.cpp | 14
-rw-r--r-- libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp | 22
-rw-r--r-- libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp | 18
-rw-r--r-- libcxx/test/std/containers/sequences/array/size_and_alignment.compile.pass.cpp | 2
-rw-r--r-- libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.indirect.pass.cpp | 5
-rw-r--r-- libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.pass.cpp | 5
-rw-r--r-- libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.replace.indirect.pass.cpp | 5
-rw-r--r-- libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.replace.pass.cpp | 5
-rw-r--r-- libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.indirect.pass.cpp | 5
-rw-r--r-- libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.pass.cpp | 5
-rw-r--r-- lld/COFF/MinGW.cpp | 1
-rw-r--r-- lld/tools/lld/CMakeLists.txt | 4
-rw-r--r-- lldb/docs/resources/lldbgdbremote.md | 529
-rw-r--r-- lldb/include/lldb/API/SBExpressionOptions.h | 5
-rw-r--r-- lldb/include/lldb/Expression/Expression.h | 7
-rw-r--r-- lldb/include/lldb/Expression/LLVMUserExpression.h | 2
-rw-r--r-- lldb/include/lldb/Expression/UserExpression.h | 33
-rw-r--r-- lldb/include/lldb/Symbol/TypeSystem.h | 10
-rw-r--r-- lldb/include/lldb/Target/StackFrame.h | 12
-rw-r--r-- lldb/include/lldb/Target/Target.h | 19
-rw-r--r-- lldb/include/lldb/lldb-private-types.h | 19
-rw-r--r-- lldb/packages/Python/lldbsuite/test/configuration.py | 1
-rw-r--r-- lldb/packages/Python/lldbsuite/test/decorators.py | 9
-rw-r--r-- lldb/packages/Python/lldbsuite/test/dotest.py | 3
-rw-r--r-- lldb/packages/Python/lldbsuite/test/dotest_args.py | 8
-rw-r--r-- lldb/packages/Python/lldbsuite/test/lldbtest.py | 25
-rw-r--r-- lldb/source/API/CMakeLists.txt | 7
-rw-r--r-- lldb/source/API/SBExpressionOptions.cpp | 7
-rw-r--r-- lldb/source/API/SBFrame.cpp | 30
-rw-r--r-- lldb/source/Breakpoint/Watchpoint.cpp | 5
-rw-r--r-- lldb/source/Commands/CommandObjectDWIMPrint.cpp | 4
-rw-r--r-- lldb/source/Commands/CommandObjectType.cpp | 2
-rw-r--r-- lldb/source/Expression/LLVMUserExpression.cpp | 2
-rw-r--r-- lldb/source/Expression/UserExpression.cpp | 14
-rw-r--r-- lldb/source/Expression/UtilityFunction.cpp | 4
-rw-r--r-- lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp | 12
-rw-r--r-- lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp | 19
-rw-r--r-- lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.h | 6
-rw-r--r-- lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp | 2
-rw-r--r-- lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h | 12
-rw-r--r-- lldb/source/Target/Language.cpp | 34
-rw-r--r-- lldb/source/Target/StackFrame.cpp | 23
-rw-r--r-- lldb/source/Target/Target.cpp | 21
-rw-r--r-- lldb/test/API/driver/batch_mode/TestBatchMode.py | 4
-rw-r--r-- lldb/test/API/driver/job_control/TestJobControl.py | 1
-rw-r--r-- lldb/test/API/driver/quit_speed/TestQuitWithProcess.py | 2
-rw-r--r-- lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py | 16
-rw-r--r-- lldb/test/API/iohandler/sigint/TestProcessIOHandlerInterrupt.py | 1
-rw-r--r-- lldb/test/API/lit.cfg.py | 3
-rw-r--r-- lldb/test/API/lit.site.cfg.py.in | 2
-rw-r--r-- lldb/test/API/macosx/nslog/TestDarwinNSLogOutput.py | 8
-rw-r--r-- lldb/test/API/terminal/TestSTTYBeforeAndAfter.py | 1
-rw-r--r-- lldb/utils/TableGen/CMakeLists.txt | 1
-rw-r--r-- lldb/utils/TableGen/LLDBSBAPIDWARFEnum.cpp | 67
-rw-r--r-- lldb/utils/TableGen/LLDBTableGen.cpp | 9
-rw-r--r-- lldb/utils/TableGen/LLDBTableGenBackends.h | 1
-rw-r--r-- llvm/cmake/modules/llvm-driver-template.cpp.in | 2
-rw-r--r-- llvm/docs/LangRef.rst | 53
-rw-r--r-- llvm/docs/ReleaseNotes.rst | 6
-rw-r--r-- llvm/include/llvm/Analysis/ValueTracking.h | 3
-rw-r--r-- llvm/include/llvm/BinaryFormat/ELF.h | 9
-rw-r--r-- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 4
-rw-r--r-- llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h | 4
-rw-r--r-- llvm/include/llvm/CodeGen/TargetLowering.h | 7
-rw-r--r-- llvm/include/llvm/IR/Intrinsics.td | 32
-rw-r--r-- llvm/include/llvm/IR/PatternMatch.h | 2
-rw-r--r-- llvm/include/llvm/Support/YAMLTraits.h | 21
-rw-r--r-- llvm/include/llvm/Transforms/InstCombine/InstCombiner.h | 7
-rw-r--r-- llvm/include/llvm/Transforms/Utils/BuildLibCalls.h | 7
-rw-r--r-- llvm/include/llvm/Transforms/Utils/GlobalStatus.h | 6
-rw-r--r-- llvm/lib/Analysis/InstructionSimplify.cpp | 6
-rw-r--r-- llvm/lib/Analysis/ValueTracking.cpp | 8
-rw-r--r-- llvm/lib/BinaryFormat/ELF.cpp | 15
-rw-r--r-- llvm/lib/CodeGen/CodeGenPrepare.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp | 30
-rw-r--r-- llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp | 14
-rw-r--r-- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 10
-rw-r--r-- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 8
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 6
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 7
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 16
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 8
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 58
-rw-r--r-- llvm/lib/IR/AutoUpgrade.cpp | 19
-rw-r--r-- llvm/lib/IR/IRBuilder.cpp | 6
-rw-r--r-- llvm/lib/IR/Instructions.cpp | 2
-rw-r--r-- llvm/lib/IR/Verifier.cpp | 2
-rw-r--r-- llvm/lib/MCA/InstrBuilder.cpp | 3
-rw-r--r-- llvm/lib/Passes/PassBuilderPipelines.cpp | 11
-rw-r--r-- llvm/lib/ProfileData/InstrProfWriter.cpp | 4
-rw-r--r-- llvm/lib/Support/YAMLTraits.cpp | 80
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 4
-rw-r--r-- llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 11
-rw-r--r-- llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 28
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 40
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 2
-rw-r--r-- llvm/lib/Target/ARM/ARMISelLowering.cpp | 14
-rw-r--r-- llvm/lib/Target/BPF/BTFDebug.cpp | 18
-rw-r--r-- llvm/lib/Target/BPF/BTFDebug.h | 4
-rw-r--r-- llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp | 42
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 65
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 4
-rw-r--r-- llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h | 21
-rw-r--r-- llvm/lib/Target/X86/X86ISelLowering.cpp | 29
-rw-r--r-- llvm/lib/Target/X86/X86LowerTileCopy.cpp | 73
-rw-r--r-- llvm/lib/Target/X86/X86RegisterInfo.cpp | 9
-rw-r--r-- llvm/lib/Target/X86/X86Subtarget.h | 3
-rw-r--r-- llvm/lib/TargetParser/RISCVISAInfo.cpp | 3
-rw-r--r-- llvm/lib/Transforms/IPO/GlobalOpt.cpp | 7
-rw-r--r-- llvm/lib/Transforms/IPO/SampleProfile.cpp | 25
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 2
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp | 4
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineInternal.h | 5
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp | 2
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp | 4
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 38
-rw-r--r-- llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 6
-rw-r--r-- llvm/lib/Transforms/Utils/BuildLibCalls.cpp | 2
-rw-r--r-- llvm/lib/Transforms/Utils/GlobalStatus.cpp | 11
-rw-r--r-- llvm/lib/Transforms/Utils/LoopUtils.cpp | 6
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 19
-rw-r--r-- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 331
-rw-r--r-- llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll | 96
-rw-r--r-- llvm/test/Analysis/CostModel/AArch64/splice.ll | 172
-rw-r--r-- llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll | 656
-rw-r--r-- llvm/test/Analysis/CostModel/RISCV/arith-fp.ll | 62
-rw-r--r-- llvm/test/Analysis/CostModel/RISCV/arith-int.ll | 52
-rw-r--r-- llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll | 250
-rw-r--r-- llvm/test/Analysis/CostModel/RISCV/splice.ll | 392
-rw-r--r-- llvm/test/Bitcode/upgrade-vector-interleave2-deinterleave2-intrinsics.ll | 46
-rw-r--r-- llvm/test/Bitcode/upgrade-vector-reverse-intrinsic.ll | 24
-rw-r--r-- llvm/test/Bitcode/upgrade-vector-splice-intrinsic.ll | 24
-rw-r--r-- llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-deinterleave2.ll | 4
-rw-r--r-- llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-interleave2.ll | 4
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll | 52
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll | 52
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll | 40
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll | 30
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll | 40
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll | 30
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll | 30
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll | 30
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll | 30
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll | 40
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll | 40
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll | 30
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll | 30
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll | 30
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll | 36
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll | 30
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll | 26
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll | 20
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll | 16
-rw-r--r-- llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll | 40
-rw-r--r-- llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll | 40
-rw-r--r-- llvm/test/CodeGen/AArch64/machine-combiner-subadd2.mir | 6
-rw-r--r-- llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-neon.ll | 52
-rw-r--r-- llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-sve.ll | 84
-rw-r--r-- llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll | 30
-rw-r--r-- llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll | 164
-rw-r--r-- llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll | 76
-rw-r--r-- llvm/test/CodeGen/AArch64/sve-vector-interleave.ll | 76
-rw-r--r-- llvm/test/CodeGen/AArch64/sve2-intrinsics-while-reversed.ll | 128
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-callable.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-cs.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-es.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-gs.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-hs.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-ls.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-cs.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-default.ll | 44
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-denormal.ll | 44
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-dx10-clamp.ll | 44
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-es.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-gs.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-hs.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll | 58
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ls.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ps.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-psenable.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-vs.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-usersgpr-init.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-vs.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/div_i128.ll | 96
-rw-r--r-- llvm/test/CodeGen/AMDGPU/extra-lds-size.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp32_to_bf16.ll | 481
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp64_to_bf16.ll | 663
-rw-r--r-- llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx10.mir | 34
-rw-r--r-- llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir | 40
-rw-r--r-- llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir | 40
-rw-r--r-- llvm/test/CodeGen/AMDGPU/hard-clauses.mir | 235
-rw-r--r-- llvm/test/CodeGen/AMDGPU/pal-userdata-regs.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/rem_i128.ll | 475
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll | 8
-rw-r--r-- llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll | 28
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll | 4
-rw-r--r-- llvm/test/CodeGen/LoongArch/sextw-removal.ll | 921
-rw-r--r-- llvm/test/CodeGen/PowerPC/fminimum-fmaximum-f128.ll | 97
-rw-r--r-- llvm/test/CodeGen/PowerPC/fminimum-fmaximum.ll | 847
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll | 46
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll | 42
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse-bitrotate.ll | 4
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll | 188
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll | 184
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll | 44
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll | 54
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll | 76
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll | 40
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll | 50
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll | 80
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/vector-splice.ll | 412
-rw-r--r-- llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll | 10
-rw-r--r-- llvm/test/CodeGen/X86/avgceils.ll | 238
-rw-r--r-- llvm/test/CodeGen/X86/avgceilu.ll | 138
-rw-r--r-- llvm/test/CodeGen/X86/avgfloors.ll | 238
-rw-r--r-- llvm/test/CodeGen/X86/avgflooru.ll | 138
-rw-r--r-- llvm/test/CodeGen/X86/avx512bwvl-arith.ll | 35
-rw-r--r-- llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll | 396
-rw-r--r-- llvm/test/CodeGen/X86/freeze-binary.ll | 50
-rw-r--r-- llvm/test/CodeGen/X86/freeze-vector.ll | 20
-rw-r--r-- llvm/test/CodeGen/X86/memcmp-minsize-x32.ll | 4
-rw-r--r-- llvm/test/CodeGen/X86/memcmp-minsize.ll | 6
-rw-r--r-- llvm/test/CodeGen/X86/memcmp-optsize-x32.ll | 4
-rw-r--r-- llvm/test/CodeGen/X86/memcmp-optsize.ll | 6
-rw-r--r-- llvm/test/CodeGen/X86/memcmp-pgso-x32.ll | 4
-rw-r--r-- llvm/test/CodeGen/X86/memcmp-pgso.ll | 6
-rw-r--r-- llvm/test/CodeGen/X86/midpoint-int-vec-512.ll | 80
-rw-r--r-- llvm/test/CodeGen/X86/named-vector-shuffle-reverse.ll | 38
-rw-r--r-- llvm/test/MC/AMDGPU/ds-err.s | 8
-rw-r--r-- llvm/test/MC/AMDGPU/gfx11_asm_err.s | 4
-rw-r--r-- llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s | 2
-rw-r--r-- llvm/test/MC/AMDGPU/pal-msgpack.s | 16
-rw-r--r-- llvm/test/MC/RISCV/large-instructions.s | 29
-rw-r--r-- llvm/test/Other/new-pm-defaults.ll | 6
-rw-r--r-- llvm/test/Other/new-pm-thinlto-prelink-defaults.ll | 2
-rw-r--r-- llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll | 2
-rw-r--r-- llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll | 2
-rw-r--r-- llvm/test/TableGen/GlobalISelEmitter-frameindex.td | 29
-rw-r--r-- llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll | 23
-rw-r--r-- llvm/test/Transforms/CodeGenPrepare/RISCV/convert-to-eqz.ll | 80
-rw-r--r-- llvm/test/Transforms/GlobalOpt/basictest.ll | 20
-rw-r--r-- llvm/test/Transforms/GlobalOpt/constantfold-initializers.ll | 5
-rw-r--r-- llvm/test/Transforms/GlobalOpt/stored-once-forward-value.ll | 6
-rw-r--r-- llvm/test/Transforms/GlobalOpt/tls.ll | 12
-rw-r--r-- llvm/test/Transforms/InstCombine/array.ll | 8
-rw-r--r-- llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll | 325
-rw-r--r-- llvm/test/Transforms/InstCombine/mul.ll | 2
-rw-r--r-- llvm/test/Transforms/InstCombine/vector-reverse.ll | 292
-rw-r--r-- llvm/test/Transforms/InstSimplify/named-vector-shuffle-reverse.ll | 8
-rw-r--r-- llvm/test/Transforms/InstSimplify/select.ll | 6
-rw-r--r-- llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll | 96
-rw-r--r-- llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll | 68
-rw-r--r-- llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll | 2
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll | 66
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll | 6
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll | 61
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll | 12
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll | 8
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll | 48
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll | 16
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll | 24
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll | 10
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll | 2
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll | 4
-rw-r--r-- llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll | 3
-rw-r--r-- llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll | 2
-rw-r--r-- llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll | 8
-rw-r--r-- llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll | 12
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll | 7
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll | 3
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll | 7
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/RISCV/minbw-with-and-and-scalar-trunc.ll | 44
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll | 65
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll | 6
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll | 143
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll | 3
-rw-r--r-- llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll | 8
-rw-r--r-- llvm/test/Transforms/SimpleLoopUnswitch/endless-unswitch.ll | 93
-rw-r--r-- llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll | 130
-rw-r--r-- llvm/test/Verifier/invalid-splice.ll | 14
-rw-r--r-- llvm/test/tools/llvm-driver/passthrough-lld.test | 2
-rw-r--r-- llvm/test/tools/llvm-mca/X86/BtVer2/skip-unsupported-instructions-none-remain.s | 14
-rw-r--r-- llvm/test/tools/llvm-mca/X86/BtVer2/unsupported-instruction.s | 55
-rw-r--r-- llvm/test/tools/llvm-rc/Inputs/dialog-with-menu.rc | 16
-rw-r--r-- llvm/test/tools/llvm-rc/dialog-with-menu.test | 32
-rw-r--r-- llvm/tools/llvm-driver/CMakeLists.txt | 5
-rw-r--r-- llvm/tools/llvm-mca/CodeRegion.h | 15
-rw-r--r-- llvm/tools/llvm-mca/llvm-mca.cpp | 39
-rw-r--r-- llvm/tools/llvm-rc/ResourceFileWriter.cpp | 10
-rw-r--r-- llvm/tools/llvm-rc/ResourceFileWriter.h | 5
-rw-r--r-- llvm/tools/llvm-rc/ResourceScriptParser.cpp | 7
-rw-r--r-- llvm/tools/llvm-rc/ResourceScriptParser.h | 1
-rw-r--r-- llvm/tools/llvm-rc/ResourceScriptStmt.cpp | 4
-rw-r--r-- llvm/tools/llvm-rc/ResourceScriptStmt.h | 13
-rw-r--r-- llvm/tools/llvm-rc/ResourceVisitor.h | 2
-rw-r--r-- llvm/unittests/BinaryFormat/CMakeLists.txt | 1
-rw-r--r-- llvm/unittests/BinaryFormat/ELFTest.cpp | 32
-rw-r--r-- llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp | 114
-rw-r--r-- llvm/unittests/Object/ELFObjectFileTest.cpp | 43
-rw-r--r-- llvm/unittests/Support/YAMLIOTest.cpp | 81
-rw-r--r-- llvm/unittests/TargetParser/RISCVISAInfoTest.cpp | 10
-rw-r--r-- llvm/utils/TableGen/GlobalISelEmitter.cpp | 51
-rw-r--r-- mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td | 4
-rw-r--r-- mlir/include/mlir/Dialect/Linalg/IR/LinalgEnums.td | 6
-rw-r--r-- mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml | 86
-rw-r--r-- mlir/include/mlir/Tools/lsp-server-support/Transport.h | 53
-rw-r--r-- mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp | 2
-rw-r--r-- mlir/lib/Dialect/ArmSME/Transforms/OuterProductFusion.cpp | 3
-rw-r--r-- mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp | 21
-rw-r--r-- mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp | 34
-rw-r--r-- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 5
-rw-r--r-- mlir/lib/Dialect/Polynomial/IR/CMakeLists.txt | 1
-rw-r--r-- mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h | 2
-rw-r--r-- mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp | 65
-rw-r--r-- mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h | 27
-rw-r--r-- mlir/lib/Tools/lsp-server-support/Transport.cpp | 44
-rw-r--r-- mlir/python/mlir/dialects/linalg/opdsl/lang/comprehension.py | 2
-rw-r--r-- mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py | 33
-rw-r--r-- mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir | 4
-rw-r--r-- mlir/test/Dialect/ArmSME/outer-product-fusion.mlir | 32
-rw-r--r-- mlir/test/Dialect/GPU/outlining.mlir | 35
-rw-r--r-- mlir/test/Dialect/LLVMIR/invalid.mlir | 16
-rw-r--r-- mlir/test/Dialect/LLVMIR/roundtrip.mlir | 8
-rw-r--r-- mlir/test/Dialect/Linalg/generalize-named-ops.mlir | 48
-rw-r--r-- mlir/test/Dialect/Linalg/named-ops-fail.mlir | 33
-rw-r--r-- mlir/test/Dialect/Linalg/named-ops.mlir | 65
-rw-r--r-- mlir/unittests/CMakeLists.txt | 1
-rw-r--r-- mlir/unittests/Tools/CMakeLists.txt | 1
-rw-r--r-- mlir/unittests/Tools/lsp-server-support/CMakeLists.txt | 6
-rw-r--r-- mlir/unittests/Tools/lsp-server-support/Transport.cpp | 173
-rw-r--r-- offload/CMakeLists.txt | 20
-rw-r--r-- offload/include/Shared/Targets.def.in | 20
-rw-r--r-- offload/plugins-nextgen/CMakeLists.txt | 9
-rw-r--r-- offload/plugins-nextgen/common/CMakeLists.txt | 1
-rw-r--r-- offload/src/CMakeLists.txt | 19
-rw-r--r-- offload/src/PluginManager.cpp | 27
-rw-r--r-- utils/bazel/llvm-project-overlay/llvm/driver.bzl | 2
-rw-r--r-- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1
494 files changed, 14603 insertions, 6250 deletions
diff --git a/bolt/docs/BAT.md b/bolt/docs/BAT.md
index f23ef1abf876..7ffb5d7c0081 100644
--- a/bolt/docs/BAT.md
+++ b/bolt/docs/BAT.md
@@ -81,9 +81,10 @@ Hot indices are delta encoded, implicitly starting at zero.
| `FuncHash` | 8b | Function hash for input function | Hot |
| `NumBlocks` | ULEB128 | Number of basic blocks in the original function | Hot |
| `NumSecEntryPoints` | ULEB128 | Number of secondary entry points in the original function | Hot |
+| `ColdInputSkew` | ULEB128 | Skew to apply to all input offsets | Cold |
| `NumEntries` | ULEB128 | Number of address translation entries for a function | Both |
-| `EqualElems` | ULEB128 | Number of equal offsets in the beginning of a function | Hot |
-| `BranchEntries` | Bitmask, `alignTo(EqualElems, 8)` bits | If `EqualElems` is non-zero, bitmask denoting entries with `BRANCHENTRY` bit | Hot |
+| `EqualElems` | ULEB128 | Number of equal offsets in the beginning of a function | Both |
+| `BranchEntries` | Bitmask, `alignTo(EqualElems, 8)` bits | If `EqualElems` is non-zero, bitmask denoting entries with `BRANCHENTRY` bit | Both |
Function header is followed by *Address Translation Table* with `NumEntries`
total entries, and *Secondary Entry Points* table with `NumSecEntryPoints`
@@ -99,8 +100,8 @@ entry is encoded. Input offsets implicitly start at zero.
| `BBHash` | Optional, 8b | Basic block hash in input binary | BB |
| `BBIdx` | Optional, Delta, ULEB128 | Basic block index in input binary | BB |
-For hot fragments, the table omits the first `EqualElems` input offsets
-where the input offset equals output offset.
+The table omits the first `EqualElems` input offsets where the input offset
+equals output offset.
`BRANCHENTRY` bit denotes whether a given offset pair is a control flow source
(branch or call instruction). If not set, it signifies a control flow target
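
To make the new `ColdInputSkew` field concrete, here is a minimal sketch (an editorial illustration, not code from the patch) of the equal-offset counting it enables. A cold fragment's input offsets all start at some non-zero offset inside the parent function; subtracting that skew lets the leading entries satisfy `output == input - skew` and be elided just as in hot fragments:

```cpp
// Sketch of the writer-side check, mirroring getNumEqualOffsets in the
// patch below. MapTy values pack (InputOffset << 1) | BRANCHENTRY, as
// documented above.
#include <cstddef>
#include <cstdint>
#include <map>

using MapTy = std::map<uint32_t, uint32_t>; // output offset -> packed input

size_t numEqualOffsets(const MapTy &Map, uint32_t Skew) {
  size_t N = 0;
  for (const auto &[OutputOffset, Packed] : Map) {
    const uint32_t InputOffset = Packed >> 1;
    if (OutputOffset != InputOffset - Skew)
      break; // the equal-offset prefix ends at the first mismatch
    ++N;
  }
  return N;
}
```

For example, a cold fragment mapping output offsets {0x0, 0x10, 0x24} to input offsets {0x400, 0x410, 0x424} has `Skew = 0x400` and `EqualElems = 3`, so only the skew, the count, and the `BRANCHENTRY` bitmask are emitted instead of three delta-encoded pairs.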
diff --git a/bolt/include/bolt/Profile/BoltAddressTranslation.h b/bolt/include/bolt/Profile/BoltAddressTranslation.h
index eef05e8a0e68..68b993ee363c 100644
--- a/bolt/include/bolt/Profile/BoltAddressTranslation.h
+++ b/bolt/include/bolt/Profile/BoltAddressTranslation.h
@@ -149,9 +149,9 @@ private:
/// entries in function address translation map.
APInt calculateBranchEntriesBitMask(MapTy &Map, size_t EqualElems);
- /// Calculate the number of equal offsets (output = input) in the beginning
- /// of the function.
- size_t getNumEqualOffsets(const MapTy &Map) const;
+ /// Calculate the number of equal offsets (output = input - skew) in the
+ /// beginning of the function.
+ size_t getNumEqualOffsets(const MapTy &Map, uint32_t Skew) const;
std::map<uint64_t, MapTy> Maps;
diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h
index d37e6f5ed859..41a92e7ba01e 100644
--- a/bolt/include/bolt/Rewrite/RewriteInstance.h
+++ b/bolt/include/bolt/Rewrite/RewriteInstance.h
@@ -422,6 +422,10 @@ private:
/// Section name used for extra BOLT code in addition to .text.
static StringRef getBOLTTextSectionName() { return ".bolt.text"; }
+ /// Symbol markers for BOLT reserved area.
+ static StringRef getBOLTReservedStart() { return "__bolt_reserved_start"; }
+ static StringRef getBOLTReservedEnd() { return "__bolt_reserved_end"; }
+
/// Common section names.
static StringRef getEHFrameSectionName() { return ".eh_frame"; }
static StringRef getEHFrameHdrSectionName() { return ".eh_frame_hdr"; }
diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp
index 0141ce189acd..7cfb9c132c2c 100644
--- a/bolt/lib/Profile/BoltAddressTranslation.cpp
+++ b/bolt/lib/Profile/BoltAddressTranslation.cpp
@@ -153,12 +153,13 @@ APInt BoltAddressTranslation::calculateBranchEntriesBitMask(MapTy &Map,
return BitMask;
}
-size_t BoltAddressTranslation::getNumEqualOffsets(const MapTy &Map) const {
+size_t BoltAddressTranslation::getNumEqualOffsets(const MapTy &Map,
+ uint32_t Skew) const {
size_t EqualOffsets = 0;
for (const std::pair<const uint32_t, uint32_t> &KeyVal : Map) {
const uint32_t OutputOffset = KeyVal.first;
const uint32_t InputOffset = KeyVal.second >> 1;
- if (OutputOffset == InputOffset)
+ if (OutputOffset == InputOffset - Skew)
++EqualOffsets;
else
break;
@@ -196,12 +197,17 @@ void BoltAddressTranslation::writeMaps(std::map<uint64_t, MapTy> &Maps,
SecondaryEntryPointsMap.count(Address)
? SecondaryEntryPointsMap[Address].size()
: 0;
+ uint32_t Skew = 0;
if (Cold) {
auto HotEntryIt = Maps.find(ColdPartSource[Address]);
assert(HotEntryIt != Maps.end());
size_t HotIndex = std::distance(Maps.begin(), HotEntryIt);
encodeULEB128(HotIndex - PrevIndex, OS);
PrevIndex = HotIndex;
+ // Skew of all input offsets for cold fragments is simply the first input
+ // offset.
+ Skew = Map.begin()->second >> 1;
+ encodeULEB128(Skew, OS);
} else {
// Function hash
size_t BFHash = getBFHash(HotInputAddress);
@@ -217,24 +223,21 @@ void BoltAddressTranslation::writeMaps(std::map<uint64_t, MapTy> &Maps,
<< '\n');
}
encodeULEB128(NumEntries, OS);
- // For hot fragments only: encode the number of equal offsets
- // (output = input) in the beginning of the function. Only encode one offset
- // in these cases.
- const size_t EqualElems = Cold ? 0 : getNumEqualOffsets(Map);
- if (!Cold) {
- encodeULEB128(EqualElems, OS);
- if (EqualElems) {
- const size_t BranchEntriesBytes = alignTo(EqualElems, 8) / 8;
- APInt BranchEntries = calculateBranchEntriesBitMask(Map, EqualElems);
- OS.write(reinterpret_cast<const char *>(BranchEntries.getRawData()),
- BranchEntriesBytes);
- LLVM_DEBUG({
- dbgs() << "BranchEntries: ";
- SmallString<8> BitMaskStr;
- BranchEntries.toString(BitMaskStr, 2, false);
- dbgs() << BitMaskStr << '\n';
- });
- }
+ // Encode the number of equal offsets (output = input - skew) in the
+ // beginning of the function. Only encode one offset in these cases.
+ const size_t EqualElems = getNumEqualOffsets(Map, Skew);
+ encodeULEB128(EqualElems, OS);
+ if (EqualElems) {
+ const size_t BranchEntriesBytes = alignTo(EqualElems, 8) / 8;
+ APInt BranchEntries = calculateBranchEntriesBitMask(Map, EqualElems);
+ OS.write(reinterpret_cast<const char *>(BranchEntries.getRawData()),
+ BranchEntriesBytes);
+ LLVM_DEBUG({
+ dbgs() << "BranchEntries: ";
+ SmallString<8> BitMaskStr;
+ BranchEntries.toString(BitMaskStr, 2, false);
+ dbgs() << BitMaskStr << '\n';
+ });
}
const BBHashMapTy &BBHashMap = getBBHashMap(HotInputAddress);
size_t Index = 0;
@@ -315,10 +318,12 @@ void BoltAddressTranslation::parseMaps(std::vector<uint64_t> &HotFuncs,
uint64_t HotAddress = Cold ? 0 : Address;
PrevAddress = Address;
uint32_t SecondaryEntryPoints = 0;
+ uint64_t ColdInputSkew = 0;
if (Cold) {
HotIndex += DE.getULEB128(&Offset, &Err);
HotAddress = HotFuncs[HotIndex];
ColdPartSource.emplace(Address, HotAddress);
+ ColdInputSkew = DE.getULEB128(&Offset, &Err);
} else {
HotFuncs.push_back(Address);
// Function hash
@@ -339,28 +344,25 @@ void BoltAddressTranslation::parseMaps(std::vector<uint64_t> &HotFuncs,
getULEB128Size(SecondaryEntryPoints)));
}
const uint32_t NumEntries = DE.getULEB128(&Offset, &Err);
- // Equal offsets, hot fragments only.
- size_t EqualElems = 0;
+ // Equal offsets.
+ const size_t EqualElems = DE.getULEB128(&Offset, &Err);
APInt BEBitMask;
- if (!Cold) {
- EqualElems = DE.getULEB128(&Offset, &Err);
- LLVM_DEBUG(dbgs() << formatv("Equal offsets: {0}, {1} bytes\n",
- EqualElems, getULEB128Size(EqualElems)));
- if (EqualElems) {
- const size_t BranchEntriesBytes = alignTo(EqualElems, 8) / 8;
- BEBitMask = APInt(alignTo(EqualElems, 8), 0);
- LoadIntFromMemory(
- BEBitMask,
- reinterpret_cast<const uint8_t *>(
- DE.getBytes(&Offset, BranchEntriesBytes, &Err).data()),
- BranchEntriesBytes);
- LLVM_DEBUG({
- dbgs() << "BEBitMask: ";
- SmallString<8> BitMaskStr;
- BEBitMask.toString(BitMaskStr, 2, false);
- dbgs() << BitMaskStr << ", " << BranchEntriesBytes << " bytes\n";
- });
- }
+ LLVM_DEBUG(dbgs() << formatv("Equal offsets: {0}, {1} bytes\n", EqualElems,
+ getULEB128Size(EqualElems)));
+ if (EqualElems) {
+ const size_t BranchEntriesBytes = alignTo(EqualElems, 8) / 8;
+ BEBitMask = APInt(alignTo(EqualElems, 8), 0);
+ LoadIntFromMemory(
+ BEBitMask,
+ reinterpret_cast<const uint8_t *>(
+ DE.getBytes(&Offset, BranchEntriesBytes, &Err).data()),
+ BranchEntriesBytes);
+ LLVM_DEBUG({
+ dbgs() << "BEBitMask: ";
+ SmallString<8> BitMaskStr;
+ BEBitMask.toString(BitMaskStr, 2, false);
+ dbgs() << BitMaskStr << ", " << BranchEntriesBytes << " bytes\n";
+ });
}
MapTy Map;
@@ -375,7 +377,7 @@ void BoltAddressTranslation::parseMaps(std::vector<uint64_t> &HotFuncs,
PrevAddress = OutputAddress;
int64_t InputDelta = 0;
if (J < EqualElems) {
- InputOffset = (OutputOffset << 1) | BEBitMask[J];
+ InputOffset = ((OutputOffset + ColdInputSkew) << 1) | BEBitMask[J];
} else {
InputDelta = DE.getSLEB128(&Offset, &Err);
InputOffset += InputDelta;
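
On the read side the equal-offset prefix carries no per-entry payload; each of the first `EqualElems` entries is reconstructed from the output offset, the fragment's skew, and the side bitmask. A simplified sketch of the decode step (a hypothetical helper condensed from `parseMaps` above, error handling elided):

```cpp
#include "llvm/ADT/APInt.h"
#include "llvm/Support/DataExtractor.h"

// Returns the packed input offset ((InputOffset << 1) | BRANCHENTRY) for
// entry J; InputOffset and Cursor carry running state across calls.
uint64_t decodeInput(uint32_t OutputOffset, size_t J, size_t EqualElems,
                     const llvm::APInt &BEBitMask, uint64_t ColdInputSkew,
                     int64_t &InputOffset, llvm::DataExtractor &DE,
                     uint64_t &Cursor) {
  if (J < EqualElems) // prefix entry: nothing was stored for it
    return ((OutputOffset + ColdInputSkew) << 1) | BEBitMask[J];
  InputOffset += DE.getSLEB128(&Cursor); // delta-encoded tail entry
  return InputOffset;
}
```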
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index 8eb2e5a9d912..644d87eeca42 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -1063,6 +1063,11 @@ void RewriteInstance::discoverFileObjects() {
continue;
}
+ if (SymName == getBOLTReservedStart() || SymName == getBOLTReservedEnd()) {
+ registerName(SymbolSize);
+ continue;
+ }
+
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: considering symbol " << UniqueName
<< " for function\n");
@@ -1478,11 +1483,9 @@ void RewriteInstance::registerFragments() {
return Sec.sh_type == ELF::SHT_SYMTAB;
});
assert(SymTab);
- if (!SymTab->sh_info) {
- BC->errs() << "BOLT-ERROR: malformed SYMTAB sh_info\n";
- exit(1);
- }
- ELFSymbolRef FirstGlobal = ELF64LEFile->toSymbolRef(SymTab, SymTab->sh_info);
+ // Symtab sh_info contains the value one greater than the symbol table index
+ // of the last local symbol.
+ ELFSymbolRef LocalSymEnd = ELF64LEFile->toSymbolRef(SymTab, SymTab->sh_info);
for (auto &[ParentName, BF] : AmbiguousFragments) {
const uint64_t Address = BF->getAddress();
@@ -1505,7 +1508,7 @@ void RewriteInstance::registerFragments() {
exit(1);
}
- ELFSymbolRef StopSymbol = FirstGlobal;
+ ELFSymbolRef StopSymbol = LocalSymEnd;
if (FSI != FileSymbols.end())
StopSymbol = *FSI;
@@ -1515,10 +1518,7 @@ void RewriteInstance::registerFragments() {
// symbol.
for (ELFSymbolRef NextSymbol = Symbol; NextSymbol < StopSymbol;
NextSymbol.moveNext()) {
- Expected<StringRef> NameOrError = Symbol.getName();
- if (!NameOrError)
- break;
- StringRef Name = *NameOrError;
+ StringRef Name = cantFail(NextSymbol.getName());
if (Name == ParentName) {
ParentAddress = cantFail(NextSymbol.getValue());
goto registerParent;
@@ -3617,6 +3617,26 @@ void RewriteInstance::updateMetadata() {
void RewriteInstance::mapFileSections(BOLTLinker::SectionMapper MapSection) {
BC->deregisterUnusedSections();
+ // Check if the input has a space reserved for BOLT.
+ BinaryData *StartBD = BC->getBinaryDataByName(getBOLTReservedStart());
+ BinaryData *EndBD = BC->getBinaryDataByName(getBOLTReservedEnd());
+ if (!StartBD != !EndBD) {
+ BC->errs() << "BOLT-ERROR: one of the symbols is missing from the binary: "
+ << getBOLTReservedStart() << ", " << getBOLTReservedEnd()
+ << '\n';
+ exit(1);
+ }
+
+ if (StartBD) {
+ PHDRTableOffset = 0;
+ PHDRTableAddress = 0;
+ NewTextSegmentAddress = 0;
+ NewTextSegmentOffset = 0;
+ NextAvailableAddress = StartBD->getAddress();
+ BC->outs()
+ << "BOLT-INFO: using reserved space for allocating new sections\n";
+ }
+
// If no new .eh_frame was written, remove relocated original .eh_frame.
BinarySection *RelocatedEHFrameSection =
getSection(".relocated" + getEHFrameSectionName());
@@ -3636,6 +3656,18 @@ void RewriteInstance::mapFileSections(BOLTLinker::SectionMapper MapSection) {
// Map the rest of the sections.
mapAllocatableSections(MapSection);
+
+ if (StartBD) {
+ const uint64_t ReservedSpace = EndBD->getAddress() - StartBD->getAddress();
+ const uint64_t AllocatedSize = NextAvailableAddress - StartBD->getAddress();
+ if (ReservedSpace < AllocatedSize) {
+ BC->errs() << "BOLT-ERROR: reserved space (" << ReservedSpace << " byte"
+ << (ReservedSpace == 1 ? "" : "s")
+ << ") is smaller than required for new allocations ("
+ << AllocatedSize << " bytes)\n";
+ exit(1);
+ }
+ }
}
std::vector<BinarySection *> RewriteInstance::getCodeSections() {
@@ -3877,7 +3909,7 @@ void RewriteInstance::mapCodeSections(BOLTLinker::SectionMapper MapSection) {
// Add the new text section aggregating all existing code sections.
// This is pseudo-section that serves a purpose of creating a corresponding
// entry in section header table.
- int64_t NewTextSectionSize =
+ const uint64_t NewTextSectionSize =
NextAvailableAddress - NewTextSectionStartAddress;
if (NewTextSectionSize) {
const unsigned Flags = BinarySection::getFlags(/*IsReadOnly=*/true,
@@ -3960,7 +3992,7 @@ void RewriteInstance::mapAllocatableSections(
if (PHDRTableAddress) {
// Segment size includes the size of the PHDR area.
NewTextSegmentSize = NextAvailableAddress - PHDRTableAddress;
- } else {
+ } else if (NewTextSegmentAddress) {
// Existing PHDR table would be updated.
NewTextSegmentSize = NextAvailableAddress - NewTextSegmentAddress;
}
@@ -3999,7 +4031,7 @@ void RewriteInstance::patchELFPHDRTable() {
assert(!PHDRTableAddress && "unexpected address for program header table");
PHDRTableOffset = Obj.getHeader().e_phoff;
if (NewWritableSegmentSize) {
- BC->errs() << "Unable to add writable segment with UseGnuStack option\n";
+ BC->errs() << "BOLT-ERROR: unable to add writable segment\n";
exit(1);
}
}
@@ -4009,7 +4041,7 @@ void RewriteInstance::patchELFPHDRTable() {
if (!NewWritableSegmentSize) {
if (PHDRTableAddress)
NewTextSegmentSize = NextAvailableAddress - PHDRTableAddress;
- else
+ else if (NewTextSegmentAddress)
NewTextSegmentSize = NextAvailableAddress - NewTextSegmentAddress;
} else {
NewWritableSegmentSize = NextAvailableAddress - NewWritableSegmentAddress;
@@ -4043,8 +4075,10 @@ void RewriteInstance::patchELFPHDRTable() {
};
auto writeNewSegmentPhdrs = [&]() {
- ELF64LE::Phdr NewTextPhdr = createNewTextPhdr();
- OS.write(reinterpret_cast<const char *>(&NewTextPhdr), sizeof(NewTextPhdr));
+ if (PHDRTableAddress || NewTextSegmentSize) {
+ ELF64LE::Phdr NewPhdr = createNewTextPhdr();
+ OS.write(reinterpret_cast<const char *>(&NewPhdr), sizeof(NewPhdr));
+ }
if (NewWritableSegmentSize) {
ELF64LEPhdrTy NewPhdr;
@@ -4142,9 +4176,8 @@ void RewriteInstance::rewriteNoteSections() {
const ELFFile<ELF64LE> &Obj = ELF64LEFile->getELFFile();
raw_fd_ostream &OS = Out->os();
- uint64_t NextAvailableOffset = getFileOffsetForAddress(NextAvailableAddress);
- assert(NextAvailableOffset >= FirstNonAllocatableOffset &&
- "next available offset calculation failure");
+ uint64_t NextAvailableOffset = std::max(
+ getFileOffsetForAddress(NextAvailableAddress), FirstNonAllocatableOffset);
OS.seek(NextAvailableOffset);
// Copy over non-allocatable section contents and update file offsets.
@@ -4883,7 +4916,7 @@ void RewriteInstance::updateELFSymbolTable(
++NumHotDataSymsUpdated;
}
- if (*SymbolName == "_end")
+ if (*SymbolName == "_end" && NextAvailableAddress > Symbol.st_value)
updateSymbolValue(*SymbolName, NextAvailableAddress);
if (IsDynSym)
@@ -4997,13 +5030,6 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile<ELFT> *File) {
std::vector<uint32_t> NewSectionIndex;
getOutputSections(File, NewSectionIndex);
- // Set pointer at the end of the output file, so we can pwrite old symbol
- // tables if we need to.
- uint64_t NextAvailableOffset = getFileOffsetForAddress(NextAvailableAddress);
- assert(NextAvailableOffset >= FirstNonAllocatableOffset &&
- "next available offset calculation failure");
- Out->os().seek(NextAvailableOffset);
-
// Update dynamic symbol table.
const ELFShdrTy *DynSymSection = nullptr;
for (const ELFShdrTy &Section : cantFail(Obj.sections())) {
@@ -5015,6 +5041,10 @@ void RewriteInstance::patchELFSymTabs(ELFObjectFile<ELFT> *File) {
assert((DynSymSection || BC->IsStaticExecutable) &&
"dynamic symbol table expected");
if (DynSymSection) {
+ // Set pointer to the end of the section, so we can use pwrite to update
+ // the dynamic symbol table.
+ Out->os().seek(DynSymSection->sh_offset + DynSymSection->sh_size);
+
updateELFSymbolTable(
File,
/*IsDynSym=*/true,
@@ -5568,10 +5598,10 @@ void RewriteInstance::rewriteFile() {
auto Streamer = BC->createStreamer(OS);
// Make sure output stream has enough reserved space, otherwise
// pwrite() will fail.
- uint64_t Offset = OS.seek(getFileOffsetForAddress(NextAvailableAddress));
- (void)Offset;
- assert(Offset == getFileOffsetForAddress(NextAvailableAddress) &&
- "error resizing output file");
+ uint64_t Offset = std::max(getFileOffsetForAddress(NextAvailableAddress),
+ FirstNonAllocatableOffset);
+ Offset = OS.seek(Offset);
+ assert((Offset != (uint64_t)-1) && "Error resizing output file");
// Overwrite functions with fixed output address. This is mostly used by
// non-relocation mode, with one exception: injected functions are covered
@@ -5803,7 +5833,7 @@ void RewriteInstance::writeEHFrameHeader() {
std::vector<char> NewEHFrameHdr = CFIRdWrt->generateEHFrameHeader(
RelocatedEHFrame, NewEHFrame, EHFrameHdrOutputAddress, FailedAddresses);
- assert(Out->os().tell() == EHFrameHdrFileOffset && "offset mismatch");
+ Out->os().seek(EHFrameHdrFileOffset);
Out->os().write(NewEHFrameHdr.data(), NewEHFrameHdr.size());
const unsigned Flags = BinarySection::getFlags(/*IsReadOnly=*/true,
@@ -5823,6 +5853,15 @@ void RewriteInstance::writeEHFrameHeader() {
NextAvailableAddress += EHFrameHdrSec.getOutputSize();
+ if (const BinaryData *ReservedEnd =
+ BC->getBinaryDataByName(getBOLTReservedEnd())) {
+ if (NextAvailableAddress > ReservedEnd->getAddress()) {
+ BC->errs() << "BOLT-ERROR: unable to fit " << getEHFrameHdrSectionName()
+ << " into reserved space\n";
+ exit(1);
+ }
+ }
+
// Merge new .eh_frame with the relocated original so that gdb can locate all
// FDEs.
if (RelocatedEHFrameSection) {
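
The reserved-area handling added above boils down to two checks: the boundary symbols must be present or absent together (the `!StartBD != !EndBD` presence XOR), and everything BOLT allocates must fit between them. A standalone sketch of that logic, with illustrative names rather than BOLT's actual API:

```cpp
#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <optional>

// Hypothetical helper restating the checks from mapFileSections above.
void checkReserved(std::optional<uint64_t> Start, std::optional<uint64_t> End,
                   uint64_t NextAvailableAddress) {
  // Presence XOR: exactly one of __bolt_reserved_{start,end} is an error.
  if (Start.has_value() != End.has_value()) {
    fprintf(stderr, "error: only one __bolt_reserved_* symbol is present\n");
    exit(1);
  }
  if (!Start)
    return; // no reserved area: keep the regular allocation strategy
  const uint64_t Reserved = *End - *Start;
  const uint64_t Allocated = NextAvailableAddress - *Start;
  if (Reserved < Allocated) { // new sections must fit in the reserved space
    fprintf(stderr, "error: reserved %" PRIu64 " bytes, need %" PRIu64 "\n",
            Reserved, Allocated);
    exit(1);
  }
}
```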
diff --git a/bolt/test/X86/bolt-address-translation.test b/bolt/test/X86/bolt-address-translation.test
index 63234b4c1d21..e6b21c14077b 100644
--- a/bolt/test/X86/bolt-address-translation.test
+++ b/bolt/test/X86/bolt-address-translation.test
@@ -37,7 +37,7 @@
# CHECK: BOLT: 3 out of 7 functions were overwritten.
# CHECK: BOLT-INFO: Wrote 6 BAT maps
# CHECK: BOLT-INFO: Wrote 3 function and 58 basic block hashes
-# CHECK: BOLT-INFO: BAT section size (bytes): 924
+# CHECK: BOLT-INFO: BAT section size (bytes): 928
#
# usqrt mappings (hot part). We match against any key (left side containing
# the bolted binary offsets) because BOLT may change where it puts instructions
diff --git a/bolt/test/X86/cdsplit-symbol-names.s b/bolt/test/X86/cdsplit-symbol-names.s
index 1d3fa91936af..e53863e22246 100644
--- a/bolt/test/X86/cdsplit-symbol-names.s
+++ b/bolt/test/X86/cdsplit-symbol-names.s
@@ -7,7 +7,7 @@
# RUN: llvm-strip --strip-unneeded %t.o
# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=cdsplit \
-# RUN: --call-scale=2 --data=%t.fdata --reorder-blocks=ext-tsp --enable-bat
+# RUN: --call-scale=2 --data=%t.fdata --reorder-blocks=ext-tsp
# RUN: llvm-objdump --syms %t.bolt | FileCheck %s --check-prefix=CHECK-SYMS-WARM
# CHECK-SYMS-WARM: 0000000000000000 l df *ABS* 0000000000000000 bolt-pseudo.o
@@ -16,19 +16,8 @@
# CHECK-SYMS-WARM: .text.cold
# CHECK-SYMS-WARM-SAME: dummy.cold
-# RUN: link_fdata %s %t.bolt %t.preagg PREAGG
-# PREAGG: B X:0 #chain.warm# 1 0
-# RUN: perf2bolt %t.bolt -p %t.preagg --pa -o %t.bat.fdata -w %t.bat.yaml -v=1 \
-# RUN: | FileCheck %s --check-prefix=CHECK-REGISTER
-
-# CHECK-REGISTER: BOLT-INFO: marking chain.warm/1(*2) as a fragment of chain/2(*2)
-
.text
- .type chain, @function
-chain:
- ret
- .size chain, .-chain
-
+ .globl chain
.type chain, @function
chain:
pushq %rbp
diff --git a/bolt/test/X86/register-fragments-bolt-symbols.s b/bolt/test/X86/register-fragments-bolt-symbols.s
new file mode 100644
index 000000000000..fa9b70e0b2d8
--- /dev/null
+++ b/bolt/test/X86/register-fragments-bolt-symbols.s
@@ -0,0 +1,32 @@
+# Test the heuristics for matching BOLT-added split functions.
+
+# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %S/cdsplit-symbol-names.s -o %t.main.o
+# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.chain.o
+# RUN: link_fdata %S/cdsplit-symbol-names.s %t.main.o %t.fdata
+# RUN: sed -i 's|chain|chain/2|g' %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.main.o
+# RUN: llvm-objcopy --localize-symbol=chain %t.main.o
+# RUN: %clang %cflags %t.chain.o %t.main.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=randomN \
+# RUN: --reorder-blocks=ext-tsp --enable-bat --bolt-seed=7 --data=%t.fdata
+# RUN: llvm-objdump --syms %t.bolt | FileCheck %s --check-prefix=CHECK-SYMS
+
+# RUN: link_fdata %s %t.bolt %t.preagg PREAGG
+# PREAGG: B X:0 #chain.cold.0# 1 0
+# RUN: perf2bolt %t.bolt -p %t.preagg --pa -o %t.bat.fdata -w %t.bat.yaml -v=1 \
+# RUN: | FileCheck %s --check-prefix=CHECK-REGISTER
+
+# CHECK-SYMS: l df *ABS* [[#]] chain.s
+# CHECK-SYMS: l F .bolt.org.text [[#]] chain
+# CHECK-SYMS: l F .text.cold [[#]] chain.cold.0
+# CHECK-SYMS: l F .text [[#]] chain
+# CHECK-SYMS: l df *ABS* [[#]] bolt-pseudo.o
+
+# CHECK-REGISTER: BOLT-INFO: marking chain.cold.0/1(*2) as a fragment of chain/2(*2)
+
+.file "chain.s"
+ .text
+ .type chain, @function
+chain:
+ ret
+ .size chain, .-chain
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 127d1b6dd482..c2e90f4e7d58 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -711,6 +711,8 @@ even-odd element pair with indices ``i * 2`` and ``i * 2 + 1`` with
power of 2, the vector is widened with neutral elements for the reduction
at the end to the next power of 2.
+These reductions support both fixed-sized and scalable vector types.
+
Example:
.. code-block:: c++
@@ -2929,7 +2931,7 @@ Query for this feature with ``__has_builtin(__builtin_dump_struct)``
``__builtin_shufflevector`` is used to express generic vector
permutation/shuffle/swizzle operations. This builtin is also very important
for the implementation of various target-specific header files like
-``<xmmintrin.h>``.
+``<xmmintrin.h>``. This builtin can be used within constant expressions.
**Syntax**:
@@ -2956,7 +2958,7 @@ for the implementation of various target-specific header files like
// Concatenate every other element of 8-element vectors V1 and V2.
__builtin_shufflevector(V1, V2, 0, 2, 4, 6, 8, 10, 12, 14)
- // Shuffle v1 with some elements being undefined
+ // Shuffle v1 with some elements being undefined. Not allowed in constexpr.
__builtin_shufflevector(v1, v1, 3, -1, 1, -1)
**Description**:
@@ -2969,6 +2971,7 @@ starting with the first vector, continuing into the second vector. Thus, if
``vec1`` is a 4-element vector, index 5 would refer to the second element of
``vec2``. An index of -1 can be used to indicate that the corresponding element
in the returned vector is a don't care and can be optimized by the backend.
+Values of -1 are not supported in constant expressions.
The result of ``__builtin_shufflevector`` is a vector with the same element
type as ``vec1``/``vec2`` but that has an element count equal to the number of
@@ -2983,7 +2986,8 @@ Query for this feature with ``__has_builtin(__builtin_shufflevector)``.
``__builtin_convertvector`` is used to express generic vector
type-conversion operations. The input vector and the output vector
-type must have the same number of elements.
+type must have the same number of elements. This builtin can be used within
+constant expressions.
**Syntax**:
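
Taken together, the constant-expression changes in this file can be illustrated with a short sketch (assuming Clang with its vector extensions; this example is editorial, not taken from the docs):

```cpp
typedef int   v4si __attribute__((vector_size(16)));
typedef float v4sf __attribute__((vector_size(16)));

constexpr v4si a = {0, 1, 2, 3};
constexpr v4si b = {4, 5, 6, 7};

// Now valid in a constant expression; all indices must be non-negative,
// since the -1 "don't care" index is rejected in constexpr.
constexpr v4si evens = __builtin_shufflevector(a, b, 0, 2, 4, 6);

// Element-wise int -> float conversion, likewise constexpr-capable.
constexpr v4sf f = __builtin_convertvector(a, v4sf);

static_assert(evens[2] == 4, "even elements of a, then of b");
```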
diff --git a/clang/docs/LibTooling.rst b/clang/docs/LibTooling.rst
index df50dcebf9b8..87d84321ab28 100644
--- a/clang/docs/LibTooling.rst
+++ b/clang/docs/LibTooling.rst
@@ -63,15 +63,22 @@ and automatic location of the compilation database using source files paths.
#include "llvm/Support/CommandLine.h"
using namespace clang::tooling;
+ using namespace llvm;
// Apply a custom category to all command-line options so that they are the
// only ones displayed.
- static llvm::cl::OptionCategory MyToolCategory("my-tool options");
+ static cl::OptionCategory MyToolCategory("my-tool options");
int main(int argc, const char **argv) {
- // CommonOptionsParser constructor will parse arguments and create a
- // CompilationDatabase. In case of error it will terminate the program.
- CommonOptionsParser OptionsParser(argc, argv, MyToolCategory);
+ // CommonOptionsParser::create will parse arguments and create a
+ // CompilationDatabase.
+ auto ExpectedParser = CommonOptionsParser::create(argc, argv, MyToolCategory);
+ if (!ExpectedParser) {
+ // Fail gracefully for unsupported options.
+ llvm::errs() << ExpectedParser.takeError();
+ return 1;
+ }
+ CommonOptionsParser& OptionsParser = ExpectedParser.get();
// Use OptionsParser.getCompilations() and OptionsParser.getSourcePathList()
// to retrieve CompilationDatabase and the list of input file paths.
@@ -133,7 +140,12 @@ version of this example tool is also checked into the clang tree at
static cl::extrahelp MoreHelp("\nMore help text...\n");
int main(int argc, const char **argv) {
- CommonOptionsParser OptionsParser(argc, argv, MyToolCategory);
+ auto ExpectedParser = CommonOptionsParser::create(argc, argv, MyToolCategory);
+ if (!ExpectedParser) {
+ llvm::errs() << ExpectedParser.takeError();
+ return 1;
+ }
+ CommonOptionsParser& OptionsParser = ExpectedParser.get();
ClangTool Tool(OptionsParser.getCompilations(),
OptionsParser.getSourcePathList());
return Tool.run(newFrontendActionFactory<clang::SyntaxOnlyAction>().get());
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 64a523a6f25f..676f38a8e94c 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -63,6 +63,12 @@ ABI Changes in This Version
MSVC uses a different mangling for these objects, compatibility is not affected.
(#GH85423).
+- Fixed Microsoft calling convention for returning certain classes with a
+ templated constructor. If a class has a templated constructor, it should
+ be returned indirectly even if it meets all the other requirements for
+ returning a class in a register. This affects some uses of std::pair.
+ (#GH86384).
+
AST Dumping Potentially Breaking Changes
----------------------------------------
@@ -145,6 +151,7 @@ C++2c Feature Support
- Implemented `P0609R3: Attributes for Structured Bindings <https://wg21.link/P0609R3>`_
+- Implemented `P2748R5 Disallow Binding a Returned Glvalue to a Temporary <https://wg21.link/P2748R5>`_.
Resolutions to C++ Defect Reports
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -221,6 +228,7 @@ Non-comprehensive list of changes in this release
- ``__typeof_unqual__`` is available in all C modes as an extension, which behaves
like ``typeof_unqual`` from C23, similar to ``__typeof__`` and ``typeof``.
+- ``__builtin_reduce_{add|mul|xor|or|and|min|max}`` builtins now support scalable vectors.
* Shared libraries linked with either the ``-ffast-math``, ``-Ofast``, or
``-funsafe-math-optimizations`` flags will no longer enable flush-to-zero
@@ -231,6 +239,9 @@ Non-comprehensive list of changes in this release
* ``-fdenormal-fp-math=preserve-sign`` is no longer implied by ``-ffast-math``
on x86 systems.
+- Builtins ``__builtin_shufflevector()`` and ``__builtin_convertvector()`` may
+ now be used within constant expressions.
+
New Compiler Flags
------------------
- ``-fsanitize=implicit-bitfield-conversion`` checks implicit truncation and
@@ -460,6 +471,10 @@ Bug Fixes in This Version
- Fixed an assertion failure on invalid InitListExpr in C89 mode (#GH88008).
+- Fixed missing destructor calls when we branch from the middle of an expression.
+  This could happen through a branch in a stmt-expr or in an expression containing
+  a coroutine suspension. Fixes (#GH63818) (#GH88478).
+
- Clang will no longer diagnose an erroneous non-dependent ``switch`` condition
during instantiation, and instead will only diagnose it once, during checking
of the function template.
@@ -590,6 +605,8 @@ Bug Fixes to C++ Support
- Fixed a use-after-free bug in parsing of type constraints with default arguments that involve lambdas. (#GH67235)
- Fixed bug in which the body of a consteval lambda within a template was not parsed as within an
immediate function context.
+- Fixed CTAD for ``std::initializer_list``. This allows ``std::initializer_list{1, 2, 3}`` to be deduced as
+  ``std::initializer_list<int>`` as intended.
Bug Fixes to AST Handling
^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -701,6 +718,11 @@ AIX Support
WebAssembly Support
^^^^^^^^^^^^^^^^^^^
+The -mcpu=generic configuration now enables multivalue and reference-types. These
+proposals are standardized and available in all major engines. Enabling
+multivalue here only enables the language feature but does not turn on the
+multivalue ABI (this enables non-ABI uses of multivalue, like exnref).
+
AVR Support
^^^^^^^^^^^
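As a rough illustration of the constant-expression note above for ``__builtin_shufflevector()`` and ``__builtin_convertvector()``, a sketch along these lines should now be accepted (GCC-style vector types; results shown in comments):

.. code-block:: c++

   typedef int   v4si __attribute__((vector_size(16)));
   typedef float v4sf __attribute__((vector_size(16)));

   constexpr v4si a = {1, 2, 3, 4};
   constexpr v4si b = {5, 6, 7, 8};

   // Mask indices 0-3 select from 'a', 4-7 from 'b'.
   constexpr v4si s = __builtin_shufflevector(a, b, 0, 1, 4, 5); // {1, 2, 5, 6}

   // Element-wise int -> float conversion, now usable at compile time.
   constexpr v4sf f = __builtin_convertvector(a, v4sf); // {1.0f, 2.0f, 3.0f, 4.0f}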
diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst
index d0326f01d251..6ba0a531fedf 100644
--- a/clang/docs/UsersManual.rst
+++ b/clang/docs/UsersManual.rst
@@ -1452,8 +1452,6 @@ floating point semantic models: precise (the default), strict, and fast.
"fenv_access", "off", "on", "off"
"rounding_mode", "tonearest", "dynamic", "tonearest"
"contract", "on", "off", "fast"
- "denormal_fp_math", "IEEE", "IEEE", "IEEE"
- "denormal_fp32_math", "IEEE","IEEE", "IEEE"
"support_math_errno", "on", "on", "off"
"no_honor_nans", "off", "off", "on"
"no_honor_infinities", "off", "off", "on"
@@ -1462,6 +1460,14 @@ floating point semantic models: precise (the default), strict, and fast.
"allow_approximate_fns", "off", "off", "on"
"allow_reassociation", "off", "off", "on"
+The ``-ffp-model`` option does not modify the ``-fdenormal-fp-math``
+setting, but it does affect whether ``crtfastmath.o`` is linked. Because
+linking ``crtfastmath.o`` has a global effect on the program, and because
+the global denormal handling can be changed in other ways, the state of
+``-fdenormal-fp-math`` handling cannot be assumed in any function based
+on the ``-ffp-model`` setting. See :ref:`crtfastmath.o` for more details.
+
.. option:: -ffast-math
Enable fast-math mode. This option lets the
@@ -1537,7 +1543,6 @@ floating point semantic models: precise (the default), strict, and fast.
Also, this option resets following options to their target-dependent defaults.
* ``-f[no-]math-errno``
- * ``-fdenormal-fp-math=<value>``
There is ambiguity about how ``-ffp-contract``, ``-ffast-math``,
and ``-fno-fast-math`` behave when combined. To keep the value of
@@ -1560,8 +1565,7 @@ floating point semantic models: precise (the default), strict, and fast.
``-ffp-contract`` setting is determined by the default value of
``-ffp-contract``.
- Note: ``-fno-fast-math`` implies ``-fdenormal-fp-math=ieee``.
- ``-fno-fast-math`` causes ``crtfastmath.o`` to not be linked with code
+ Note: ``-fno-fast-math`` causes ``crtfastmath.o`` to not be linked with code
unless ``-mdaz-ftz`` is present.
.. option:: -fdenormal-fp-math=<value>
@@ -1694,7 +1698,6 @@ floating point semantic models: precise (the default), strict, and fast.
* ``-fsigned-zeros``
* ``-ftrapping-math``
* ``-ffp-contract=on``
- * ``-fdenormal-fp-math=ieee``
There is ambiguity about how ``-ffp-contract``,
``-funsafe-math-optimizations``, and ``-fno-unsafe-math-optimizations``
@@ -2319,6 +2322,8 @@ are listed below.
on ELF targets when using the integrated assembler. This flag currently
only has an effect on ELF targets.
+.. _funique_internal_linkage_names:
+
.. option:: -f[no]-unique-internal-linkage-names
Controls whether Clang emits a unique (best-effort) symbol name for internal
@@ -2448,27 +2453,41 @@ usual build cycle when using sample profilers for optimization:
usual build flags that you always build your application with. The only
requirement is that DWARF debug info including source line information is
generated. This DWARF information is important for the profiler to be able
- to map instructions back to source line locations.
+ to map instructions back to source line locations. The usefulness of this
+ DWARF information can be improved with the ``-fdebug-info-for-profiling``
+ and ``-funique-internal-linkage-names`` options.
- On Linux, ``-g`` or just ``-gline-tables-only`` is sufficient:
+ On Linux:
.. code-block:: console
- $ clang++ -O2 -gline-tables-only code.cc -o code
+ $ clang++ -O2 -gline-tables-only \
+ -fdebug-info-for-profiling -funique-internal-linkage-names \
+ code.cc -o code
While MSVC-style targets default to CodeView debug information, DWARF debug
information is required to generate source-level LLVM profiles. Use
``-gdwarf`` to include DWARF debug information:
- .. code-block:: console
+ .. code-block:: winbatch
+
+ > clang-cl /O2 -gdwarf -gline-tables-only ^
+ /clang:-fdebug-info-for-profiling /clang:-funique-internal-linkage-names ^
+ code.cc /Fe:code /fuse-ld=lld /link /debug:dwarf
- $ clang-cl -O2 -gdwarf -gline-tables-only coff-profile.cpp -fuse-ld=lld -link -debug:dwarf
+.. note::
+
+ :ref:`-funique-internal-linkage-names <funique_internal_linkage_names>`
+ generates unique names based on given command-line source file paths. If
+ your build system uses absolute source paths and these paths may change
+ between steps 1 and 4, then the uniqued function names may change and result
+ in unused profile data. Consider omitting this option in such cases.
2. Run the executable under a sampling profiler. The specific profiler
you use does not really matter, as long as its output can be converted
into the format that the LLVM optimizer understands.
- Two such profilers are the the Linux Perf profiler
+ Two such profilers are the Linux Perf profiler
(https://perf.wiki.kernel.org/) and Intel's Sampling Enabling Product (SEP),
available as part of `Intel VTune
<https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/vtune-profiler.html>`_.
@@ -2482,7 +2501,9 @@ usual build cycle when using sample profilers for optimization:
.. code-block:: console
- $ perf record -b ./code
+ $ perf record -b -e BR_INST_RETIRED.NEAR_TAKEN:uppp ./code
+
+ If the event above is unavailable, ``branches:u`` is probably next-best.
Note the use of the ``-b`` flag. This tells Perf to use the Last Branch
Record (LBR) to record call chains. While this is not strictly required,
@@ -2532,21 +2553,42 @@ usual build cycle when using sample profilers for optimization:
that executes faster than the original one. Note that you are not
required to build the code with the exact same arguments that you
used in the first step. The only requirement is that you build the code
- with ``-gline-tables-only`` and ``-fprofile-sample-use``.
+ with the same debug info options and ``-fprofile-sample-use``.
+
+ On Linux:
.. code-block:: console
- $ clang++ -O2 -gline-tables-only -fprofile-sample-use=code.prof code.cc -o code
+ $ clang++ -O2 -gline-tables-only \
+ -fdebug-info-for-profiling -funique-internal-linkage-names \
+ -fprofile-sample-use=code.prof code.cc -o code
- [OPTIONAL] Sampling-based profiles can have inaccuracies or missing block/
- edge counters. The profile inference algorithm (profi) can be used to infer
- missing blocks and edge counts, and improve the quality of profile data.
- Enable it with ``-fsample-profile-use-profi``.
+ On Windows:
- .. code-block:: console
+ .. code-block:: winbatch
+
+ > clang-cl /O2 -gdwarf -gline-tables-only ^
+ /clang:-fdebug-info-for-profiling /clang:-funique-internal-linkage-names ^
+ /fprofile-sample-use=code.prof code.cc /Fe:code /fuse-ld=lld /link /debug:dwarf
+
+ [OPTIONAL] Sampling-based profiles can have inaccuracies or missing block/
+ edge counters. The profile inference algorithm (profi) can be used to infer
+ missing blocks and edge counts, and improve the quality of profile data.
+ Enable it with ``-fsample-profile-use-profi``. For example, on Linux:
+
+ .. code-block:: console
+
+ $ clang++ -fsample-profile-use-profi -O2 -gline-tables-only \
+ -fdebug-info-for-profiling -funique-internal-linkage-names \
+ -fprofile-sample-use=code.prof code.cc -o code
+
+ On Windows:
+
+ .. code-block:: winbatch
- $ clang++ -O2 -gline-tables-only -fprofile-sample-use=code.prof \
- -fsample-profile-use-profi code.cc -o code
+ > clang-cl /clang:-fsample-profile-use-profi /O2 -gdwarf -gline-tables-only ^
+ /clang:-fdebug-info-for-profiling /clang:-funique-internal-linkage-names ^
+ /fprofile-sample-use=code.prof code.cc /Fe:code /fuse-ld=lld /link /debug:dwarf
Sample Profile Formats
""""""""""""""""""""""
diff --git a/clang/include/clang/AST/ASTNodeTraverser.h b/clang/include/clang/AST/ASTNodeTraverser.h
index 216dc9eef08b..bf7c204e4ad7 100644
--- a/clang/include/clang/AST/ASTNodeTraverser.h
+++ b/clang/include/clang/AST/ASTNodeTraverser.h
@@ -844,6 +844,12 @@ public:
}
}
+ void VisitUnresolvedLookupExpr(const UnresolvedLookupExpr *E) {
+ if (E->hasExplicitTemplateArgs())
+ for (auto Arg : E->template_arguments())
+ Visit(Arg.getArgument());
+ }
+
void VisitRequiresExpr(const RequiresExpr *E) {
for (auto *D : E->getLocalParameters())
Visit(D);
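For reference, a hypothetical input that reaches the new visitor: inside a template, a call through an overload set with explicit template arguments forms an ``UnresolvedLookupExpr``, and its arguments are now visited when dumping the AST.

.. code-block:: c++

   template <typename T> void callee(T);
   template <typename T> void caller(T t) {
     // 'callee<T>' is an UnresolvedLookupExpr with an explicit template
     // argument list; the traverser now visits the 'T' argument.
     callee<T>(t);
   }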
diff --git a/clang/include/clang/AST/NestedNameSpecifier.h b/clang/include/clang/AST/NestedNameSpecifier.h
index 7b0c21b9e7cf..a1d9e30e660d 100644
--- a/clang/include/clang/AST/NestedNameSpecifier.h
+++ b/clang/include/clang/AST/NestedNameSpecifier.h
@@ -266,7 +266,7 @@ public:
explicit operator bool() const { return Qualifier; }
/// Evaluates true when this nested-name-specifier location is
- /// empty.
+ /// non-empty.
bool hasQualifier() const { return Qualifier; }
/// Retrieve the nested-name-specifier to which this instance
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index dff02d4861b3..e6643469e0b3 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -480,7 +480,7 @@ public:
}
void removeCVRQualifiers(unsigned mask) {
assert(!(mask & ~CVRMask) && "bitmask contains non-CVR bits");
- Mask &= ~mask;
+ Mask &= ~static_cast<uint64_t>(mask);
}
void removeCVRQualifiers() {
removeCVRQualifiers(CVRMask);
@@ -609,7 +609,7 @@ public:
}
void removeFastQualifiers(unsigned mask) {
assert(!(mask & ~FastMask) && "bitmask contains non-fast qualifier bits");
- Mask &= ~mask;
+ Mask &= ~static_cast<uint64_t>(mask);
}
void removeFastQualifiers() {
removeFastQualifiers(FastMask);
@@ -2378,6 +2378,10 @@ public:
/// 'riscv_rvv_vector_bits' type attribute as VectorType.
QualType getRVVEltType(const ASTContext &Ctx) const;
+ /// Returns the representative type for the element of a sizeless vector
+ /// builtin type.
+ QualType getSizelessVectorEltType(const ASTContext &Ctx) const;
+
/// Types are partitioned into 3 broad categories (C99 6.2.5p1):
/// object types, function types, and incomplete types.
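The two ``static_cast<uint64_t>`` fixes above close an integer-promotion pitfall: ``~mask`` is complemented at 32 bits and then zero-extended, so the old code also cleared the upper half of the 64-bit ``Mask``. A standalone sketch of the difference (hypothetical values, not Clang's actual qualifier layout):

.. code-block:: c++

   #include <cassert>
   #include <cstdint>

   int main() {
     uint64_t Mask = 0xFFFF0000000000FFULL; // high bits hold other state
     unsigned mask = 0x0F;                  // low bits to remove

     // Buggy: ~mask == 0xFFFFFFF0u zero-extends to 0x00000000FFFFFFF0,
     // wiping bits 32-63 of Mask along with the requested low bits.
     uint64_t buggy = Mask & ~mask;
     assert(buggy == 0x00000000000000F0ULL);

     // Fixed: widen first, so only the requested low bits are cleared.
     uint64_t fixed = Mask & ~static_cast<uint64_t>(mask);
     assert(fixed == 0xFFFF0000000000F0ULL);
   }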
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index fdca82934cb4..f72d5c252b86 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -9901,6 +9901,9 @@ def warn_format_invalid_annotation : Warning<
def warn_format_P_no_precision : Warning<
"using '%%P' format specifier without precision">,
InGroup<Format>;
+def warn_format_P_with_objc_pointer : Warning<
+ "using '%%P' format specifier with an Objective-C pointer results in dumping runtime object structure, not object value">,
+ InGroup<Format>;
def warn_printf_ignored_flag: Warning<
"flag '%0' is ignored when flag '%1' is present">,
InGroup<Format>;
@@ -9950,6 +9953,8 @@ def warn_ret_stack_addr_ref : Warning<
def warn_ret_local_temp_addr_ref : Warning<
"returning %select{address of|reference to}0 local temporary object">,
InGroup<ReturnStackAddress>;
+def err_ret_local_temp_ref : Error<
+ "returning reference to local temporary object">;
def warn_ret_addr_label : Warning<
"returning address of label, which is local">,
InGroup<ReturnStackAddress>;
@@ -10328,9 +10333,13 @@ def err_shufflevector_nonconstant_argument : Error<
def err_shufflevector_argument_too_large : Error<
"index for __builtin_shufflevector must be less than the total number "
"of vector elements">;
+def err_shufflevector_minus_one_is_undefined_behavior_constexpr : Error<
+  "index for __builtin_shufflevector not within the bounds of the input vectors; index of -1 found at position %0 is not permitted in a constexpr context">;
def err_convertvector_non_vector : Error<
"first argument to __builtin_convertvector must be a vector">;
+def err_convertvector_constexpr_unsupported_vector_cast : Error<
+  "unsupported vector cast from %0 to %1 in a constant expression">;
def err_builtin_non_vector_type : Error<
"%0 argument to %1 must be of vector type">;
def err_convertvector_incompatible_vector : Error<
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 6d655c39360d..6390ba3f9fe5 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -275,7 +275,7 @@ def OP_VCVT_BF16_F32_HI_A32
(call "vget_low", $p0))>;
def OP_CVT_F32_BF16
- : Op<(bitcast "R", (op "<<", (bitcast "int32_t", $p0),
+ : Op<(bitcast "R", (op "<<", (cast "int32_t", (bitcast "int16_t", $p0)),
(literal "int32_t", "16")))>;
//===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 086aedefc118..25f479dccc3c 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2886,6 +2886,17 @@ def flax_vector_conversions : Flag<["-"], "flax-vector-conversions">, Group<f_Gr
def flimited_precision_EQ : Joined<["-"], "flimited-precision=">, Group<f_Group>;
def fapple_link_rtlib : Flag<["-"], "fapple-link-rtlib">, Group<f_Group>,
HelpText<"Force linking the clang builtins runtime library">;
+
+/// ClangIR-specific options - BEGIN
+defm clangir : BoolFOption<"clangir",
+ FrontendOpts<"UseClangIRPipeline">, DefaultFalse,
+ PosFlag<SetTrue, [], [ClangOption, CC1Option], "Use the ClangIR pipeline to compile">,
+ NegFlag<SetFalse, [], [ClangOption, CC1Option], "Use the AST -> LLVM pipeline to compile">,
+ BothFlags<[], [ClangOption, CC1Option], "">>;
+def emit_cir : Flag<["-"], "emit-cir">, Visibility<[CC1Option]>,
+ Group<Action_Group>, HelpText<"Build ASTs and then lower to ClangIR">;
+/// ClangIR-specific options - END
+
def flto_EQ : Joined<["-"], "flto=">,
Visibility<[ClangOption, CLOption, CC1Option, FC1Option, FlangOption]>,
Group<f_Group>,
@@ -6589,12 +6600,6 @@ def J : JoinedOrSeparate<["-"], "J">,
Group<gfortran_Group>,
Alias<module_dir>;
-let Visibility = [FlangOption] in {
-def no_fortran_main : Flag<["-"], "fno-fortran-main">,
- Visibility<[FlangOption]>, Group<f_Group>,
- HelpText<"Do not include Fortran_main.a (provided by Flang) when linking">;
-} // let Visibility = [ FlangOption ]
-
//===----------------------------------------------------------------------===//
// FC1 Options
//===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/Frontend/FrontendOptions.h b/clang/include/clang/Frontend/FrontendOptions.h
index a738c1f37576..bd4981ca0ac0 100644
--- a/clang/include/clang/Frontend/FrontendOptions.h
+++ b/clang/include/clang/Frontend/FrontendOptions.h
@@ -65,6 +65,9 @@ enum ActionKind {
/// Translate input source into HTML.
EmitHTML,
+  /// Emit a .cir file.
+ EmitCIR,
+
/// Emit a .ll file.
EmitLLVM,
@@ -408,6 +411,10 @@ public:
LLVM_PREFERRED_TYPE(bool)
unsigned GenReducedBMI : 1;
+  /// Use the ClangIR pipeline to emit code.
+ LLVM_PREFERRED_TYPE(bool)
+ unsigned UseClangIRPipeline : 1;
+
CodeCompleteOptions CodeCompleteOpts;
/// Specifies the output format of the AST.
@@ -590,7 +597,7 @@ public:
EmitSymbolGraph(false), EmitExtensionSymbolGraphs(false),
EmitSymbolGraphSymbolLabelsForTesting(false),
EmitPrettySymbolGraphs(false), GenReducedBMI(false),
- TimeTraceGranularity(500) {}
+ UseClangIRPipeline(false), TimeTraceGranularity(500) {}
/// getInputKindForExtension - Return the appropriate input kind for a file
/// extension. For example, "c" would return Language::C.
diff --git a/clang/include/clang/Sema/DeclSpec.h b/clang/include/clang/Sema/DeclSpec.h
index 760c7980be52..23bc780e0497 100644
--- a/clang/include/clang/Sema/DeclSpec.h
+++ b/clang/include/clang/Sema/DeclSpec.h
@@ -1811,15 +1811,15 @@ public:
: Bindings(nullptr), NumBindings(0), DeleteBindings(false) {}
DecompositionDeclarator(const DecompositionDeclarator &G) = delete;
DecompositionDeclarator &operator=(const DecompositionDeclarator &G) = delete;
- ~DecompositionDeclarator() {
- if (DeleteBindings)
- delete[] Bindings;
- }
+ ~DecompositionDeclarator() { clear(); }
void clear() {
LSquareLoc = RSquareLoc = SourceLocation();
if (DeleteBindings)
delete[] Bindings;
+ else
+ llvm::for_each(llvm::MutableArrayRef(Bindings, NumBindings),
+ [](Binding &B) { B.Attrs.reset(); });
Bindings = nullptr;
NumBindings = 0;
DeleteBindings = false;
diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp
index f341c74cf86e..03e1055251c2 100644
--- a/clang/lib/AST/DeclBase.cpp
+++ b/clang/lib/AST/DeclBase.cpp
@@ -1115,7 +1115,7 @@ int64_t Decl::getID() const {
const FunctionType *Decl::getFunctionType(bool BlocksToo) const {
QualType Ty;
- if (const auto *D = dyn_cast<BindingDecl>(this))
+ if (isa<BindingDecl>(this))
return nullptr;
else if (const auto *D = dyn_cast<ValueDecl>(this))
Ty = D->getType();
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index ea3e7304a742..f1aa19e4409e 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -2706,7 +2706,11 @@ static bool checkFloatingPointResult(EvalInfo &Info, const Expr *E,
static bool HandleFloatToFloatCast(EvalInfo &Info, const Expr *E,
QualType SrcType, QualType DestType,
APFloat &Result) {
- assert(isa<CastExpr>(E) || isa<CompoundAssignOperator>(E));
+  assert((isa<CastExpr>(E) || isa<CompoundAssignOperator>(E) ||
+          isa<ConvertVectorExpr>(E)) &&
+         "HandleFloatToFloatCast has only been checked with CastExpr, "
+         "CompoundAssignOperator and ConvertVectorExpr. Please either validate "
+         "the new expression or address the root cause of this usage.");
llvm::RoundingMode RM = getActiveRoundingMode(Info, E);
APFloat::opStatus St;
APFloat Value = Result;
@@ -9237,9 +9241,10 @@ bool PointerExprEvaluator::VisitCastExpr(const CastExpr *E) {
bool HasValidResult = !Result.InvalidBase && !Result.Designator.Invalid &&
!Result.IsNullPtr;
bool VoidPtrCastMaybeOK =
- HasValidResult &&
- Info.Ctx.hasSameUnqualifiedType(Result.Designator.getType(Info.Ctx),
- E->getType()->getPointeeType());
+ Result.IsNullPtr ||
+ (HasValidResult &&
+ Info.Ctx.hasSimilarType(Result.Designator.getType(Info.Ctx),
+ E->getType()->getPointeeType()));
// 1. We'll allow it in std::allocator::allocate, and anything which that
// calls.
// 2. HACK 2022-03-28: Work around an issue with libstdc++'s
@@ -10709,8 +10714,11 @@ namespace {
bool VisitUnaryImag(const UnaryOperator *E);
bool VisitBinaryOperator(const BinaryOperator *E);
bool VisitUnaryOperator(const UnaryOperator *E);
+ bool VisitConvertVectorExpr(const ConvertVectorExpr *E);
+ bool VisitShuffleVectorExpr(const ShuffleVectorExpr *E);
+
// FIXME: Missing: conditional operator (for GNU
- // conditional select), shufflevector, ExtVectorElementExpr
+ // conditional select), ExtVectorElementExpr
};
} // end anonymous namespace
@@ -10961,6 +10969,122 @@ bool VectorExprEvaluator::VisitUnaryOperator(const UnaryOperator *E) {
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
+static bool handleVectorElementCast(EvalInfo &Info, const FPOptions FPO,
+ const Expr *E, QualType SourceTy,
+ QualType DestTy, APValue const &Original,
+ APValue &Result) {
+ if (SourceTy->isIntegerType()) {
+ if (DestTy->isRealFloatingType()) {
+ Result = APValue(APFloat(0.0));
+ return HandleIntToFloatCast(Info, E, FPO, SourceTy, Original.getInt(),
+ DestTy, Result.getFloat());
+ }
+ if (DestTy->isIntegerType()) {
+ Result = APValue(
+ HandleIntToIntCast(Info, E, DestTy, SourceTy, Original.getInt()));
+ return true;
+ }
+ } else if (SourceTy->isRealFloatingType()) {
+ if (DestTy->isRealFloatingType()) {
+ Result = Original;
+ return HandleFloatToFloatCast(Info, E, SourceTy, DestTy,
+ Result.getFloat());
+ }
+ if (DestTy->isIntegerType()) {
+ Result = APValue(APSInt());
+ return HandleFloatToIntCast(Info, E, SourceTy, Original.getFloat(),
+ DestTy, Result.getInt());
+ }
+ }
+
+ Info.FFDiag(E, diag::err_convertvector_constexpr_unsupported_vector_cast)
+ << SourceTy << DestTy;
+ return false;
+}
+
+bool VectorExprEvaluator::VisitConvertVectorExpr(const ConvertVectorExpr *E) {
+ APValue Source;
+ QualType SourceVecType = E->getSrcExpr()->getType();
+ if (!EvaluateAsRValue(Info, E->getSrcExpr(), Source))
+ return false;
+
+ QualType DestTy = E->getType()->castAs<VectorType>()->getElementType();
+ QualType SourceTy = SourceVecType->castAs<VectorType>()->getElementType();
+
+ const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
+
+ auto SourceLen = Source.getVectorLength();
+ SmallVector<APValue, 4> ResultElements;
+ ResultElements.reserve(SourceLen);
+ for (unsigned EltNum = 0; EltNum < SourceLen; ++EltNum) {
+ APValue Elt;
+ if (!handleVectorElementCast(Info, FPO, E, SourceTy, DestTy,
+ Source.getVectorElt(EltNum), Elt))
+ return false;
+ ResultElements.push_back(std::move(Elt));
+ }
+
+ return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+}
+
+static bool handleVectorShuffle(EvalInfo &Info, const ShuffleVectorExpr *E,
+ QualType ElemType, APValue const &VecVal1,
+ APValue const &VecVal2, unsigned EltNum,
+ APValue &Result) {
+ unsigned const TotalElementsInInputVector1 = VecVal1.getVectorLength();
+ unsigned const TotalElementsInInputVector2 = VecVal2.getVectorLength();
+
+ APSInt IndexVal = E->getShuffleMaskIdx(Info.Ctx, EltNum);
+ int64_t index = IndexVal.getExtValue();
+ // The spec says that -1 should be treated as undef for optimizations,
+ // but in constexpr we'd have to produce an APValue::Indeterminate,
+ // which is prohibited from being a top-level constant value. Emit a
+ // diagnostic instead.
+ if (index == -1) {
+ Info.FFDiag(
+ E, diag::err_shufflevector_minus_one_is_undefined_behavior_constexpr)
+ << EltNum;
+ return false;
+ }
+
+ if (index < 0 ||
+ index >= TotalElementsInInputVector1 + TotalElementsInInputVector2)
+ llvm_unreachable("Out of bounds shuffle index");
+
+ if (index >= TotalElementsInInputVector1)
+ Result = VecVal2.getVectorElt(index - TotalElementsInInputVector1);
+ else
+ Result = VecVal1.getVectorElt(index);
+ return true;
+}
+
+bool VectorExprEvaluator::VisitShuffleVectorExpr(const ShuffleVectorExpr *E) {
+ APValue VecVal1;
+ const Expr *Vec1 = E->getExpr(0);
+ if (!EvaluateAsRValue(Info, Vec1, VecVal1))
+ return false;
+ APValue VecVal2;
+ const Expr *Vec2 = E->getExpr(1);
+ if (!EvaluateAsRValue(Info, Vec2, VecVal2))
+ return false;
+
+ VectorType const *DestVecTy = E->getType()->castAs<VectorType>();
+ QualType DestElTy = DestVecTy->getElementType();
+
+ auto TotalElementsInOutputVector = DestVecTy->getNumElements();
+
+ SmallVector<APValue, 4> ResultElements;
+ ResultElements.reserve(TotalElementsInOutputVector);
+ for (unsigned EltNum = 0; EltNum < TotalElementsInOutputVector; ++EltNum) {
+ APValue Elt;
+ if (!handleVectorShuffle(Info, E, DestElTy, VecVal1, VecVal2, EltNum, Elt))
+ return false;
+ ResultElements.push_back(std::move(Elt));
+ }
+
+ return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+}
+
//===----------------------------------------------------------------------===//
// Array Evaluation
//===----------------------------------------------------------------------===//
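To make the index mapping in ``handleVectorShuffle`` concrete: mask entries below the first operand's length select from the first vector, the remainder select from the second after subtracting that length, and ``-1`` is diagnosed rather than treated as undef. A hypothetical user-level view:

.. code-block:: c++

   typedef int v4si __attribute__((vector_size(16)));

   constexpr v4si a = {10, 11, 12, 13};
   constexpr v4si b = {20, 21, 22, 23};

   // 2 -> a[2], 5 -> b[5 - 4], 0 -> a[0], 7 -> b[7 - 4].
   constexpr v4si r = __builtin_shufflevector(a, b, 2, 5, 0, 7); // {12, 21, 10, 23}

   // A -1 ("don't care") mask element stays valid at runtime, but per the
   // new diagnostic it is an error during constant evaluation:
   //   constexpr v4si bad = __builtin_shufflevector(a, b, -1, 0, 1, 2);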
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index 588ffa55c11e..17f95e7f3cac 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -212,6 +212,13 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) {
if (!this->visit(SubExpr))
return false;
+ // If SubExpr doesn't result in a pointer, make it one.
+ if (PrimType FromT = classifyPrim(SubExpr->getType()); FromT != PT_Ptr) {
+ assert(isPtrType(FromT));
+ if (!this->emitDecayPtr(FromT, PT_Ptr, CE))
+ return false;
+ }
+
PrimType T = classifyPrim(CE->getType());
if (T == PT_IntAP)
return this->emitCastPointerIntegralAP(Ctx.getBitWidth(CE->getType()),
@@ -924,8 +931,31 @@ bool ByteCodeExprGen<Emitter>::VisitImplicitValueInitExpr(const ImplicitValueIni
if (std::optional<PrimType> T = classify(QT))
return this->visitZeroInitializer(*T, QT, E);
- if (QT->isRecordType())
- return false;
+ if (QT->isRecordType()) {
+ const RecordDecl *RD = QT->getAsRecordDecl();
+ assert(RD);
+ if (RD->isInvalidDecl())
+ return false;
+ if (RD->isUnion()) {
+      // C++11 [dcl.init]p5: If T is a (possibly cv-qualified) union type, the
+      // object's first non-static named data member is zero-initialized.
+      // FIXME: Implement union zero-initialization here.
+ return false;
+ }
+
+ if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD);
+ CXXRD && CXXRD->getNumVBases() > 0) {
+ // TODO: Diagnose.
+ return false;
+ }
+
+ const Record *R = getRecord(QT);
+ if (!R)
+ return false;
+
+ assert(Initializing);
+ return this->visitZeroRecordInitializer(R, E);
+ }
if (QT->isIncompleteArrayType())
return true;
@@ -1000,122 +1030,98 @@ bool ByteCodeExprGen<Emitter>::VisitArraySubscriptExpr(
template <class Emitter>
bool ByteCodeExprGen<Emitter>::visitInitList(ArrayRef<const Expr *> Inits,
+ const Expr *ArrayFiller,
const Expr *E) {
- assert(E->getType()->isRecordType());
- const Record *R = getRecord(E->getType());
+ if (E->getType()->isVoidType())
+ return this->emitInvalid(E);
- if (Inits.size() == 1 && E->getType() == Inits[0]->getType()) {
- return this->visitInitializer(Inits[0]);
+ // Handle discarding first.
+ if (DiscardResult) {
+ for (const Expr *Init : Inits) {
+ if (!this->discard(Init))
+ return false;
+ }
+ return true;
}
- unsigned InitIndex = 0;
- for (const Expr *Init : Inits) {
- // Skip unnamed bitfields.
- while (InitIndex < R->getNumFields() &&
- R->getField(InitIndex)->Decl->isUnnamedBitField())
- ++InitIndex;
+ // Primitive values.
+ if (std::optional<PrimType> T = classify(E->getType())) {
+ assert(!DiscardResult);
+ if (Inits.size() == 0)
+ return this->visitZeroInitializer(*T, E->getType(), E);
+ assert(Inits.size() == 1);
+ return this->delegate(Inits[0]);
+ }
- if (!this->emitDupPtr(E))
- return false;
+ QualType T = E->getType();
+ if (T->isRecordType()) {
+ const Record *R = getRecord(E->getType());
- if (std::optional<PrimType> T = classify(Init)) {
- const Record::Field *FieldToInit = R->getField(InitIndex);
- if (!this->visit(Init))
- return false;
+ if (Inits.size() == 1 && E->getType() == Inits[0]->getType()) {
+ return this->visitInitializer(Inits[0]);
+ }
- if (FieldToInit->isBitField()) {
- if (!this->emitInitBitField(*T, FieldToInit, E))
- return false;
- } else {
- if (!this->emitInitField(*T, FieldToInit->Offset, E))
- return false;
- }
+ unsigned InitIndex = 0;
+ for (const Expr *Init : Inits) {
+ // Skip unnamed bitfields.
+ while (InitIndex < R->getNumFields() &&
+ R->getField(InitIndex)->Decl->isUnnamedBitField())
+ ++InitIndex;
- if (!this->emitPopPtr(E))
+ if (!this->emitDupPtr(E))
return false;
- ++InitIndex;
- } else {
- // Initializer for a direct base class.
- if (const Record::Base *B = R->getBase(Init->getType())) {
- if (!this->emitGetPtrBasePop(B->Offset, Init))
- return false;
-
- if (!this->visitInitializer(Init))
- return false;
- if (!this->emitFinishInitPop(E))
- return false;
- // Base initializers don't increase InitIndex, since they don't count
- // into the Record's fields.
- } else {
+ if (std::optional<PrimType> T = classify(Init)) {
const Record::Field *FieldToInit = R->getField(InitIndex);
- // Non-primitive case. Get a pointer to the field-to-initialize
- // on the stack and recurse into visitInitializer().
- if (!this->emitGetPtrField(FieldToInit->Offset, Init))
+ if (!this->visit(Init))
return false;
- if (!this->visitInitializer(Init))
- return false;
+ if (FieldToInit->isBitField()) {
+ if (!this->emitInitBitField(*T, FieldToInit, E))
+ return false;
+ } else {
+ if (!this->emitInitField(*T, FieldToInit->Offset, E))
+ return false;
+ }
if (!this->emitPopPtr(E))
return false;
++InitIndex;
- }
- }
- }
- return true;
-}
+ } else {
+ // Initializer for a direct base class.
+ if (const Record::Base *B = R->getBase(Init->getType())) {
+ if (!this->emitGetPtrBasePop(B->Offset, Init))
+ return false;
-/// Pointer to the array(not the element!) must be on the stack when calling
-/// this.
-template <class Emitter>
-bool ByteCodeExprGen<Emitter>::visitArrayElemInit(unsigned ElemIndex,
- const Expr *Init) {
- if (std::optional<PrimType> T = classify(Init->getType())) {
- // Visit the primitive element like normal.
- if (!this->visit(Init))
- return false;
- return this->emitInitElem(*T, ElemIndex, Init);
- }
+ if (!this->visitInitializer(Init))
+ return false;
- // Advance the pointer currently on the stack to the given
- // dimension.
- if (!this->emitConstUint32(ElemIndex, Init))
- return false;
- if (!this->emitArrayElemPtrUint32(Init))
- return false;
- if (!this->visitInitializer(Init))
- return false;
- return this->emitFinishInitPop(Init);
-}
+ if (!this->emitFinishInitPop(E))
+ return false;
+ // Base initializers don't increase InitIndex, since they don't count
+ // into the Record's fields.
+ } else {
+ const Record::Field *FieldToInit = R->getField(InitIndex);
+ // Non-primitive case. Get a pointer to the field-to-initialize
+ // on the stack and recurse into visitInitializer().
+ if (!this->emitGetPtrField(FieldToInit->Offset, Init))
+ return false;
-template <class Emitter>
-bool ByteCodeExprGen<Emitter>::VisitInitListExpr(const InitListExpr *E) {
- // Handle discarding first.
- if (DiscardResult) {
- for (const Expr *Init : E->inits()) {
- if (!this->discard(Init))
- return false;
+ if (!this->visitInitializer(Init))
+ return false;
+
+ if (!this->emitPopPtr(E))
+ return false;
+ ++InitIndex;
+ }
+ }
}
return true;
}
- // Primitive values.
- if (std::optional<PrimType> T = classify(E->getType())) {
- assert(!DiscardResult);
- if (E->getNumInits() == 0)
- return this->visitZeroInitializer(*T, E->getType(), E);
- assert(E->getNumInits() == 1);
- return this->delegate(E->inits()[0]);
- }
-
- QualType T = E->getType();
- if (T->isRecordType())
- return this->visitInitList(E->inits(), E);
-
if (T->isArrayType()) {
unsigned ElementIndex = 0;
- for (const Expr *Init : E->inits()) {
+ for (const Expr *Init : Inits) {
if (!this->visitArrayElemInit(ElementIndex, Init))
return false;
++ElementIndex;
@@ -1123,13 +1129,13 @@ bool ByteCodeExprGen<Emitter>::VisitInitListExpr(const InitListExpr *E) {
// Expand the filler expression.
// FIXME: This should go away.
- if (const Expr *Filler = E->getArrayFiller()) {
+ if (ArrayFiller) {
const ConstantArrayType *CAT =
Ctx.getASTContext().getAsConstantArrayType(E->getType());
uint64_t NumElems = CAT->getZExtSize();
for (; ElementIndex != NumElems; ++ElementIndex) {
- if (!this->visitArrayElemInit(ElementIndex, Filler))
+ if (!this->visitArrayElemInit(ElementIndex, ArrayFiller))
return false;
}
}
@@ -1138,10 +1144,10 @@ bool ByteCodeExprGen<Emitter>::VisitInitListExpr(const InitListExpr *E) {
}
if (const auto *ComplexTy = E->getType()->getAs<ComplexType>()) {
- unsigned NumInits = E->getNumInits();
+ unsigned NumInits = Inits.size();
if (NumInits == 1)
- return this->delegate(E->inits()[0]);
+ return this->delegate(Inits[0]);
QualType ElemQT = ComplexTy->getElementType();
PrimType ElemT = classifyPrim(ElemQT);
@@ -1155,7 +1161,7 @@ bool ByteCodeExprGen<Emitter>::VisitInitListExpr(const InitListExpr *E) {
}
} else if (NumInits == 2) {
unsigned InitIndex = 0;
- for (const Expr *Init : E->inits()) {
+ for (const Expr *Init : Inits) {
if (!this->visit(Init))
return false;
@@ -1169,14 +1175,14 @@ bool ByteCodeExprGen<Emitter>::VisitInitListExpr(const InitListExpr *E) {
if (const auto *VecT = E->getType()->getAs<VectorType>()) {
unsigned NumVecElements = VecT->getNumElements();
- assert(NumVecElements >= E->getNumInits());
+ assert(NumVecElements >= Inits.size());
QualType ElemQT = VecT->getElementType();
PrimType ElemT = classifyPrim(ElemQT);
// All initializer elements.
unsigned InitIndex = 0;
- for (const Expr *Init : E->inits()) {
+ for (const Expr *Init : Inits) {
if (!this->visit(Init))
return false;
@@ -1198,19 +1204,38 @@ bool ByteCodeExprGen<Emitter>::VisitInitListExpr(const InitListExpr *E) {
return false;
}
+/// A pointer to the array (not the element!) must be on the stack when
+/// calling this.
template <class Emitter>
-bool ByteCodeExprGen<Emitter>::VisitCXXParenListInitExpr(
- const CXXParenListInitExpr *E) {
- if (DiscardResult) {
- for (const Expr *Init : E->getInitExprs()) {
- if (!this->discard(Init))
- return false;
- }
- return true;
+bool ByteCodeExprGen<Emitter>::visitArrayElemInit(unsigned ElemIndex,
+ const Expr *Init) {
+ if (std::optional<PrimType> T = classify(Init->getType())) {
+ // Visit the primitive element like normal.
+ if (!this->visit(Init))
+ return false;
+ return this->emitInitElem(*T, ElemIndex, Init);
}
- assert(E->getType()->isRecordType());
- return this->visitInitList(E->getInitExprs(), E);
+ // Advance the pointer currently on the stack to the given
+ // dimension.
+ if (!this->emitConstUint32(ElemIndex, Init))
+ return false;
+ if (!this->emitArrayElemPtrUint32(Init))
+ return false;
+ if (!this->visitInitializer(Init))
+ return false;
+ return this->emitFinishInitPop(Init);
+}
+
+template <class Emitter>
+bool ByteCodeExprGen<Emitter>::VisitInitListExpr(const InitListExpr *E) {
+ return this->visitInitList(E->inits(), E->getArrayFiller(), E);
+}
+
+template <class Emitter>
+bool ByteCodeExprGen<Emitter>::VisitCXXParenListInitExpr(
+ const CXXParenListInitExpr *E) {
+ return this->visitInitList(E->getInitExprs(), E->getArrayFiller(), E);
}
template <class Emitter>
@@ -1333,6 +1358,20 @@ bool ByteCodeExprGen<Emitter>::VisitUnaryExprOrTypeTraitExpr(
assert(E->getTypeOfArgument()->isSizelessVectorType());
}
+ if (Kind == UETT_VecStep) {
+ if (const auto *VT = E->getTypeOfArgument()->getAs<VectorType>()) {
+ unsigned N = VT->getNumElements();
+
+ // The vec_step built-in functions that take a 3-component
+ // vector return 4. (OpenCL 1.1 spec 6.11.12)
+ if (N == 3)
+ N = 4;
+
+ return this->emitConst(N, E);
+ }
+ return this->emitConst(1, E);
+ }
+
return false;
}
@@ -2340,8 +2379,7 @@ bool ByteCodeExprGen<Emitter>::VisitCXXUuidofExpr(const CXXUuidofExpr *E) {
if (!this->emitGetPtrGlobal(*GlobalIndex, E))
return false;
- const Record *R = this->getRecord(E->getType());
- assert(R);
+ assert(this->getRecord(E->getType()));
const APValue &V = E->getGuidDecl()->getAsAPValue();
if (V.getKind() == APValue::None)
@@ -2349,41 +2387,8 @@ bool ByteCodeExprGen<Emitter>::VisitCXXUuidofExpr(const CXXUuidofExpr *E) {
assert(V.isStruct());
assert(V.getStructNumBases() == 0);
- // FIXME: This could be useful in visitAPValue, too.
- for (unsigned I = 0, N = V.getStructNumFields(); I != N; ++I) {
- const APValue &F = V.getStructField(I);
- const Record::Field *RF = R->getField(I);
-
- if (F.isInt()) {
- PrimType T = classifyPrim(RF->Decl->getType());
- if (!this->visitAPValue(F, T, E))
- return false;
- if (!this->emitInitField(T, RF->Offset, E))
- return false;
- } else if (F.isArray()) {
- assert(RF->Desc->isPrimitiveArray());
- const auto *ArrType = RF->Decl->getType()->getAsArrayTypeUnsafe();
- PrimType ElemT = classifyPrim(ArrType->getElementType());
- assert(ArrType);
-
- if (!this->emitDupPtr(E))
- return false;
- if (!this->emitGetPtrField(RF->Offset, E))
- return false;
-
- for (unsigned A = 0, AN = F.getArraySize(); A != AN; ++A) {
- if (!this->visitAPValue(F.getArrayInitializedElt(A), ElemT, E))
- return false;
- if (!this->emitInitElem(ElemT, A, E))
- return false;
- }
-
- if (!this->emitPopPtr(E))
- return false;
- } else {
- assert(false && "I don't think this should be possible");
- }
- }
+ if (!this->visitAPValueInitializer(V, E))
+ return false;
return this->emitFinishInit(E);
}
@@ -2949,6 +2954,54 @@ bool ByteCodeExprGen<Emitter>::visitAPValue(const APValue &Val,
}
template <class Emitter>
+bool ByteCodeExprGen<Emitter>::visitAPValueInitializer(const APValue &Val,
+ const Expr *E) {
+ if (Val.isStruct()) {
+ const Record *R = this->getRecord(E->getType());
+ assert(R);
+
+ for (unsigned I = 0, N = Val.getStructNumFields(); I != N; ++I) {
+ const APValue &F = Val.getStructField(I);
+ const Record::Field *RF = R->getField(I);
+
+ if (F.isInt()) {
+ PrimType T = classifyPrim(RF->Decl->getType());
+ if (!this->visitAPValue(F, T, E))
+ return false;
+ if (!this->emitInitField(T, RF->Offset, E))
+ return false;
+ } else if (F.isArray()) {
+ assert(RF->Desc->isPrimitiveArray());
+ const auto *ArrType = RF->Decl->getType()->getAsArrayTypeUnsafe();
+ PrimType ElemT = classifyPrim(ArrType->getElementType());
+ assert(ArrType);
+
+ if (!this->emitDupPtr(E))
+ return false;
+ if (!this->emitGetPtrField(RF->Offset, E))
+ return false;
+
+ for (unsigned A = 0, AN = F.getArraySize(); A != AN; ++A) {
+ if (!this->visitAPValue(F.getArrayInitializedElt(A), ElemT, E))
+ return false;
+ if (!this->emitInitElem(ElemT, A, E))
+ return false;
+ }
+
+ if (!this->emitPopPtr(E))
+ return false;
+ } else {
+ assert(false && "I don't think this should be possible");
+ }
+ }
+ return true;
+ }
+ // TODO: Other types.
+
+ return false;
+}
+
+template <class Emitter>
bool ByteCodeExprGen<Emitter>::VisitBuiltinCallExpr(const CallExpr *E) {
const Function *Func = getFunction(E->getDirectCallee());
if (!Func)
@@ -3469,9 +3522,17 @@ bool ByteCodeExprGen<Emitter>::VisitDeclRefExpr(const DeclRefExpr *E) {
} else if (const auto *FuncDecl = dyn_cast<FunctionDecl>(D)) {
const Function *F = getFunction(FuncDecl);
return F && this->emitGetFnPtr(F, E);
- } else if (isa<TemplateParamObjectDecl>(D)) {
- if (std::optional<unsigned> Index = P.getOrCreateGlobal(D))
- return this->emitGetPtrGlobal(*Index, E);
+ } else if (const auto *TPOD = dyn_cast<TemplateParamObjectDecl>(D)) {
+ if (std::optional<unsigned> Index = P.getOrCreateGlobal(D)) {
+ if (!this->emitGetPtrGlobal(*Index, E))
+ return false;
+ if (std::optional<PrimType> T = classify(E->getType())) {
+ if (!this->visitAPValue(TPOD->getValue(), *T, E))
+ return false;
+ return this->emitInitGlobal(*T, *Index, E);
+ }
+ return this->visitAPValueInitializer(TPOD->getValue(), E);
+ }
return false;
}
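In source terms, the unified ``visitInitList`` means the bytecode interpreter now drives braced lists, C++20 parenthesized aggregate initialization, and array fillers through one path. A sketch of inputs that exercise it (illustrative, not taken from the diff):

.. code-block:: c++

   struct P { int x; int y; };

   constexpr P a{1, 2};            // InitListExpr
   constexpr P b = P(1, 2);        // CXXParenListInitExpr (C++20)
   constexpr int arr[4] = {1, 2};  // remaining elements use the array filler
   static_assert(a.x == 1 && b.y == 2 && arr[3] == 0);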
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h
index 4a57f76ae5b3..a89e37c67aa6 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.h
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.h
@@ -181,6 +181,7 @@ protected:
bool visitVarDecl(const VarDecl *VD);
/// Visit an APValue.
bool visitAPValue(const APValue &Val, PrimType ValType, const Expr *E);
+ bool visitAPValueInitializer(const APValue &Val, const Expr *E);
/// Visits an expression and converts it to a boolean.
bool visitBool(const Expr *E);
@@ -224,7 +225,8 @@ protected:
return this->emitFinishInitPop(I);
}
- bool visitInitList(ArrayRef<const Expr *> Inits, const Expr *E);
+ bool visitInitList(ArrayRef<const Expr *> Inits, const Expr *ArrayFiller,
+ const Expr *E);
bool visitArrayElemInit(unsigned ElemIndex, const Expr *Init);
/// Creates a local primitive value.
diff --git a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
index ec2fe39a8aea..ff91baf595f1 100644
--- a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
@@ -332,7 +332,8 @@ bool ByteCodeStmtGen<Emitter>::visitCompoundStmt(
template <class Emitter>
bool ByteCodeStmtGen<Emitter>::visitDeclStmt(const DeclStmt *DS) {
for (auto *D : DS->decls()) {
- if (isa<StaticAssertDecl, TagDecl, TypedefNameDecl, UsingEnumDecl>(D))
+ if (isa<StaticAssertDecl, TagDecl, TypedefNameDecl, UsingEnumDecl,
+ FunctionDecl>(D))
continue;
const auto *VD = dyn_cast<VarDecl>(D);
diff --git a/clang/lib/AST/Interp/Disasm.cpp b/clang/lib/AST/Interp/Disasm.cpp
index 01cc88ea9a84..ccdc96a79436 100644
--- a/clang/lib/AST/Interp/Disasm.cpp
+++ b/clang/lib/AST/Interp/Disasm.cpp
@@ -200,7 +200,7 @@ LLVM_DUMP_METHOD void Descriptor::dump(llvm::raw_ostream &OS) const {
OS << " primitive";
if (isZeroSizeArray())
- OS << " zero-size-arrary";
+ OS << " zero-size-array";
else if (isUnknownSizeArray())
OS << " unknown-size-array";
diff --git a/clang/lib/AST/Interp/Program.cpp b/clang/lib/AST/Interp/Program.cpp
index 3773e0662f78..02075c20cf55 100644
--- a/clang/lib/AST/Interp/Program.cpp
+++ b/clang/lib/AST/Interp/Program.cpp
@@ -173,7 +173,8 @@ std::optional<unsigned> Program::createGlobal(const ValueDecl *VD,
if (const auto *Var = dyn_cast<VarDecl>(VD)) {
IsStatic = Context::shouldBeGloballyIndexed(VD);
IsExtern = Var->hasExternalStorage();
- } else if (isa<UnnamedGlobalConstantDecl, MSGuidDecl>(VD)) {
+ } else if (isa<UnnamedGlobalConstantDecl, MSGuidDecl,
+ TemplateParamObjectDecl>(VD)) {
IsStatic = true;
IsExtern = false;
} else {
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 8aaa6801d85b..68e81f45b4c2 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2510,6 +2510,18 @@ bool Type::isSveVLSBuiltinType() const {
return false;
}
+QualType Type::getSizelessVectorEltType(const ASTContext &Ctx) const {
+ assert(isSizelessVectorType() && "Must be sizeless vector type");
+ // Currently supports SVE and RVV
+ if (isSVESizelessBuiltinType())
+ return getSveEltType(Ctx);
+
+ if (isRVVSizelessBuiltinType())
+ return getRVVEltType(Ctx);
+
+ llvm_unreachable("Unhandled type");
+}
+
QualType Type::getSveEltType(const ASTContext &Ctx) const {
assert(isSveVLSBuiltinType() && "unsupported type!");
diff --git a/clang/lib/Basic/Targets/WebAssembly.cpp b/clang/lib/Basic/Targets/WebAssembly.cpp
index 3d76411f890a..0db7b668d8a0 100644
--- a/clang/lib/Basic/Targets/WebAssembly.cpp
+++ b/clang/lib/Basic/Targets/WebAssembly.cpp
@@ -148,20 +148,26 @@ void WebAssemblyTargetInfo::setFeatureEnabled(llvm::StringMap<bool> &Features,
bool WebAssemblyTargetInfo::initFeatureMap(
llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, StringRef CPU,
const std::vector<std::string> &FeaturesVec) const {
- if (CPU == "bleeding-edge") {
+ auto addGenericFeatures = [&]() {
+ Features["multivalue"] = true;
+ Features["mutable-globals"] = true;
+ Features["reference-types"] = true;
+ Features["sign-ext"] = true;
+ };
+ auto addBleedingEdgeFeatures = [&]() {
+ addGenericFeatures();
Features["atomics"] = true;
Features["bulk-memory"] = true;
Features["multimemory"] = true;
- Features["mutable-globals"] = true;
Features["nontrapping-fptoint"] = true;
- Features["reference-types"] = true;
- Features["sign-ext"] = true;
Features["tail-call"] = true;
Features["half-precision"] = true;
setSIMDLevel(Features, SIMD128, true);
- } else if (CPU == "generic") {
- Features["mutable-globals"] = true;
- Features["sign-ext"] = true;
+ };
+ if (CPU == "generic") {
+ addGenericFeatures();
+ } else if (CPU == "bleeding-edge") {
+ addBleedingEdgeFeatures();
}
return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec);
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index d08ab5391489..a370734e00d3 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -3885,9 +3885,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
}
case Builtin::BI__builtin_reduce_max: {
- auto GetIntrinsicID = [](QualType QT) {
+ auto GetIntrinsicID = [this](QualType QT) {
if (auto *VecTy = QT->getAs<VectorType>())
QT = VecTy->getElementType();
+ else if (QT->isSizelessVectorType())
+ QT = QT->getSizelessVectorEltType(CGM.getContext());
+
if (QT->isSignedIntegerType())
return llvm::Intrinsic::vector_reduce_smax;
if (QT->isUnsignedIntegerType())
@@ -3900,9 +3903,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
}
case Builtin::BI__builtin_reduce_min: {
- auto GetIntrinsicID = [](QualType QT) {
+ auto GetIntrinsicID = [this](QualType QT) {
if (auto *VecTy = QT->getAs<VectorType>())
QT = VecTy->getElementType();
+ else if (QT->isSizelessVectorType())
+ QT = QT->getSizelessVectorEltType(CGM.getContext());
+
if (QT->isSignedIntegerType())
return llvm::Intrinsic::vector_reduce_smin;
if (QT->isUnsignedIntegerType())
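Matching the new ``getSizelessVectorEltType`` calls above, the reduction builtins can now see through scalable vector types. A hypothetical AArch64 SVE use, assuming a target compiled with SVE enabled (for example ``-march=armv8-a+sve``):

.. code-block:: c++

   #include <arm_sve.h>

   // svint32_t is a sizeless vector; its int32_t element type is signed,
   // so this selects llvm.vector.reduce.smax.
   int32_t reduce_max(svint32_t v) { return __builtin_reduce_max(v); }

   // An unsigned element type selects llvm.vector.reduce.umin instead.
   uint32_t reduce_min(svuint32_t v) { return __builtin_reduce_min(v); }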
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index d2d92140b6b2..69548902dc43 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -4698,11 +4698,11 @@ void CodeGenFunction::EmitCallArg(CallArgList &args, const Expr *E,
AggValueSlot Slot = args.isUsingInAlloca()
? createPlaceholderSlot(*this, type) : CreateAggTemp(type, "agg.tmp");
- bool DestroyedInCallee = true, NeedsEHCleanup = true;
+ bool DestroyedInCallee = true, NeedsCleanup = true;
if (const auto *RD = type->getAsCXXRecordDecl())
DestroyedInCallee = RD->hasNonTrivialDestructor();
else
- NeedsEHCleanup = needsEHCleanup(type.isDestructedType());
+ NeedsCleanup = type.isDestructedType();
if (DestroyedInCallee)
Slot.setExternallyDestructed();
@@ -4711,14 +4711,15 @@ void CodeGenFunction::EmitCallArg(CallArgList &args, const Expr *E,
RValue RV = Slot.asRValue();
args.add(RV, type);
- if (DestroyedInCallee && NeedsEHCleanup) {
+ if (DestroyedInCallee && NeedsCleanup) {
// Create a no-op GEP between the placeholder and the cleanup so we can
// RAUW it successfully. It also serves as a marker of the first
// instruction where the cleanup is active.
- pushFullExprCleanup<DestroyUnpassedArg>(EHCleanup, Slot.getAddress(),
- type);
+ pushFullExprCleanup<DestroyUnpassedArg>(NormalAndEHCleanup,
+ Slot.getAddress(), type);
-    // This unreachable is a temporary marker which will be removed later.
+    // This load is a temporary marker which will be removed later.
- llvm::Instruction *IsActive = Builder.CreateUnreachable();
+ llvm::Instruction *IsActive =
+ Builder.CreateFlagLoad(llvm::Constant::getNullValue(Int8PtrTy));
args.addArgCleanupDeactivation(EHStack.stable_begin(), IsActive);
}
return;
diff --git a/clang/lib/CodeGen/CGCleanup.cpp b/clang/lib/CodeGen/CGCleanup.cpp
index e6f8e6873004..469e0363b744 100644
--- a/clang/lib/CodeGen/CGCleanup.cpp
+++ b/clang/lib/CodeGen/CGCleanup.cpp
@@ -634,12 +634,19 @@ static void destroyOptimisticNormalEntry(CodeGenFunction &CGF,
/// Pops a cleanup block. If the block includes a normal cleanup, the
/// current insertion point is threaded through the cleanup, as are
/// any branch fixups on the cleanup.
-void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
+void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough,
+ bool ForDeactivation) {
assert(!EHStack.empty() && "cleanup stack is empty!");
assert(isa<EHCleanupScope>(*EHStack.begin()) && "top not a cleanup!");
EHCleanupScope &Scope = cast<EHCleanupScope>(*EHStack.begin());
assert(Scope.getFixupDepth() <= EHStack.getNumBranchFixups());
+ // If we are deactivating a normal cleanup, we need to pretend that the
+ // fallthrough is unreachable. We restore this IP before returning.
+ CGBuilderTy::InsertPoint NormalDeactivateOrigIP;
+ if (ForDeactivation && (Scope.isNormalCleanup() || !getLangOpts().EHAsynch)) {
+ NormalDeactivateOrigIP = Builder.saveAndClearIP();
+ }
// Remember activation information.
bool IsActive = Scope.isActive();
Address NormalActiveFlag =
@@ -667,7 +674,8 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
// - whether there's a fallthrough
llvm::BasicBlock *FallthroughSource = Builder.GetInsertBlock();
- bool HasFallthrough = (FallthroughSource != nullptr && IsActive);
+ bool HasFallthrough =
+ FallthroughSource != nullptr && (IsActive || HasExistingBranches);
// Branch-through fall-throughs leave the insertion point set to the
// end of the last cleanup, which points to the current scope. The
@@ -692,7 +700,11 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
// If we have a prebranched fallthrough into an inactive normal
// cleanup, rewrite it so that it leads to the appropriate place.
- if (Scope.isNormalCleanup() && HasPrebranchedFallthrough && !IsActive) {
+ if (Scope.isNormalCleanup() && HasPrebranchedFallthrough &&
+ !RequiresNormalCleanup) {
+ // FIXME: Come up with a program which would need forwarding prebranched
+ // fallthrough and add tests. Otherwise delete this and assert against it.
+ assert(!IsActive);
llvm::BasicBlock *prebranchDest;
// If the prebranch is semantically branching through the next
@@ -724,6 +736,8 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
EHStack.popCleanup(); // safe because there are no fixups
assert(EHStack.getNumBranchFixups() == 0 ||
EHStack.hasNormalCleanups());
+ if (NormalDeactivateOrigIP.isSet())
+ Builder.restoreIP(NormalDeactivateOrigIP);
return;
}
@@ -760,11 +774,19 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
if (!RequiresNormalCleanup) {
// Mark CPP scope end for passed-by-value Arg temp
// per Windows ABI which is "normally" Cleanup in callee
- if (IsEHa && getInvokeDest() && Builder.GetInsertBlock()) {
- if (Personality.isMSVCXXPersonality())
+ if (IsEHa && getInvokeDest()) {
+ // If we are deactivating a normal cleanup then we don't have a
+ // fallthrough. Restore original IP to emit CPP scope ends in the correct
+ // block.
+ if (NormalDeactivateOrigIP.isSet())
+ Builder.restoreIP(NormalDeactivateOrigIP);
+ if (Personality.isMSVCXXPersonality() && Builder.GetInsertBlock())
EmitSehCppScopeEnd();
+ if (NormalDeactivateOrigIP.isSet())
+ NormalDeactivateOrigIP = Builder.saveAndClearIP();
}
destroyOptimisticNormalEntry(*this, Scope);
+ Scope.MarkEmitted();
EHStack.popCleanup();
} else {
// If we have a fallthrough and no other need for the cleanup,
@@ -781,6 +803,7 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
}
destroyOptimisticNormalEntry(*this, Scope);
+ Scope.MarkEmitted();
EHStack.popCleanup();
EmitCleanup(*this, Fn, cleanupFlags, NormalActiveFlag);
@@ -916,6 +939,7 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
}
// IV. Pop the cleanup and emit it.
+ Scope.MarkEmitted();
EHStack.popCleanup();
assert(EHStack.hasNormalCleanups() == HasEnclosingCleanups);
@@ -984,6 +1008,8 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
}
}
+ if (NormalDeactivateOrigIP.isSet())
+ Builder.restoreIP(NormalDeactivateOrigIP);
assert(EHStack.hasNormalCleanups() || EHStack.getNumBranchFixups() == 0);
// Emit the EH cleanup if required.
@@ -1143,25 +1169,6 @@ void CodeGenFunction::EmitBranchThroughCleanup(JumpDest Dest) {
Builder.ClearInsertionPoint();
}
-static bool IsUsedAsNormalCleanup(EHScopeStack &EHStack,
- EHScopeStack::stable_iterator C) {
- // If we needed a normal block for any reason, that counts.
- if (cast<EHCleanupScope>(*EHStack.find(C)).getNormalBlock())
- return true;
-
- // Check whether any enclosed cleanups were needed.
- for (EHScopeStack::stable_iterator
- I = EHStack.getInnermostNormalCleanup();
- I != C; ) {
- assert(C.strictlyEncloses(I));
- EHCleanupScope &S = cast<EHCleanupScope>(*EHStack.find(I));
- if (S.getNormalBlock()) return true;
- I = S.getEnclosingNormalCleanup();
- }
-
- return false;
-}
-
static bool IsUsedAsEHCleanup(EHScopeStack &EHStack,
EHScopeStack::stable_iterator cleanup) {
// If we needed an EH block for any reason, that counts.
@@ -1210,8 +1217,7 @@ static void SetupCleanupBlockActivation(CodeGenFunction &CGF,
// Calculate whether the cleanup was used:
// - as a normal cleanup
- if (Scope.isNormalCleanup() &&
- (isActivatedInConditional || IsUsedAsNormalCleanup(CGF.EHStack, C))) {
+ if (Scope.isNormalCleanup()) {
Scope.setTestFlagInNormalCleanup();
needFlag = true;
}
@@ -1224,13 +1230,16 @@ static void SetupCleanupBlockActivation(CodeGenFunction &CGF,
}
// If it hasn't yet been used as either, we're done.
- if (!needFlag) return;
+ if (!needFlag)
+ return;
Address var = Scope.getActiveFlag();
if (!var.isValid()) {
+ CodeGenFunction::AllocaTrackerRAII AllocaTracker(CGF);
var = CGF.CreateTempAlloca(CGF.Builder.getInt1Ty(), CharUnits::One(),
"cleanup.isactive");
Scope.setActiveFlag(var);
+ Scope.AddAuxAllocas(AllocaTracker.Take());
assert(dominatingIP && "no existing variable and no dominating IP!");
@@ -1273,17 +1282,8 @@ void CodeGenFunction::DeactivateCleanupBlock(EHScopeStack::stable_iterator C,
// to the current RunCleanupsScope.
if (C == EHStack.stable_begin() &&
CurrentCleanupScopeDepth.strictlyEncloses(C)) {
- // Per comment below, checking EHAsynch is not really necessary
- // it's there to assure zero-impact w/o EHAsynch option
- if (!Scope.isNormalCleanup() && getLangOpts().EHAsynch) {
- PopCleanupBlock();
- } else {
- // If it's a normal cleanup, we need to pretend that the
- // fallthrough is unreachable.
- CGBuilderTy::InsertPoint SavedIP = Builder.saveAndClearIP();
- PopCleanupBlock();
- Builder.restoreIP(SavedIP);
- }
+ PopCleanupBlock(/*FallthroughIsBranchThrough=*/false,
+ /*ForDeactivation=*/true);
return;
}
diff --git a/clang/lib/CodeGen/CGCleanup.h b/clang/lib/CodeGen/CGCleanup.h
index 03e4a29d7b3d..c73c97146abc 100644
--- a/clang/lib/CodeGen/CGCleanup.h
+++ b/clang/lib/CodeGen/CGCleanup.h
@@ -16,8 +16,11 @@
#include "EHScopeStack.h"
#include "Address.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Instruction.h"
namespace llvm {
class BasicBlock;
@@ -266,6 +269,51 @@ class alignas(8) EHCleanupScope : public EHScope {
};
mutable struct ExtInfo *ExtInfo;
+  /// Erases auxiliary allocas and their uses for an unused cleanup.
+  /// Cleanups should mark these allocas as 'used' if the cleanup is
+  /// emitted; otherwise these instructions are erased.
+  struct AuxillaryAllocas {
+ SmallVector<llvm::Instruction *, 1> AuxAllocas;
+ bool used = false;
+
+ // Records a potentially unused instruction to be erased later.
+ void Add(llvm::AllocaInst *Alloca) { AuxAllocas.push_back(Alloca); }
+
+ // Mark all recorded instructions as used. These will not be erased later.
+ void MarkUsed() {
+ used = true;
+ AuxAllocas.clear();
+ }
+
+ ~AuxillaryAllocas() {
+ if (used)
+ return;
+ llvm::SetVector<llvm::Instruction *> Uses;
+ for (auto *Inst : llvm::reverse(AuxAllocas))
+ CollectUses(Inst, Uses);
+ // Delete uses in the reverse order of insertion.
+ for (auto *I : llvm::reverse(Uses))
+ I->eraseFromParent();
+ }
+
+ private:
+ void CollectUses(llvm::Instruction *I,
+ llvm::SetVector<llvm::Instruction *> &Uses) {
+ if (!I || !Uses.insert(I))
+ return;
+ for (auto *User : I->users())
+ CollectUses(cast<llvm::Instruction>(User), Uses);
+ }
+ };
+ mutable struct AuxillaryAllocas *AuxAllocas;
+
+ AuxillaryAllocas &getAuxillaryAllocas() {
+ if (!AuxAllocas) {
+ AuxAllocas = new struct AuxillaryAllocas();
+ }
+ return *AuxAllocas;
+ }
+
/// The number of fixups required by enclosing scopes (not including
/// this one). If this is the top cleanup scope, all the fixups
/// from this index onwards belong to this scope.
@@ -298,7 +346,7 @@ public:
EHScopeStack::stable_iterator enclosingEH)
: EHScope(EHScope::Cleanup, enclosingEH),
EnclosingNormal(enclosingNormal), NormalBlock(nullptr),
- ActiveFlag(Address::invalid()), ExtInfo(nullptr),
+ ActiveFlag(Address::invalid()), ExtInfo(nullptr), AuxAllocas(nullptr),
FixupDepth(fixupDepth) {
CleanupBits.IsNormalCleanup = isNormal;
CleanupBits.IsEHCleanup = isEH;
@@ -312,8 +360,15 @@ public:
}
void Destroy() {
+ if (AuxAllocas)
+ delete AuxAllocas;
delete ExtInfo;
}
+ void AddAuxAllocas(llvm::SmallVector<llvm::AllocaInst *> Allocas) {
+ for (auto *Alloca : Allocas)
+ getAuxillaryAllocas().Add(Alloca);
+ }
+ void MarkEmitted() { getAuxillaryAllocas().MarkUsed(); }
// Objects of EHCleanupScope are not destructed. Use Destroy().
~EHCleanupScope() = delete;
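The tracking protocol above is a track-or-erase pattern: record instructions only the cleanup needs, keep them if the cleanup is actually emitted, and delete them (and their transitive users) otherwise. A stripped-down sketch of the same idea with hypothetical names, independent of LLVM IR:

.. code-block:: c++

   #include <functional>
   #include <vector>

   // Hypothetical stand-in for AuxillaryAllocas: deferred deleters run at
   // destruction unless the owner marks the tracked work as used.
   class DeferredErase {
     std::vector<std::function<void()>> Deleters;
     bool Used = false;

   public:
     void Add(std::function<void()> D) { Deleters.push_back(std::move(D)); }
     void MarkUsed() {
       Used = true;
       Deleters.clear();
     }
     ~DeferredErase() {
       if (Used)
         return;
       // Erase in reverse order of insertion, as AuxillaryAllocas does.
       for (auto It = Deleters.rbegin(); It != Deleters.rend(); ++It)
         (*It)();
     }
   };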
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index ce6d6d895607..9cc67cdbe424 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -19,6 +19,7 @@
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "ConstantEmitter.h"
+#include "EHScopeStack.h"
#include "PatternInit.h"
#include "TargetInfo.h"
#include "clang/AST/ASTContext.h"
@@ -35,6 +36,7 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include <optional>
@@ -2201,6 +2203,27 @@ void CodeGenFunction::pushDestroy(CleanupKind cleanupKind, Address addr,
destroyer, useEHCleanupForArray);
}
+// Pushes a destroy and defers its deactivation until its
+// CleanupDeactivationScope is exited.
+void CodeGenFunction::pushDestroyAndDeferDeactivation(
+ QualType::DestructionKind dtorKind, Address addr, QualType type) {
+ assert(dtorKind && "cannot push destructor for trivial type");
+
+ CleanupKind cleanupKind = getCleanupKind(dtorKind);
+ pushDestroyAndDeferDeactivation(
+ cleanupKind, addr, type, getDestroyer(dtorKind), cleanupKind & EHCleanup);
+}
+
+void CodeGenFunction::pushDestroyAndDeferDeactivation(
+ CleanupKind cleanupKind, Address addr, QualType type, Destroyer *destroyer,
+ bool useEHCleanupForArray) {
+ llvm::Instruction *DominatingIP =
+ Builder.CreateFlagLoad(llvm::Constant::getNullValue(Int8PtrTy));
+ pushDestroy(cleanupKind, addr, type, destroyer, useEHCleanupForArray);
+ DeferredDeactivationCleanupStack.push_back(
+ {EHStack.stable_begin(), DominatingIP});
+}
+
void CodeGenFunction::pushStackRestore(CleanupKind Kind, Address SPMem) {
EHStack.pushCleanup<CallStackRestore>(Kind, SPMem);
}
@@ -2217,39 +2240,48 @@ void CodeGenFunction::pushLifetimeExtendedDestroy(CleanupKind cleanupKind,
// If we're not in a conditional branch, we don't need to bother generating a
// conditional cleanup.
if (!isInConditionalBranch()) {
- // Push an EH-only cleanup for the object now.
// FIXME: When popping normal cleanups, we need to keep this EH cleanup
// around in case a temporary's destructor throws an exception.
- if (cleanupKind & EHCleanup)
- EHStack.pushCleanup<DestroyObject>(
- static_cast<CleanupKind>(cleanupKind & ~NormalCleanup), addr, type,
- destroyer, useEHCleanupForArray);
+ // Add the cleanup to the EHStack. After the full-expr, it will be
+ // deactivated before being popped from the stack.
+ pushDestroyAndDeferDeactivation(cleanupKind, addr, type, destroyer,
+ useEHCleanupForArray);
+
+ // Since this is lifetime-extended, push it once again to the EHStack after
+ // the full expression.
return pushCleanupAfterFullExprWithActiveFlag<DestroyObject>(
- cleanupKind, Address::invalid(), addr, type, destroyer, useEHCleanupForArray);
+ cleanupKind, Address::invalid(), addr, type, destroyer,
+ useEHCleanupForArray);
}
// Otherwise, we should only destroy the object if it's been initialized.
- // Re-use the active flag and saved address across both the EH and end of
- // scope cleanups.
- using SavedType = typename DominatingValue<Address>::saved_type;
using ConditionalCleanupType =
EHScopeStack::ConditionalCleanup<DestroyObject, Address, QualType,
Destroyer *, bool>;
-
- Address ActiveFlag = createCleanupActiveFlag();
- SavedType SavedAddr = saveValueInCond(addr);
-
- if (cleanupKind & EHCleanup) {
- EHStack.pushCleanup<ConditionalCleanupType>(
- static_cast<CleanupKind>(cleanupKind & ~NormalCleanup), SavedAddr, type,
- destroyer, useEHCleanupForArray);
- initFullExprCleanupWithFlag(ActiveFlag);
- }
-
+ DominatingValue<Address>::saved_type SavedAddr = saveValueInCond(addr);
+
+ // Remember to emit the cleanup if we branch out before the end of the
+ // full-expression (e.g. through a stmt-expr or a coroutine suspension).
+ AllocaTrackerRAII DeactivationAllocas(*this);
+ Address ActiveFlagForDeactivation = createCleanupActiveFlag();
+
+ pushCleanupAndDeferDeactivation<ConditionalCleanupType>(
+ cleanupKind, SavedAddr, type, destroyer, useEHCleanupForArray);
+ initFullExprCleanupWithFlag(ActiveFlagForDeactivation);
+ EHCleanupScope &cleanup = cast<EHCleanupScope>(*EHStack.begin());
+ // Erase the active flag if the cleanup was not emitted.
+ cleanup.AddAuxAllocas(std::move(DeactivationAllocas).Take());
+
+ // Since this is lifetime-extended, push it once again to the EHStack after
+ // the full expression.
+ // The previous active flag will always be 'false' due to forced deferred
+ // deactivation. Use a separate flag for lifetime extension to correctly
+ // remember whether this branch was taken and the object was initialized.
+ Address ActiveFlagForLifetimeExt = createCleanupActiveFlag();
pushCleanupAfterFullExprWithActiveFlag<ConditionalCleanupType>(
- cleanupKind, ActiveFlag, SavedAddr, type, destroyer,
+ cleanupKind, ActiveFlagForLifetimeExt, SavedAddr, type, destroyer,
useEHCleanupForArray);
}
@@ -2442,9 +2474,9 @@ namespace {
};
} // end anonymous namespace
-/// pushIrregularPartialArrayCleanup - Push an EH cleanup to destroy
-/// already-constructed elements of the given array. The cleanup
-/// may be popped with DeactivateCleanupBlock or PopCleanupBlock.
+/// pushIrregularPartialArrayCleanup - Push a NormalAndEHCleanup to
+/// destroy already-constructed elements of the given array. The cleanup may be
+/// popped with DeactivateCleanupBlock or PopCleanupBlock.
///
/// \param elementType - the immediate element type of the array;
/// possibly still an array type
@@ -2453,10 +2485,9 @@ void CodeGenFunction::pushIrregularPartialArrayCleanup(llvm::Value *arrayBegin,
QualType elementType,
CharUnits elementAlign,
Destroyer *destroyer) {
- pushFullExprCleanup<IrregularPartialArrayDestroy>(EHCleanup,
- arrayBegin, arrayEndPointer,
- elementType, elementAlign,
- destroyer);
+ pushFullExprCleanup<IrregularPartialArrayDestroy>(
+ NormalAndEHCleanup, arrayBegin, arrayEndPointer, elementType,
+ elementAlign, destroyer);
}
/// pushRegularPartialArrayCleanup - Push an EH cleanup to destroy
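
The pushDestroyAndDeferDeactivation entry points added above exist because a
full-expression can be exited early, and any cleanups pushed so far must still
run on that path. A small illustration of the kind of user code involved (a
hypothetical example relying on the GNU statement-expression extension, not
taken from the patch's tests):

    struct S { S(); ~S(); };
    int g(S, int);

    int f(bool b) {
      // The statement expression can 'return' out of the middle of the
      // full-expression, after S() is constructed but before g() is called;
      // the deferred-deactivation cleanup still destroys the temporary.
      return g(S(), ({ if (b) return 0; 1; }));
    }
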
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index c94322f51e46..d96c7bb1e568 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -115,10 +115,16 @@ RawAddress CodeGenFunction::CreateTempAlloca(llvm::Type *Ty, CharUnits Align,
llvm::AllocaInst *CodeGenFunction::CreateTempAlloca(llvm::Type *Ty,
const Twine &Name,
llvm::Value *ArraySize) {
+ llvm::AllocaInst *Alloca;
if (ArraySize)
- return Builder.CreateAlloca(Ty, ArraySize, Name);
- return new llvm::AllocaInst(Ty, CGM.getDataLayout().getAllocaAddrSpace(),
- ArraySize, Name, AllocaInsertPt);
+ Alloca = Builder.CreateAlloca(Ty, ArraySize, Name);
+ else
+ Alloca = new llvm::AllocaInst(Ty, CGM.getDataLayout().getAllocaAddrSpace(),
+ ArraySize, Name, AllocaInsertPt);
+ if (Allocas) {
+ Allocas->Add(Alloca);
+ }
+ return Alloca;
}
/// CreateDefaultAlignTempAlloca - This creates an alloca with the
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index 355fec42be44..44d476976a55 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -15,6 +15,7 @@
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "ConstantEmitter.h"
+#include "EHScopeStack.h"
#include "TargetInfo.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/Attr.h"
@@ -24,6 +25,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
using namespace clang;
@@ -558,24 +560,27 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, llvm::ArrayType *AType,
// For that, we'll need an EH cleanup.
QualType::DestructionKind dtorKind = elementType.isDestructedType();
Address endOfInit = Address::invalid();
- EHScopeStack::stable_iterator cleanup;
- llvm::Instruction *cleanupDominator = nullptr;
- if (CGF.needsEHCleanup(dtorKind)) {
+ CodeGenFunction::CleanupDeactivationScope deactivation(CGF);
+
+ if (dtorKind) {
+ CodeGenFunction::AllocaTrackerRAII allocaTracker(CGF);
// In principle we could tell the cleanup where we are more
// directly, but the control flow can get so varied here that it
// would actually be quite complex. Therefore we go through an
// alloca.
+ llvm::Instruction *dominatingIP =
+ Builder.CreateFlagLoad(llvm::ConstantInt::getNullValue(CGF.Int8PtrTy));
endOfInit = CGF.CreateTempAlloca(begin->getType(), CGF.getPointerAlign(),
"arrayinit.endOfInit");
- cleanupDominator = Builder.CreateStore(begin, endOfInit);
+ Builder.CreateStore(begin, endOfInit);
CGF.pushIrregularPartialArrayCleanup(begin, endOfInit, elementType,
elementAlign,
CGF.getDestroyer(dtorKind));
- cleanup = CGF.EHStack.stable_begin();
+ cast<EHCleanupScope>(*CGF.EHStack.find(CGF.EHStack.stable_begin()))
+ .AddAuxAllocas(allocaTracker.Take());
- // Otherwise, remember that we didn't need a cleanup.
- } else {
- dtorKind = QualType::DK_none;
+ CGF.DeferredDeactivationCleanupStack.push_back(
+ {CGF.EHStack.stable_begin(), dominatingIP});
}
llvm::Value *one = llvm::ConstantInt::get(CGF.SizeTy, 1);
@@ -671,9 +676,6 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, llvm::ArrayType *AType,
CGF.EmitBlock(endBB);
}
-
- // Leave the partial-array cleanup if we entered one.
- if (dtorKind) CGF.DeactivateCleanupBlock(cleanup, cleanupDominator);
}
//===----------------------------------------------------------------------===//
@@ -1374,9 +1376,8 @@ AggExprEmitter::VisitLambdaExpr(LambdaExpr *E) {
LValue SlotLV = CGF.MakeAddrLValue(Slot.getAddress(), E->getType());
// We'll need to enter cleanup scopes in case any of the element
- // initializers throws an exception.
- SmallVector<EHScopeStack::stable_iterator, 16> Cleanups;
- llvm::Instruction *CleanupDominator = nullptr;
+ // initializers throws an exception or contains a branch out of the expression.
+ CodeGenFunction::CleanupDeactivationScope scope(CGF);
CXXRecordDecl::field_iterator CurField = E->getLambdaClass()->field_begin();
for (LambdaExpr::const_capture_init_iterator i = E->capture_init_begin(),
@@ -1395,28 +1396,12 @@ AggExprEmitter::VisitLambdaExpr(LambdaExpr *E) {
if (QualType::DestructionKind DtorKind =
CurField->getType().isDestructedType()) {
assert(LV.isSimple());
- if (CGF.needsEHCleanup(DtorKind)) {
- if (!CleanupDominator)
- CleanupDominator = CGF.Builder.CreateAlignedLoad(
- CGF.Int8Ty,
- llvm::Constant::getNullValue(CGF.Int8PtrTy),
- CharUnits::One()); // placeholder
-
- CGF.pushDestroy(EHCleanup, LV.getAddress(CGF), CurField->getType(),
- CGF.getDestroyer(DtorKind), false);
- Cleanups.push_back(CGF.EHStack.stable_begin());
- }
+ if (DtorKind)
+ CGF.pushDestroyAndDeferDeactivation(
+ NormalAndEHCleanup, LV.getAddress(CGF), CurField->getType(),
+ CGF.getDestroyer(DtorKind), false);
}
}
-
- // Deactivate all the partial cleanups in reverse order, which
- // generally means popping them.
- for (unsigned i = Cleanups.size(); i != 0; --i)
- CGF.DeactivateCleanupBlock(Cleanups[i-1], CleanupDominator);
-
- // Destroy the placeholder if we made one.
- if (CleanupDominator)
- CleanupDominator->eraseFromParent();
}
void AggExprEmitter::VisitExprWithCleanups(ExprWithCleanups *E) {
@@ -1705,14 +1690,7 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr(
// We'll need to enter cleanup scopes in case any of the element
// initializers throws an exception.
SmallVector<EHScopeStack::stable_iterator, 16> cleanups;
- llvm::Instruction *cleanupDominator = nullptr;
- auto addCleanup = [&](const EHScopeStack::stable_iterator &cleanup) {
- cleanups.push_back(cleanup);
- if (!cleanupDominator) // create placeholder once needed
- cleanupDominator = CGF.Builder.CreateAlignedLoad(
- CGF.Int8Ty, llvm::Constant::getNullValue(CGF.Int8PtrTy),
- CharUnits::One());
- };
+ CodeGenFunction::CleanupDeactivationScope DeactivateCleanups(CGF);
unsigned curInitIndex = 0;
@@ -1735,10 +1713,8 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr(
CGF.EmitAggExpr(InitExprs[curInitIndex++], AggSlot);
if (QualType::DestructionKind dtorKind =
- Base.getType().isDestructedType()) {
- CGF.pushDestroy(dtorKind, V, Base.getType());
- addCleanup(CGF.EHStack.stable_begin());
- }
+ Base.getType().isDestructedType())
+ CGF.pushDestroyAndDeferDeactivation(dtorKind, V, Base.getType());
}
}
@@ -1815,10 +1791,10 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr(
if (QualType::DestructionKind dtorKind
= field->getType().isDestructedType()) {
assert(LV.isSimple());
- if (CGF.needsEHCleanup(dtorKind)) {
- CGF.pushDestroy(EHCleanup, LV.getAddress(CGF), field->getType(),
- CGF.getDestroyer(dtorKind), false);
- addCleanup(CGF.EHStack.stable_begin());
+ if (dtorKind) {
+ CGF.pushDestroyAndDeferDeactivation(
+ NormalAndEHCleanup, LV.getAddress(CGF), field->getType(),
+ CGF.getDestroyer(dtorKind), false);
pushedCleanup = true;
}
}
@@ -1831,17 +1807,6 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr(
if (GEP->use_empty())
GEP->eraseFromParent();
}
-
- // Deactivate all the partial cleanups in reverse order, which
- // generally means popping them.
- assert((cleanupDominator || cleanups.empty()) &&
- "Missing cleanupDominator before deactivating cleanup blocks");
- for (unsigned i = cleanups.size(); i != 0; --i)
- CGF.DeactivateCleanupBlock(cleanups[i-1], cleanupDominator);
-
- // Destroy the placeholder if we made one.
- if (cleanupDominator)
- cleanupDominator->eraseFromParent();
}
void AggExprEmitter::VisitArrayInitLoopExpr(const ArrayInitLoopExpr *E,
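
Both aggregate paths above replace the manually managed CleanupDominator
placeholder with a CleanupDeactivationScope. Illustrative user code for the
lambda-capture case (hypothetical, not from the patch's tests):

    struct S { S(); S(const S &); ~S(); };

    auto make(S a, S b) {
      // If copying 'b' into the closure throws, the already-copied 'a'
      // member must be destroyed; that cleanup is now deferred-deactivated
      // instead of being tracked through a placeholder instruction.
      return [a, b] {};
    }
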
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index 673ccef84d67..c18c36d3f3f3 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -1008,8 +1008,8 @@ void CodeGenFunction::EmitNewArrayInitializer(
const Expr *Init = E->getInitializer();
Address EndOfInit = Address::invalid();
QualType::DestructionKind DtorKind = ElementType.isDestructedType();
- EHScopeStack::stable_iterator Cleanup;
- llvm::Instruction *CleanupDominator = nullptr;
+ CleanupDeactivationScope deactivation(*this);
+ bool pushedCleanup = false;
CharUnits ElementSize = getContext().getTypeSizeInChars(ElementType);
CharUnits ElementAlign =
@@ -1105,19 +1105,24 @@ void CodeGenFunction::EmitNewArrayInitializer(
}
// Enter a partial-destruction Cleanup if necessary.
- if (needsEHCleanup(DtorKind)) {
+ if (DtorKind) {
+ AllocaTrackerRAII AllocaTracker(*this);
// In principle we could tell the Cleanup where we are more
// directly, but the control flow can get so varied here that it
// would actually be quite complex. Therefore we go through an
// alloca.
+ llvm::Instruction *DominatingIP =
+ Builder.CreateFlagLoad(llvm::ConstantInt::getNullValue(Int8PtrTy));
EndOfInit = CreateTempAlloca(BeginPtr.getType(), getPointerAlign(),
"array.init.end");
- CleanupDominator =
- Builder.CreateStore(BeginPtr.emitRawPointer(*this), EndOfInit);
pushIrregularPartialArrayCleanup(BeginPtr.emitRawPointer(*this),
EndOfInit, ElementType, ElementAlign,
getDestroyer(DtorKind));
- Cleanup = EHStack.stable_begin();
+ cast<EHCleanupScope>(*EHStack.find(EHStack.stable_begin()))
+ .AddAuxAllocas(AllocaTracker.Take());
+ DeferredDeactivationCleanupStack.push_back(
+ {EHStack.stable_begin(), DominatingIP});
+ pushedCleanup = true;
}
CharUnits StartAlign = CurPtr.getAlignment();
@@ -1164,9 +1169,6 @@ void CodeGenFunction::EmitNewArrayInitializer(
// initialization.
llvm::ConstantInt *ConstNum = dyn_cast<llvm::ConstantInt>(NumElements);
if (ConstNum && ConstNum->getZExtValue() <= InitListElements) {
- // If there was a Cleanup, deactivate it.
- if (CleanupDominator)
- DeactivateCleanupBlock(Cleanup, CleanupDominator);
return;
}
@@ -1281,13 +1283,14 @@ void CodeGenFunction::EmitNewArrayInitializer(
Builder.CreateStore(CurPtr.emitRawPointer(*this), EndOfInit);
// Enter a partial-destruction Cleanup if necessary.
- if (!CleanupDominator && needsEHCleanup(DtorKind)) {
- llvm::Value *BeginPtrRaw = BeginPtr.emitRawPointer(*this);
- llvm::Value *CurPtrRaw = CurPtr.emitRawPointer(*this);
- pushRegularPartialArrayCleanup(BeginPtrRaw, CurPtrRaw, ElementType,
+ if (!pushedCleanup && needsEHCleanup(DtorKind)) {
+ llvm::Instruction *DominatingIP =
+ Builder.CreateFlagLoad(llvm::ConstantInt::getNullValue(Int8PtrTy));
+ pushRegularPartialArrayCleanup(BeginPtr.emitRawPointer(*this),
+ CurPtr.emitRawPointer(*this), ElementType,
ElementAlign, getDestroyer(DtorKind));
- Cleanup = EHStack.stable_begin();
- CleanupDominator = Builder.CreateUnreachable();
+ DeferredDeactivationCleanupStack.push_back(
+ {EHStack.stable_begin(), DominatingIP});
}
// Emit the initializer into this element.
@@ -1295,10 +1298,7 @@ void CodeGenFunction::EmitNewArrayInitializer(
AggValueSlot::DoesNotOverlap);
// Leave the Cleanup if we entered one.
- if (CleanupDominator) {
- DeactivateCleanupBlock(Cleanup, CleanupDominator);
- CleanupDominator->eraseFromParent();
- }
+ deactivation.ForceDeactivate();
// Advance to the next element by adjusting the pointer type as necessary.
llvm::Value *NextPtr = Builder.CreateConstInBoundsGEP1_32(
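
In EmitNewArrayInitializer the partial-destruction cleanup is likewise routed
through the deactivation scope. The user-visible behavior it protects, as an
illustrative sketch:

    struct E { E(); ~E(); };

    E *make(unsigned n) {
      // If the constructor of element k throws, elements 0..k-1 are
      // destroyed by the partial-array cleanup before the exception
      // propagates.
      return new E[n];
    }
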
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 40a5cd20c3d7..af48e8d2b839 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2330,7 +2330,7 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
}
// Perform VLAT <-> VLST bitcast through memory.
- // TODO: since the llvm.experimental.vector.{insert,extract} intrinsics
+ // TODO: since the llvm.vector.{insert,extract} intrinsics
// require the element types of the vectors to be the same, we
// need to keep this around for bitcasts between VLAT <-> VLST where
// the element types of the vectors are not the same, until we figure
diff --git a/clang/lib/CodeGen/CGLoopInfo.cpp b/clang/lib/CodeGen/CGLoopInfo.cpp
index 72d1471021ac..0d4800b90a2f 100644
--- a/clang/lib/CodeGen/CGLoopInfo.cpp
+++ b/clang/lib/CodeGen/CGLoopInfo.cpp
@@ -673,8 +673,6 @@ void LoopInfoStack::push(BasicBlock *Header, clang::ASTContext &Ctx,
setPipelineDisabled(true);
break;
case LoopHintAttr::UnrollCount:
- setUnrollState(LoopAttributes::Disable);
- break;
case LoopHintAttr::UnrollAndJamCount:
case LoopHintAttr::VectorizeWidth:
case LoopHintAttr::InterleaveCount:
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 86a6ddd80cc1..87766a758311 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -91,6 +91,8 @@ CodeGenFunction::CodeGenFunction(CodeGenModule &cgm, bool suppressNewContext)
CodeGenFunction::~CodeGenFunction() {
assert(LifetimeExtendedCleanupStack.empty() && "failed to emit a cleanup");
+ assert(DeferredDeactivationCleanupStack.empty() &&
+ "missed to deactivate a cleanup");
if (getLangOpts().OpenMP && CurFn)
CGM.getOpenMPRuntime().functionFinished(*this);
@@ -346,6 +348,10 @@ static void EmitIfUsed(CodeGenFunction &CGF, llvm::BasicBlock *BB) {
void CodeGenFunction::FinishFunction(SourceLocation EndLoc) {
assert(BreakContinueStack.empty() &&
"mismatched push/pop in break/continue stack!");
+ assert(LifetimeExtendedCleanupStack.empty() &&
+ "mismatched push/pop of cleanups in EHStack!");
+ assert(DeferredDeactivationCleanupStack.empty() &&
+ "mismatched activate/deactivate of cleanups!");
bool OnlySimpleReturnStmts = NumSimpleReturnExprs > 0
&& NumSimpleReturnExprs == NumReturnExprs
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 33fb7a41912b..6e7417fc7f52 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -39,6 +39,7 @@
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/SanitizerStats.h"
@@ -670,6 +671,51 @@ public:
EHScopeStack EHStack;
llvm::SmallVector<char, 256> LifetimeExtendedCleanupStack;
+
+ // A stack of cleanups which were added to the EHStack but have to be
+ // deactivated later, before being popped or emitted. These are usually
+ // deactivated on exiting a `CleanupDeactivationScope`, for instance at the
+ // end of a full-expr.
+ //
+ // These are especially useful for correctly emitting cleanups while
+ // encountering branches out of an expression (through stmt-exprs or
+ // coroutine suspensions).
+ struct DeferredDeactivateCleanup {
+ EHScopeStack::stable_iterator Cleanup;
+ llvm::Instruction *DominatingIP;
+ };
+ llvm::SmallVector<DeferredDeactivateCleanup> DeferredDeactivationCleanupStack;
+
+ // Enters a new scope for capturing cleanups whose deactivation is deferred;
+ // all of them are deactivated once the scope is exited.
+ struct CleanupDeactivationScope {
+ CodeGenFunction &CGF;
+ size_t OldDeactivateCleanupStackSize;
+ bool Deactivated;
+ CleanupDeactivationScope(CodeGenFunction &CGF)
+ : CGF(CGF), OldDeactivateCleanupStackSize(
+ CGF.DeferredDeactivationCleanupStack.size()),
+ Deactivated(false) {}
+
+ void ForceDeactivate() {
+ assert(!Deactivated && "Deactivating already deactivated scope");
+ auto &Stack = CGF.DeferredDeactivationCleanupStack;
+ for (size_t I = Stack.size(); I > OldDeactivateCleanupStackSize; I--) {
+ CGF.DeactivateCleanupBlock(Stack[I - 1].Cleanup,
+ Stack[I - 1].DominatingIP);
+ Stack[I - 1].DominatingIP->eraseFromParent();
+ }
+ Stack.resize(OldDeactivateCleanupStackSize);
+ Deactivated = true;
+ }
+
+ ~CleanupDeactivationScope() {
+ if (Deactivated)
+ return;
+ ForceDeactivate();
+ }
+ };
+
llvm::SmallVector<const JumpDest *, 2> SEHTryEpilogueStack;
llvm::Instruction *CurrentFuncletPad = nullptr;
@@ -875,6 +921,19 @@ public:
new (Buffer + sizeof(Header) + sizeof(T)) RawAddress(ActiveFlag);
}
+ // Push a cleanup onto the EHStack and defer its deactivation. It is usually
+ // deactivated when exiting a `CleanupDeactivationScope` (for example, after
+ // a full expression).
+ template <class T, class... As>
+ void pushCleanupAndDeferDeactivation(CleanupKind Kind, As... A) {
+ // Placeholder dominating IP for this cleanup.
+ llvm::Instruction *DominatingIP =
+ Builder.CreateFlagLoad(llvm::Constant::getNullValue(Int8PtrTy));
+ EHStack.pushCleanup<T>(Kind, A...);
+ DeferredDeactivationCleanupStack.push_back(
+ {EHStack.stable_begin(), DominatingIP});
+ }
+
/// Set up the last cleanup that was pushed as a conditional
/// full-expression cleanup.
void initFullExprCleanup() {
@@ -898,7 +957,8 @@ public:
/// PopCleanupBlock - Will pop the cleanup entry on the stack and
/// process all branch fixups.
- void PopCleanupBlock(bool FallThroughIsBranchThrough = false);
+ void PopCleanupBlock(bool FallThroughIsBranchThrough = false,
+ bool ForDeactivation = false);
/// DeactivateCleanupBlock - Deactivates the given cleanup block.
/// The block cannot be reactivated. Pops it if it's the top of the
@@ -926,6 +986,7 @@ public:
class RunCleanupsScope {
EHScopeStack::stable_iterator CleanupStackDepth, OldCleanupScopeDepth;
size_t LifetimeExtendedCleanupStackSize;
+ CleanupDeactivationScope DeactivateCleanups;
bool OldDidCallStackSave;
protected:
bool PerformCleanup;
@@ -940,8 +1001,7 @@ public:
public:
/// Enter a new cleanup scope.
explicit RunCleanupsScope(CodeGenFunction &CGF)
- : PerformCleanup(true), CGF(CGF)
- {
+ : DeactivateCleanups(CGF), PerformCleanup(true), CGF(CGF) {
CleanupStackDepth = CGF.EHStack.stable_begin();
LifetimeExtendedCleanupStackSize =
CGF.LifetimeExtendedCleanupStack.size();
@@ -971,6 +1031,7 @@ public:
void ForceCleanup(std::initializer_list<llvm::Value**> ValuesToReload = {}) {
assert(PerformCleanup && "Already forced cleanup");
CGF.DidCallStackSave = OldDidCallStackSave;
+ DeactivateCleanups.ForceDeactivate();
CGF.PopCleanupBlocks(CleanupStackDepth, LifetimeExtendedCleanupStackSize,
ValuesToReload);
PerformCleanup = false;
@@ -2160,6 +2221,11 @@ public:
Address addr, QualType type);
void pushDestroy(CleanupKind kind, Address addr, QualType type,
Destroyer *destroyer, bool useEHCleanupForArray);
+ void pushDestroyAndDeferDeactivation(QualType::DestructionKind dtorKind,
+ Address addr, QualType type);
+ void pushDestroyAndDeferDeactivation(CleanupKind cleanupKind, Address addr,
+ QualType type, Destroyer *destroyer,
+ bool useEHCleanupForArray);
void pushLifetimeExtendedDestroy(CleanupKind kind, Address addr,
QualType type, Destroyer *destroyer,
bool useEHCleanupForArray);
@@ -2698,6 +2764,33 @@ public:
TBAAAccessInfo *TBAAInfo = nullptr);
LValue EmitLoadOfPointerLValue(Address Ptr, const PointerType *PtrTy);
+private:
+ struct AllocaTracker {
+ void Add(llvm::AllocaInst *I) { Allocas.push_back(I); }
+ llvm::SmallVector<llvm::AllocaInst *> Take() { return std::move(Allocas); }
+
+ private:
+ llvm::SmallVector<llvm::AllocaInst *> Allocas;
+ };
+ AllocaTracker *Allocas = nullptr;
+
+public:
+ // Captures all the allocas created while this RAII object is alive.
+ struct AllocaTrackerRAII {
+ AllocaTrackerRAII(CodeGenFunction &CGF)
+ : CGF(CGF), OldTracker(CGF.Allocas) {
+ CGF.Allocas = &Tracker;
+ }
+ ~AllocaTrackerRAII() { CGF.Allocas = OldTracker; }
+
+ llvm::SmallVector<llvm::AllocaInst *> Take() { return Tracker.Take(); }
+
+ private:
+ CodeGenFunction &CGF;
+ AllocaTracker *OldTracker;
+ AllocaTracker Tracker;
+ };
+
/// CreateTempAlloca - This creates an alloca and inserts it into the entry
/// block if \p ArraySize is nullptr, otherwise inserts it at the current
/// insertion point of the builder. The caller is responsible for setting an
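
A standalone sketch of the CleanupDeactivationScope contract introduced above,
reduced to plain C++ (the names and types are stand-ins, not the real
CodeGenFunction API): entries pushed while the scope is alive are deactivated
exactly once, in reverse order, when the scope exits or ForceDeactivate runs.

    #include <cassert>
    #include <cstdio>
    #include <vector>

    struct CGFState {
      std::vector<int> DeferredStack; // stands in for the deferred entries
    };

    struct DeactivationScope {
      CGFState &CGF;
      size_t OldSize;
      bool Deactivated = false;

      explicit DeactivationScope(CGFState &CGF)
          : CGF(CGF), OldSize(CGF.DeferredStack.size()) {}

      void forceDeactivate() {
        assert(!Deactivated && "deactivating an already deactivated scope");
        for (size_t I = CGF.DeferredStack.size(); I > OldSize; --I)
          std::printf("deactivate cleanup %d\n", CGF.DeferredStack[I - 1]);
        CGF.DeferredStack.resize(OldSize);
        Deactivated = true;
      }

      ~DeactivationScope() {
        if (!Deactivated)
          forceDeactivate();
      }
    };

    int main() {
      CGFState CGF;
      {
        DeactivationScope Scope(CGF); // e.g. one per full-expression
        CGF.DeferredStack.push_back(1);
        CGF.DeferredStack.push_back(2);
      } // prints "deactivate cleanup 2", then "deactivate cleanup 1"
      return CGF.DeferredStack.empty() ? 0 : 1;
    }
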
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index d085e735ecb4..c8898ce196c1 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -73,6 +73,7 @@
#include "llvm/TargetParser/RISCVISAInfo.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/TargetParser/X86TargetParser.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include <optional>
using namespace clang;
@@ -442,6 +443,11 @@ CodeGenModule::CodeGenModule(ASTContext &C,
}
ModuleNameHash = llvm::getUniqueInternalLinkagePostfix(Path);
}
+
+ // Record mregparm value now so it is visible through all of codegen.
+ if (Context.getTargetInfo().getTriple().getArch() == llvm::Triple::x86)
+ getModule().addModuleFlag(llvm::Module::Error, "NumRegisterParameters",
+ CodeGenOpts.NumRegisterParameters);
}
CodeGenModule::~CodeGenModule() {}
@@ -980,11 +986,6 @@ void CodeGenModule::Release() {
NMD->addOperand(MD);
}
- // Record mregparm value now so it is visible through rest of codegen.
- if (Context.getTargetInfo().getTriple().getArch() == llvm::Triple::x86)
- getModule().addModuleFlag(llvm::Module::Error, "NumRegisterParameters",
- CodeGenOpts.NumRegisterParameters);
-
if (CodeGenOpts.DwarfVersion) {
getModule().addModuleFlag(llvm::Module::Max, "Dwarf Version",
CodeGenOpts.DwarfVersion);
@@ -4781,6 +4782,10 @@ CodeGenModule::CreateRuntimeFunction(llvm::FunctionType *FTy, StringRef Name,
}
}
setDSOLocal(F);
+ // FIXME: We should use CodeGenModule::SetLLVMFunctionAttributes() instead
+ // of trying to approximate the attributes using the LLVM function
+ // signature. This requires revising the API of CreateRuntimeFunction().
+ markRegisterParameterAttributes(F);
}
}
diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
index d38a26940a3c..d47927745759 100644
--- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp
+++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
@@ -1131,9 +1131,15 @@ static bool isTrivialForMSVC(const CXXRecordDecl *RD, QualType Ty,
return false;
if (RD->hasNonTrivialCopyAssignment())
return false;
- for (const CXXConstructorDecl *Ctor : RD->ctors())
- if (Ctor->isUserProvided())
- return false;
+ for (const Decl *D : RD->decls()) {
+ if (auto *Ctor = dyn_cast<CXXConstructorDecl>(D)) {
+ if (Ctor->isUserProvided())
+ return false;
+ } else if (auto *Template = dyn_cast<FunctionTemplateDecl>(D)) {
+ if (isa<CXXConstructorDecl>(Template->getTemplatedDecl()))
+ return false;
+ }
+ }
if (RD->hasNonTrivialDestructor())
return false;
return true;
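
The isTrivialForMSVC change walks decls() rather than ctors() because
constructor templates do not appear among ctors(). An illustrative class that
is now treated as non-trivial for the MSVC return convention (hypothetical
example, not from the patch):

    struct S {
      // A constructor template: found by the dyn_cast<FunctionTemplateDecl>
      // branch above, but invisible to RD->ctors().
      template <typename T> S(T) {}
      int x = 0;
    };

    // With this change, S is returned indirectly (via sret) under the
    // Microsoft ABI instead of in registers.
    S identity(S s) { return s; }
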
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 76b7b9fdfb4f..114320f5d314 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -361,6 +361,7 @@ phases::ID Driver::getFinalPhase(const DerivedArgList &DAL,
(PhaseArg = DAL.getLastArg(options::OPT_rewrite_legacy_objc)) ||
(PhaseArg = DAL.getLastArg(options::OPT__migrate)) ||
(PhaseArg = DAL.getLastArg(options::OPT__analyze)) ||
+ (PhaseArg = DAL.getLastArg(options::OPT_emit_cir)) ||
(PhaseArg = DAL.getLastArg(options::OPT_emit_ast))) {
FinalPhase = phases::Compile;
@@ -4799,6 +4800,8 @@ Action *Driver::ConstructPhaseAction(
return C.MakeAction<MigrateJobAction>(Input, types::TY_Remap);
if (Args.hasArg(options::OPT_emit_ast))
return C.MakeAction<CompileJobAction>(Input, types::TY_AST);
+ if (Args.hasArg(options::OPT_emit_cir))
+ return C.MakeAction<CompileJobAction>(Input, types::TY_CIR);
if (Args.hasArg(options::OPT_module_file_info))
return C.MakeAction<CompileJobAction>(Input, types::TY_ModuleFile);
if (Args.hasArg(options::OPT_verify_pch))
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 341d6202a9ca..0e86bc07e0ea 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -1316,14 +1316,19 @@ bool ToolChain::isFastMathRuntimeAvailable(const ArgList &Args,
// (to keep the linker options consistent with gcc and clang itself).
if (Default && !isOptimizationLevelFast(Args)) {
// Check if -ffast-math or -funsafe-math.
- Arg *A =
- Args.getLastArg(options::OPT_ffast_math, options::OPT_fno_fast_math,
- options::OPT_funsafe_math_optimizations,
- options::OPT_fno_unsafe_math_optimizations);
+ Arg *A = Args.getLastArg(
+ options::OPT_ffast_math, options::OPT_fno_fast_math,
+ options::OPT_funsafe_math_optimizations,
+ options::OPT_fno_unsafe_math_optimizations, options::OPT_ffp_model_EQ);
if (!A || A->getOption().getID() == options::OPT_fno_fast_math ||
A->getOption().getID() == options::OPT_fno_unsafe_math_optimizations)
Default = false;
+ if (A && A->getOption().getID() == options::OPT_ffp_model_EQ) {
+ StringRef Model = A->getValue();
+ if (Model != "fast")
+ Default = false;
+ }
}
// Whatever decision came as a result of the above implicit settings, either
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 651a2b5aac36..250dc078edf2 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -2743,13 +2743,11 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
StringRef FPExceptionBehavior = "";
// -ffp-eval-method options: double, extended, source
StringRef FPEvalMethod = "";
- const llvm::DenormalMode DefaultDenormalFPMath =
+ llvm::DenormalMode DenormalFPMath =
TC.getDefaultDenormalModeForType(Args, JA);
- const llvm::DenormalMode DefaultDenormalFP32Math =
+ llvm::DenormalMode DenormalFP32Math =
TC.getDefaultDenormalModeForType(Args, JA, &llvm::APFloat::IEEEsingle());
- llvm::DenormalMode DenormalFPMath = DefaultDenormalFPMath;
- llvm::DenormalMode DenormalFP32Math = DefaultDenormalFP32Math;
// CUDA and HIP don't rely on the frontend to pass an ffp-contract option.
// If one wasn't given by the user, don't pass it here.
StringRef FPContract;
@@ -2899,11 +2897,6 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
SignedZeros = true;
// -fno_fast_math restores default denormal and fpcontract handling
FPContract = "on";
- DenormalFPMath = llvm::DenormalMode::getIEEE();
-
- // FIXME: The target may have picked a non-IEEE default mode here based on
- // -cl-denorms-are-zero. Should the target consider -fp-model interaction?
- DenormalFP32Math = llvm::DenormalMode::getIEEE();
StringRef Val = A->getValue();
if (OFastEnabled && !Val.equals("fast")) {
@@ -3122,9 +3115,6 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
TrappingMath = true;
FPExceptionBehavior = "strict";
- // The target may have opted to flush by default, so force IEEE.
- DenormalFPMath = llvm::DenormalMode::getIEEE();
- DenormalFP32Math = llvm::DenormalMode::getIEEE();
if (!JA.isDeviceOffloading(Action::OFK_Cuda) &&
!JA.isOffloading(Action::OFK_HIP)) {
if (LastSeenFfpContractOption != "") {
@@ -3154,9 +3144,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
ReciprocalMath = false;
ApproxFunc = false;
SignedZeros = true;
- // -fno_fast_math restores default denormal and fpcontract handling
- DenormalFPMath = DefaultDenormalFPMath;
- DenormalFP32Math = llvm::DenormalMode::getIEEE();
+ // -fno_fast_math restores default fpcontract handling
if (!JA.isDeviceOffloading(Action::OFK_Cuda) &&
!JA.isOffloading(Action::OFK_HIP)) {
if (LastSeenFfpContractOption != "") {
@@ -3171,8 +3159,6 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
// subsequent options conflict then emit warning diagnostic.
if (HonorINFs && HonorNaNs && !AssociativeMath && !ReciprocalMath &&
SignedZeros && TrappingMath && RoundingFPMath && !ApproxFunc &&
- DenormalFPMath == llvm::DenormalMode::getIEEE() &&
- DenormalFP32Math == llvm::DenormalMode::getIEEE() &&
FPContract.equals("off"))
// OK: Current Arg doesn't conflict with -ffp-model=strict
;
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index b65b96db16bd..fec11c7e716f 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1191,118 +1191,10 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs,
return true;
}
-/// Determines if --whole-archive is active in the list of arguments.
-static bool isWholeArchivePresent(const ArgList &Args) {
- bool WholeArchiveActive = false;
- for (auto *Arg : Args.filtered(options::OPT_Wl_COMMA)) {
- if (Arg) {
- for (StringRef ArgValue : Arg->getValues()) {
- if (ArgValue == "--whole-archive")
- WholeArchiveActive = true;
- if (ArgValue == "--no-whole-archive")
- WholeArchiveActive = false;
- }
- }
- }
-
- return WholeArchiveActive;
-}
-
-/// Determine if driver is invoked to create a shared object library (-static)
-static bool isSharedLinkage(const ArgList &Args) {
- return Args.hasArg(options::OPT_shared);
-}
-
-/// Determine if driver is invoked to create a static object library (-shared)
-static bool isStaticLinkage(const ArgList &Args) {
- return Args.hasArg(options::OPT_static);
-}
-
-/// Add Fortran runtime libs for MSVC
-static void addFortranRuntimeLibsMSVC(const ArgList &Args,
- llvm::opt::ArgStringList &CmdArgs) {
- unsigned RTOptionID = options::OPT__SLASH_MT;
- if (auto *rtl = Args.getLastArg(options::OPT_fms_runtime_lib_EQ)) {
- RTOptionID = llvm::StringSwitch<unsigned>(rtl->getValue())
- .Case("static", options::OPT__SLASH_MT)
- .Case("static_dbg", options::OPT__SLASH_MTd)
- .Case("dll", options::OPT__SLASH_MD)
- .Case("dll_dbg", options::OPT__SLASH_MDd)
- .Default(options::OPT__SLASH_MT);
- }
- switch (RTOptionID) {
- case options::OPT__SLASH_MT:
- CmdArgs.push_back("/WHOLEARCHIVE:Fortran_main.static.lib");
- break;
- case options::OPT__SLASH_MTd:
- CmdArgs.push_back("/WHOLEARCHIVE:Fortran_main.static_dbg.lib");
- break;
- case options::OPT__SLASH_MD:
- CmdArgs.push_back("/WHOLEARCHIVE:Fortran_main.dynamic.lib");
- break;
- case options::OPT__SLASH_MDd:
- CmdArgs.push_back("/WHOLEARCHIVE:Fortran_main.dynamic_dbg.lib");
- break;
- }
-}
-
-// Add FortranMain runtime lib
-static void addFortranMain(const ToolChain &TC, const ArgList &Args,
- llvm::opt::ArgStringList &CmdArgs) {
- // 0. Shared-library linkage
- // If we are attempting to link a library, we should not add
- // -lFortran_main.a to the link line, as the `main` symbol is not
- // required for a library and should also be provided by one of
- // the translation units of the code that this shared library
- // will be linked against eventually.
- if (isSharedLinkage(Args) || isStaticLinkage(Args)) {
- return;
- }
-
- // 1. MSVC
- if (TC.getTriple().isKnownWindowsMSVCEnvironment()) {
- addFortranRuntimeLibsMSVC(Args, CmdArgs);
- return;
- }
-
- // 2. GNU and similar
- const Driver &D = TC.getDriver();
- const char *FortranMainLinkFlag = "-lFortran_main";
-
- // Warn if the user added `-lFortran_main` - this library is an implementation
- // detail of Flang and should be handled automaticaly by the driver.
- for (const char *arg : CmdArgs) {
- if (strncmp(arg, FortranMainLinkFlag, strlen(FortranMainLinkFlag)) == 0)
- D.Diag(diag::warn_drv_deprecated_custom)
- << FortranMainLinkFlag
- << "see the Flang driver documentation for correct usage";
- }
-
- // The --whole-archive option needs to be part of the link line to make
- // sure that the main() function from Fortran_main.a is pulled in by the
- // linker. However, it shouldn't be used if it's already active.
- // TODO: Find an equivalent of `--whole-archive` for Darwin and AIX.
- if (!isWholeArchivePresent(Args) && !TC.getTriple().isMacOSX() &&
- !TC.getTriple().isOSAIX()) {
- CmdArgs.push_back("--whole-archive");
- CmdArgs.push_back(FortranMainLinkFlag);
- CmdArgs.push_back("--no-whole-archive");
- return;
- }
-
- CmdArgs.push_back(FortranMainLinkFlag);
-}
-
/// Add Fortran runtime libs
void tools::addFortranRuntimeLibs(const ToolChain &TC, const ArgList &Args,
llvm::opt::ArgStringList &CmdArgs) {
- // 1. Link FortranMain
- // FortranMain depends on FortranRuntime, so needs to be listed first. If
- // -fno-fortran-main has been passed, skip linking Fortran_main.a
- if (!Args.hasArg(options::OPT_no_fortran_main))
- addFortranMain(TC, Args, CmdArgs);
-
- // 2. Link FortranRuntime and FortranDecimal
+ // Link FortranRuntime and FortranDecimal
// These are handled earlier on Windows by telling the frontend driver to
// add the correct libraries to link against as dependents in the object
// file.
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 6d93c1f3d703..8955b9fb653c 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -282,7 +282,6 @@ static void processVSRuntimeLibrary(const ToolChain &TC, const ArgList &Args,
assert(TC.getTriple().isKnownWindowsMSVCEnvironment() &&
"can only add VS runtime library on Windows!");
// if -fno-fortran-main has been passed, skip linking Fortran_main.a
- bool LinkFortranMain = !Args.hasArg(options::OPT_no_fortran_main);
if (TC.getTriple().isKnownWindowsMSVCEnvironment()) {
CmdArgs.push_back(Args.MakeArgString(
"--dependent-lib=" + TC.getCompilerRTBasename(Args, "builtins")));
@@ -300,8 +299,6 @@ static void processVSRuntimeLibrary(const ToolChain &TC, const ArgList &Args,
case options::OPT__SLASH_MT:
CmdArgs.push_back("-D_MT");
CmdArgs.push_back("--dependent-lib=libcmt");
- if (LinkFortranMain)
- CmdArgs.push_back("--dependent-lib=Fortran_main.static.lib");
CmdArgs.push_back("--dependent-lib=FortranRuntime.static.lib");
CmdArgs.push_back("--dependent-lib=FortranDecimal.static.lib");
break;
@@ -309,8 +306,6 @@ static void processVSRuntimeLibrary(const ToolChain &TC, const ArgList &Args,
CmdArgs.push_back("-D_MT");
CmdArgs.push_back("-D_DEBUG");
CmdArgs.push_back("--dependent-lib=libcmtd");
- if (LinkFortranMain)
- CmdArgs.push_back("--dependent-lib=Fortran_main.static_dbg.lib");
CmdArgs.push_back("--dependent-lib=FortranRuntime.static_dbg.lib");
CmdArgs.push_back("--dependent-lib=FortranDecimal.static_dbg.lib");
break;
@@ -318,8 +313,6 @@ static void processVSRuntimeLibrary(const ToolChain &TC, const ArgList &Args,
CmdArgs.push_back("-D_MT");
CmdArgs.push_back("-D_DLL");
CmdArgs.push_back("--dependent-lib=msvcrt");
- if (LinkFortranMain)
- CmdArgs.push_back("--dependent-lib=Fortran_main.dynamic.lib");
CmdArgs.push_back("--dependent-lib=FortranRuntime.dynamic.lib");
CmdArgs.push_back("--dependent-lib=FortranDecimal.dynamic.lib");
break;
@@ -328,8 +321,6 @@ static void processVSRuntimeLibrary(const ToolChain &TC, const ArgList &Args,
CmdArgs.push_back("-D_DEBUG");
CmdArgs.push_back("-D_DLL");
CmdArgs.push_back("--dependent-lib=msvcrtd");
- if (LinkFortranMain)
- CmdArgs.push_back("--dependent-lib=Fortran_main.dynamic_dbg.lib");
CmdArgs.push_back("--dependent-lib=FortranRuntime.dynamic_dbg.lib");
CmdArgs.push_back("--dependent-lib=FortranDecimal.dynamic_dbg.lib");
break;
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 8236051e30c4..8312abc36039 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -2549,6 +2549,7 @@ static const auto &getFrontendActionTable() {
{frontend::DumpTokens, OPT_dump_tokens},
{frontend::EmitAssembly, OPT_S},
{frontend::EmitBC, OPT_emit_llvm_bc},
+ {frontend::EmitCIR, OPT_emit_cir},
{frontend::EmitHTML, OPT_emit_html},
{frontend::EmitLLVM, OPT_emit_llvm},
{frontend::EmitLLVMOnly, OPT_emit_llvm_only},
@@ -2891,6 +2892,8 @@ static bool ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args,
if (Opts.ProgramAction != frontend::GenerateModule && Opts.IsSystemModule)
Diags.Report(diag::err_drv_argument_only_allowed_with) << "-fsystem-module"
<< "-emit-module";
+ if (Args.hasArg(OPT_fclangir) || Args.hasArg(OPT_emit_cir))
+ Opts.UseClangIRPipeline = true;
if (Args.hasArg(OPT_aux_target_cpu))
Opts.AuxTargetCPU = std::string(Args.getLastArgValue(OPT_aux_target_cpu));
@@ -4337,6 +4340,7 @@ static bool isStrictlyPreprocessorAction(frontend::ActionKind Action) {
case frontend::ASTView:
case frontend::EmitAssembly:
case frontend::EmitBC:
+ case frontend::EmitCIR:
case frontend::EmitHTML:
case frontend::EmitLLVM:
case frontend::EmitLLVMOnly:
diff --git a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
index f85f0365616f..7476b1076d10 100644
--- a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
+++ b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
@@ -53,6 +53,8 @@ CreateFrontendBaseAction(CompilerInstance &CI) {
case DumpTokens: return std::make_unique<DumpTokensAction>();
case EmitAssembly: return std::make_unique<EmitAssemblyAction>();
case EmitBC: return std::make_unique<EmitBCAction>();
+ case EmitCIR:
+ llvm_unreachable("CIR suppport not built into clang");
case EmitHTML: return std::make_unique<HTMLPrintAction>();
case EmitLLVM: return std::make_unique<EmitLLVMAction>();
case EmitLLVMOnly: return std::make_unique<EmitLLVMOnlyAction>();
diff --git a/clang/lib/Lex/Pragma.cpp b/clang/lib/Lex/Pragma.cpp
index 499813f8ab7d..10f0ab7180e6 100644
--- a/clang/lib/Lex/Pragma.cpp
+++ b/clang/lib/Lex/Pragma.cpp
@@ -1444,7 +1444,8 @@ struct PragmaWarningHandler : public PragmaHandler {
.Case("once", PPCallbacks::PWS_Once)
.Case("suppress", PPCallbacks::PWS_Suppress)
.Default(-1);
- if ((SpecifierValid = SpecifierInt != -1))
+ SpecifierValid = SpecifierInt != -1;
+ if (SpecifierValid)
Specifier =
static_cast<PPCallbacks::PragmaWarningSpecifier>(SpecifierInt);
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index e33113ab9c4c..cf8840c63024 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3164,13 +3164,20 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
const Expr *Arg = TheCall->getArg(0);
const auto *TyA = Arg->getType()->getAs<VectorType>();
- if (!TyA) {
+
+ QualType ElTy;
+ if (TyA)
+ ElTy = TyA->getElementType();
+ else if (Arg->getType()->isSizelessVectorType())
+ ElTy = Arg->getType()->getSizelessVectorEltType(Context);
+
+ if (ElTy.isNull()) {
Diag(Arg->getBeginLoc(), diag::err_builtin_invalid_arg_type)
<< 1 << /* vector ty*/ 4 << Arg->getType();
return ExprError();
}
- TheCall->setType(TyA->getElementType());
+ TheCall->setType(ElTy);
break;
}
@@ -3186,12 +3193,20 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
const Expr *Arg = TheCall->getArg(0);
const auto *TyA = Arg->getType()->getAs<VectorType>();
- if (!TyA || !TyA->getElementType()->isIntegerType()) {
+
+ QualType ElTy;
+ if (TyA)
+ ElTy = TyA->getElementType();
+ else if (Arg->getType()->isSizelessVectorType())
+ ElTy = Arg->getType()->getSizelessVectorEltType(Context);
+
+ if (ElTy.isNull() || !ElTy->isIntegerType()) {
Diag(Arg->getBeginLoc(), diag::err_builtin_invalid_arg_type)
<< 1 << /* vector of integers */ 6 << Arg->getType();
return ExprError();
}
- TheCall->setType(TyA->getElementType());
+
+ TheCall->setType(ElTy);
break;
}
@@ -12544,6 +12559,17 @@ CheckPrintfHandler::checkFormatExpr(const analyze_printf::PrintfSpecifier &FS,
return true;
}
+ // Diagnose attempts to use '%P' with ObjC object types, which will result
+ // in dumping raw class data (like the isa pointer), not the actual data.
+ if (FS.getConversionSpecifier().getKind() == ConversionSpecifier::PArg &&
+ ExprTy->isObjCObjectPointerType()) {
+ const CharSourceRange &CSR =
+ getSpecifierRange(StartSpecifier, SpecifierLen);
+ EmitFormatDiagnostic(S.PDiag(diag::warn_format_P_with_objc_pointer),
+ E->getExprLoc(), false, CSR);
+ return true;
+ }
+
ArgType::MatchKind ImplicitMatch = ArgType::NoMatch;
ArgType::MatchKind Match = AT.matchesType(S.Context, ExprTy);
ArgType::MatchKind OrigMatch = Match;
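
The first two hunks let these builtins accept sizeless (scalable) vector
operands by deriving the element type through getSizelessVectorEltType. A
sketch, assuming an AArch64 target with SVE enabled:

    #include <arm_sve.h>

    int reduce_or(svint32_t v) {
      // Previously rejected because svint32_t is not a VectorType; the
      // call's result type is now the element type (int32_t).
      return __builtin_reduce_or(v);
    }

The last hunk is independent: it warns when the '%P' conversion is given an
Objective-C object pointer, since that would dump raw class data rather than
the object's contents.
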
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 1a71a37c0732..338b0ec1e099 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -12054,11 +12054,17 @@ bool Sema::isStdInitializerList(QualType Ty, QualType *Element) {
Template = Specialization->getSpecializedTemplate();
Arguments = Specialization->getTemplateArgs().data();
- } else if (const TemplateSpecializationType *TST =
- Ty->getAs<TemplateSpecializationType>()) {
- Template = dyn_cast_or_null<ClassTemplateDecl>(
- TST->getTemplateName().getAsTemplateDecl());
- Arguments = TST->template_arguments().begin();
+ } else {
+ const TemplateSpecializationType *TST = nullptr;
+ if (auto *ICN = Ty->getAs<InjectedClassNameType>())
+ TST = ICN->getInjectedTST();
+ else
+ TST = Ty->getAs<TemplateSpecializationType>();
+ if (TST) {
+ Template = dyn_cast_or_null<ClassTemplateDecl>(
+ TST->getTemplateName().getAsTemplateDecl());
+ Arguments = TST->template_arguments().begin();
+ }
}
if (!Template)
return false;
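
isStdInitializerList now looks through InjectedClassNameType before falling
back to TemplateSpecializationType. A hypothetical reduction of when that
matters: inside the template's own body, the class's name denotes the
injected class name rather than a written specialization.

    namespace std {
    template <class T> class initializer_list {
      const T *Ptr;
      decltype(sizeof 0) Len;

    public:
      // Here 'initializer_list' is the injected class name; queries asking
      // whether this type is std::initializer_list must look through
      // InjectedClassNameType to reach the underlying specialization.
      void assign(initializer_list Other);
    };
    } // namespace std
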
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index 003a157990d3..7d9eaf672046 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -8340,8 +8340,17 @@ void Sema::checkInitializerLifetime(const InitializedEntity &Entity,
<< Entity.getType()->isReferenceType() << CLE->getInitializer() << 2
<< DiagRange;
} else {
- Diag(DiagLoc, diag::warn_ret_local_temp_addr_ref)
- << Entity.getType()->isReferenceType() << DiagRange;
+ // P2748R5: Disallow Binding a Returned Glvalue to a Temporary.
+ // [stmt.return]/p6: In a function whose return type is a reference,
+ // other than an invented function for std::is_convertible ([meta.rel]),
+ // a return statement that binds the returned reference to a temporary
+ // expression ([class.temporary]) is ill-formed.
+ if (getLangOpts().CPlusPlus26 && Entity.getType()->isReferenceType())
+ Diag(DiagLoc, diag::err_ret_local_temp_ref)
+ << Entity.getType()->isReferenceType() << DiagRange;
+ else
+ Diag(DiagLoc, diag::warn_ret_local_temp_addr_ref)
+ << Entity.getType()->isReferenceType() << DiagRange;
}
break;
}
@@ -10790,8 +10799,6 @@ QualType Sema::DeduceTemplateSpecializationFromInitializer(
// FIXME: Perform "exact type" matching first, per CWG discussion?
// Or implement this via an implied 'T(T) -> T' deduction guide?
- // FIXME: Do we need/want a std::initializer_list<T> special case?
-
// Look up deduction guides, including those synthesized from constructors.
//
// C++1z [over.match.class.deduct]p1:
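
The P2748R5 hunk upgrades one specific diagnostic from a warning to an error
in C++26 mode. Illustrative:

    // Under -std=c++26 this is ill-formed (err_ret_local_temp_ref); in
    // earlier language modes it keeps the existing
    // warn_ret_local_temp_addr_ref warning.
    const int &f() {
      return 42; // binds the returned reference to a materialized temporary
    }
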
diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp
index 9d44c22c8ddc..1c84830b6ddd 100644
--- a/clang/lib/Sema/SemaStmtAttr.cpp
+++ b/clang/lib/Sema/SemaStmtAttr.cpp
@@ -109,16 +109,14 @@ static Attr *handleLoopHintAttr(Sema &S, Stmt *St, const ParsedAttr &A,
SetHints(LoopHintAttr::Unroll, LoopHintAttr::Disable);
} else if (PragmaName == "unroll") {
// #pragma unroll N
- if (ValueExpr && !ValueExpr->isValueDependent()) {
- llvm::APSInt ValueAPS;
- ExprResult R = S.VerifyIntegerConstantExpression(ValueExpr, &ValueAPS);
- assert(!R.isInvalid() && "unroll count value must be a valid value, it's "
- "should be checked in Sema::CheckLoopHintExpr");
- (void)R;
- // The values of 0 and 1 block any unrolling of the loop.
- if (ValueAPS.isZero() || ValueAPS.isOne())
- SetHints(LoopHintAttr::UnrollCount, LoopHintAttr::Disable);
- else
+ if (ValueExpr) {
+ if (!ValueExpr->isValueDependent()) {
+ auto Value = ValueExpr->EvaluateKnownConstInt(S.getASTContext());
+ if (Value.isZero() || Value.isOne())
+ SetHints(LoopHintAttr::Unroll, LoopHintAttr::Disable);
+ else
+ SetHints(LoopHintAttr::UnrollCount, LoopHintAttr::Numeric);
+ } else
SetHints(LoopHintAttr::UnrollCount, LoopHintAttr::Numeric);
} else
SetHints(LoopHintAttr::Unroll, LoopHintAttr::Enable);
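
Together with the CGLoopInfo and SemaTemplateInstantiate hunks, this moves
the "an unroll count of 0 or 1 disables unrolling" rule into Sema, so it also
applies after instantiating a value-dependent count. Illustrative:

    void f(int *a) {
    #pragma unroll 1
      // A count of 0 or 1 blocks unrolling: the hint is recorded as
      // 'Unroll Disable' rather than 'UnrollCount Numeric'.
      for (int i = 0; i < 8; ++i)
        a[i] = i;
    }
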
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index 98d5c7cb3a8a..3a9fd906b7af 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -2151,13 +2151,25 @@ TemplateInstantiator::TransformLoopHintAttr(const LoopHintAttr *LH) {
// Generate error if there is a problem with the value.
if (getSema().CheckLoopHintExpr(TransformedExpr, LH->getLocation(),
- LH->getOption() == LoopHintAttr::UnrollCount))
+ LH->getSemanticSpelling() ==
+ LoopHintAttr::Pragma_unroll))
return LH;
+ LoopHintAttr::OptionType Option = LH->getOption();
+ LoopHintAttr::LoopHintState State = LH->getState();
+
+ llvm::APSInt ValueAPS =
+ TransformedExpr->EvaluateKnownConstInt(getSema().getASTContext());
+ // The values of 0 and 1 block any unrolling of the loop.
+ if (ValueAPS.isZero() || ValueAPS.isOne()) {
+ Option = LoopHintAttr::Unroll;
+ State = LoopHintAttr::Disable;
+ }
+
// Create new LoopHintValueAttr with integral expression in place of the
// non-type template parameter.
- return LoopHintAttr::CreateImplicit(getSema().Context, LH->getOption(),
- LH->getState(), TransformedExpr, *LH);
+ return LoopHintAttr::CreateImplicit(getSema().Context, Option, State,
+ TransformedExpr, *LH);
}
const NoInlineAttr *TemplateInstantiator::TransformStmtNoInlineAttr(
const Stmt *OrigS, const Stmt *InstS, const NoInlineAttr *A) {
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
index 741f33676158..ae494de58da3 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
@@ -53,10 +53,11 @@ public:
bool shouldVisitTemplateInstantiations() const { return true; }
bool shouldVisitImplicitCode() const { return false; }
- bool TraverseDecl(Decl *D) {
- if (isa<ClassTemplateDecl>(D) && isRefType(safeGetName(D)))
+ bool TraverseClassTemplateDecl(ClassTemplateDecl *Decl) {
+ if (isRefType(safeGetName(Decl)))
return true;
- return RecursiveASTVisitor<LocalVisitor>::TraverseDecl(D);
+ return RecursiveASTVisitor<LocalVisitor>::TraverseClassTemplateDecl(
+ Decl);
}
bool VisitCallExpr(const CallExpr *CE) {
diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c
index a5951158ed0e..207da5fe8126 100644
--- a/clang/test/AST/Interp/c.c
+++ b/clang/test/AST/Interp/c.c
@@ -263,3 +263,10 @@ const int *p = &b;
const __int128 K = (__int128)(int*)0;
const unsigned __int128 KU = (unsigned __int128)(int*)0;
#endif
+
+
+int test3(void) {
+ int a[2];
+ a[0] = test3; // all-error {{incompatible pointer to integer conversion assigning to 'int' from 'int (void)'}}
+ return 0;
+}
diff --git a/clang/test/AST/Interp/functions.cpp b/clang/test/AST/Interp/functions.cpp
index f9bb5d53634e..a5bb9f1a19aa 100644
--- a/clang/test/AST/Interp/functions.cpp
+++ b/clang/test/AST/Interp/functions.cpp
@@ -601,3 +601,19 @@ namespace FromIntegral {
// both-warning {{variable length arrays}}
#endif
}
+
+namespace {
+ template <typename T> using id = T;
+ template <typename T>
+ constexpr void g() {
+ constexpr id<void (T)> f;
+ }
+
+ static_assert((g<int>(), true), "");
+}
+
+namespace {
+ /// The InitListExpr here is of void type.
+ void bir [[clang::annotate("B", {1, 2, 3, 4})]] (); // both-error {{'annotate' attribute requires parameter 1 to be a constant expression}} \
+ // both-note {{subexpression not valid in a constant expression}}
+}
diff --git a/clang/test/AST/Interp/opencl.cl b/clang/test/AST/Interp/opencl.cl
new file mode 100644
index 000000000000..b9ba4f8b9b55
--- /dev/null
+++ b/clang/test/AST/Interp/opencl.cl
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -fsyntax-only -verify=ref,both %s
+// RUN: %clang_cc1 -fsyntax-only -verify=expected,both %s -fexperimental-new-constant-interpreter
+
+// both-no-diagnostics
+
+typedef int int2 __attribute__((ext_vector_type(2)));
+typedef int int3 __attribute__((ext_vector_type(3)));
+typedef int int4 __attribute__((ext_vector_type(4)));
+typedef int int8 __attribute__((ext_vector_type(8)));
+typedef int int16 __attribute__((ext_vector_type(16)));
+
+void foo(int3 arg1, int8 arg2) {
+ int4 auto1;
+ int16 *auto2;
+ int auto3;
+ int2 auto4;
+ struct S *incomplete1;
+
+ int res1[vec_step(arg1) == 4 ? 1 : -1];
+ int res2[vec_step(arg2) == 8 ? 1 : -1];
+ int res3[vec_step(auto1) == 4 ? 1 : -1];
+ int res4[vec_step(*auto2) == 16 ? 1 : -1];
+ int res5[vec_step(auto3) == 1 ? 1 : -1];
+ int res6[vec_step(auto4) == 2 ? 1 : -1];
+ int res7[vec_step(int2) == 2 ? 1 : -1];
+ int res8[vec_step(int3) == 4 ? 1 : -1];
+ int res9[vec_step(int4) == 4 ? 1 : -1];
+ int res10[vec_step(int8) == 8 ? 1 : -1];
+ int res11[vec_step(int16) == 16 ? 1 : -1];
+ int res12[vec_step(void) == 1 ? 1 : -1];
+}
+
diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp
index 9307b9c090c5..771e5adfca34 100644
--- a/clang/test/AST/Interp/records.cpp
+++ b/clang/test/AST/Interp/records.cpp
@@ -90,8 +90,7 @@ struct Ints2 {
int a = 10;
int b;
};
-constexpr Ints2 ints22; // both-error {{without a user-provided default constructor}} \
- // expected-error {{must be initialized by a constant expression}}
+constexpr Ints2 ints22; // both-error {{without a user-provided default constructor}}
constexpr Ints2 I2 = Ints2{12, 25};
static_assert(I2.a == 12, "");
@@ -1031,6 +1030,12 @@ namespace ParenInit {
// both-note {{required by 'constinit' specifier}} \
// both-note {{reference to temporary is not a constant expression}} \
// both-note {{temporary created here}}
+
+
+ /// Initializing an array.
+ constexpr void bar(int i, int j) {
+ int arr[4](i, j);
+ }
}
#endif
@@ -1409,3 +1414,29 @@ namespace VirtualBases {
static_assert((X*)(Y1*)&z != (X*)(Y2*)&z, "");
}
}
+
+namespace ZeroInit {
+ struct S3 {
+ S3() = default;
+ S3(const S3&) = default;
+ S3(S3&&) = default;
+ constexpr S3(int n) : n(n) {}
+ int n;
+ };
+ constexpr S3 s3d; // both-error {{default initialization of an object of const type 'const S3' without a user-provided default constructor}}
+ static_assert(s3d.n == 0, "");
+}
+
+namespace {
+#if __cplusplus >= 202002L
+ struct C {
+ template <unsigned N> constexpr C(const char (&)[N]) : n(N) {}
+ unsigned n;
+ };
+ template <C c>
+ constexpr auto operator""_c() { return c.n; }
+
+ constexpr auto waldo = "abc"_c;
+ static_assert(waldo == 4, "");
+#endif
+}
diff --git a/clang/test/AST/ast-dump-pragma-unroll.cpp b/clang/test/AST/ast-dump-pragma-unroll.cpp
new file mode 100644
index 000000000000..f9c254b803ff
--- /dev/null
+++ b/clang/test/AST/ast-dump-pragma-unroll.cpp
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -ast-dump %s | FileCheck %s
+
+using size_t = unsigned long long;
+
+// CHECK: LoopHintAttr {{.*}} Implicit unroll UnrollCount Numeric
+// CHECK: LoopHintAttr {{.*}} Implicit unroll UnrollCount Numeric
+// CHECK: LoopHintAttr {{.*}} Implicit unroll Unroll Disable
+// CHECK: LoopHintAttr {{.*}} Implicit unroll Unroll Disable
+template <bool Flag>
+int value_dependent(int n) {
+ constexpr int N = 100;
+ auto init = [=]() { return Flag ? n : 0UL; };
+ auto cond = [=](size_t ix) { return Flag ? ix != 0 : ix < 10; };
+ auto iter = [=](size_t ix) {
+ return Flag ? ix & ~(1ULL << __builtin_clzll(ix)) : ix + 1;
+ };
+
+#pragma unroll Flag ? 1 : N
+ for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+ n *= n;
+ }
+#pragma unroll Flag ? 0 : N
+ for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+ n *= n;
+ }
+ return n;
+}
+
+void test_value_dependent(int n) {
+ value_dependent<true>(n);
+}
diff --git a/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp b/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp
index cf740516db6f..5ac55d269dce 100644
--- a/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp
+++ b/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp
@@ -1846,6 +1846,42 @@ int main()
// CHECK-NEXT: "kind": "VarTemplateDecl",
// CHECK-NEXT: "name": "is_const_v"
// CHECK-NEXT: }
+// CHECK-NEXT: ],
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "kind": "TemplateArgument",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "const _Ty"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "QualType",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "const _Ty"
+// CHECK-NEXT: },
+// CHECK-NEXT: "qualifiers": "const",
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TemplateTypeParmType",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "_Ty"
+// CHECK-NEXT: },
+// CHECK-NEXT: "isDependent": true,
+// CHECK-NEXT: "isInstantiationDependent": true,
+// CHECK-NEXT: "depth": 0,
+// CHECK-NEXT: "index": 0,
+// CHECK-NEXT: "decl": {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TemplateTypeParmDecl",
+// CHECK-NEXT: "name": "_Ty"
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: }
// CHECK-NEXT: ]
// CHECK-NEXT: }
// CHECK-NEXT: ]
@@ -1900,6 +1936,32 @@ int main()
// CHECK-NEXT: "kind": "VarTemplateDecl",
// CHECK-NEXT: "name": "is_reference_v"
// CHECK-NEXT: }
+// CHECK-NEXT: ],
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "kind": "TemplateArgument",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "_Ty"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TemplateTypeParmType",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "_Ty"
+// CHECK-NEXT: },
+// CHECK-NEXT: "isDependent": true,
+// CHECK-NEXT: "isInstantiationDependent": true,
+// CHECK-NEXT: "depth": 0,
+// CHECK-NEXT: "index": 0,
+// CHECK-NEXT: "decl": {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TemplateTypeParmDecl",
+// CHECK-NEXT: "name": "_Ty"
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: }
// CHECK-NEXT: ]
// CHECK-NEXT: }
// CHECK-NEXT: ]
@@ -2565,6 +2627,32 @@ int main()
// CHECK-NEXT: "kind": "VarTemplateDecl",
// CHECK-NEXT: "name": "is_function_v"
// CHECK-NEXT: }
+// CHECK-NEXT: ],
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "kind": "TemplateArgument",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "_Ty1"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TemplateTypeParmType",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "_Ty1"
+// CHECK-NEXT: },
+// CHECK-NEXT: "isDependent": true,
+// CHECK-NEXT: "isInstantiationDependent": true,
+// CHECK-NEXT: "depth": 0,
+// CHECK-NEXT: "index": 0,
+// CHECK-NEXT: "decl": {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TemplateTypeParmDecl",
+// CHECK-NEXT: "name": "_Ty1"
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: }
// CHECK-NEXT: ]
// CHECK-NEXT: }
// CHECK-NEXT: ]
diff --git a/clang/test/AST/ast-dump-templates.cpp b/clang/test/AST/ast-dump-templates.cpp
index d25ef36dd4d3..9fcafbcbcc46 100644
--- a/clang/test/AST/ast-dump-templates.cpp
+++ b/clang/test/AST/ast-dump-templates.cpp
@@ -104,3 +104,17 @@ void (*q)() = f<>;
// CHECK1: template<> void f<0L>()
// CHECK1: template<> void f<0U>()
}
+
+namespace test6 {
+template <class D>
+constexpr bool C = true;
+
+template <class Key>
+void func() {
+ C<Key>;
+// DUMP: UnresolvedLookupExpr {{.*}} '<dependent type>' lvalue (no ADL) = 'C'
+// DUMP-NEXT: `-TemplateArgument type 'Key'
+// DUMP-NEXT: `-TemplateTypeParmType {{.*}} 'Key' dependent depth 0 index 0
+// DUMP-NEXT: `-TemplateTypeParm {{.*}} 'Key'
+}
+}
diff --git a/clang/test/Analysis/Checkers/WebKit/call-args-regression-traverse-decl-crash.cpp b/clang/test/Analysis/Checkers/WebKit/call-args-regression-traverse-decl-crash.cpp
new file mode 100644
index 000000000000..3d8e822025f6
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/call-args-regression-traverse-decl-crash.cpp
@@ -0,0 +1,7 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UncountedCallArgsChecker -verify %s
+// expected-no-diagnostics
+
+template <class Class> struct T;
+template <template <class> class Class, class Type>
+struct T<Class<Type>>
+{ };
diff --git a/clang/test/CXX/drs/cwg2149.cpp b/clang/test/CXX/drs/cwg2149.cpp
index d0f8cb2dfc0a..8416e42cbd69 100644
--- a/clang/test/CXX/drs/cwg2149.cpp
+++ b/clang/test/CXX/drs/cwg2149.cpp
@@ -11,7 +11,7 @@
// cxx98-error@-1 {{variadic macros are a C99 feature}}
#endif
-namespace cwg2149 { // cwg2149: 3.1 drafting 2024-04
+namespace cwg2149 { // cwg2149: 3.1
#if __cplusplus <= 201103L
struct X { int i, j, k; };
#else
diff --git a/clang/test/CXX/drs/cwg650.cpp b/clang/test/CXX/drs/cwg650.cpp
index dcb844095b05..33ea179986e3 100644
--- a/clang/test/CXX/drs/cwg650.cpp
+++ b/clang/test/CXX/drs/cwg650.cpp
@@ -4,7 +4,7 @@
// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
-// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// We don't test this in C++26 and later because of P2748R5 "Disallow Binding a Returned Glvalue to a Temporary".
#if __cplusplus == 199711L
#define NOTHROW throw()
diff --git a/clang/test/CXX/drs/dr20xx.cpp b/clang/test/CXX/drs/dr20xx.cpp
index 291a77e0cc71..9797097acce7 100644
--- a/clang/test/CXX/drs/dr20xx.cpp
+++ b/clang/test/CXX/drs/dr20xx.cpp
@@ -90,7 +90,7 @@ namespace cwg2026 { // cwg2026: 11
}
}
-namespace cwg2049 { // cwg2049: 18 drafting P2308R1
+namespace cwg2049 { // cwg2049: 18
#if __cplusplus >= 202302L
template <int* x = {}> struct X {};
X<> a;
diff --git a/clang/test/CXX/drs/dr21xx.cpp b/clang/test/CXX/drs/dr21xx.cpp
index 4fab10c279aa..082deb42e4fa 100644
--- a/clang/test/CXX/drs/dr21xx.cpp
+++ b/clang/test/CXX/drs/dr21xx.cpp
@@ -175,6 +175,8 @@ void foo() {
}
}
+// cwg2149 is in cwg2149.cpp
+
namespace cwg2157 { // cwg2157: 11
#if __cplusplus >= 201103L
enum E : int;
diff --git a/clang/test/CXX/drs/dr24xx.cpp b/clang/test/CXX/drs/dr24xx.cpp
index 5ffaebda68c1..9f876cd87083 100644
--- a/clang/test/CXX/drs/dr24xx.cpp
+++ b/clang/test/CXX/drs/dr24xx.cpp
@@ -45,7 +45,7 @@ void fallthrough(int n) {
#endif
}
-namespace cwg2450 { // cwg2450: 18 review P2308R1
+namespace cwg2450 { // cwg2450: 18
#if __cplusplus >= 202302L
struct S {int a;};
template <S s>
@@ -59,7 +59,7 @@ f<{.a= 0}>();
#endif
}
-namespace cwg2459 { // cwg2459: 18 drafting P2308R1
+namespace cwg2459 { // cwg2459: 18
#if __cplusplus >= 202302L
struct A {
constexpr A(float) {}
diff --git a/clang/test/CXX/drs/dr25xx.cpp b/clang/test/CXX/drs/dr25xx.cpp
index 62b2a0a088cc..8bca58f44944 100644
--- a/clang/test/CXX/drs/dr25xx.cpp
+++ b/clang/test/CXX/drs/dr25xx.cpp
@@ -130,12 +130,14 @@ struct D3 : B {
#endif
#if __cplusplus >= 202302L
-namespace cwg2561 { // cwg2561: 18 review 2023-11-09
+namespace cwg2561 { // cwg2561: no tentatively ready 2024-03-18
struct C {
constexpr C(auto) { }
};
void foo() {
constexpr auto b = [](this C) { return 1; };
+  // FIXME: the closure type shouldn't have a conversion function to function
+  // pointer, because an explicit object parameter is present.
constexpr int (*fp)(C) = b;
static_assert(fp(1) == 1);
static_assert((&decltype(b)::operator())(1) == 1);
diff --git a/clang/test/CXX/drs/dr28xx.cpp b/clang/test/CXX/drs/dr28xx.cpp
index 4d9b0c76758d..be35d366bdd6 100644
--- a/clang/test/CXX/drs/dr28xx.cpp
+++ b/clang/test/CXX/drs/dr28xx.cpp
@@ -10,7 +10,15 @@
// expected-no-diagnostics
#endif
-namespace cwg2847 { // cwg2847: 19
+namespace cwg2819 { // cwg2819: 19 tentatively ready 2023-12-01
+#if __cpp_constexpr >= 202306L
+ constexpr void* p = nullptr;
+ constexpr int* q = static_cast<int*>(p);
+ static_assert(q == nullptr);
+#endif
+}
+
+namespace cwg2847 { // cwg2847: 19 review 2024-03-01
#if __cplusplus >= 202002L
@@ -59,7 +67,7 @@ void B<int>::g() requires true;
} // namespace cwg2847
-namespace cwg2858 { // cwg2858: 19
+namespace cwg2858 { // cwg2858: 19 tentatively ready 2024-04-05
#if __cplusplus > 202302L
diff --git a/clang/test/CXX/expr/expr.const/p5-26.cpp b/clang/test/CXX/expr/expr.const/p5-26.cpp
index 3624b1e5a3e3..7513b11c09aa 100644
--- a/clang/test/CXX/expr/expr.const/p5-26.cpp
+++ b/clang/test/CXX/expr/expr.const/p5-26.cpp
@@ -37,3 +37,10 @@ void err() {
// cxx23-note {{cast from 'void *' is not allowed in a constant expression in C++ standards before C++2c}} \
// cxx26-note {{cast from 'void *' is not allowed in a constant expression because the pointed object type 'T' is not similar to the target type 'S'}}
}
+
+int* p;
+constexpr int** pp = &p;
+constexpr void* vp = pp;
+constexpr auto cvp = static_cast<const int* volatile*>(vp);
+// cxx23-error@-1 {{constant expression}}
+// cxx23-note@-2 {{cast from 'void *' is not allowed in a constant expression}}
diff --git a/clang/test/CXX/stmt.stmt/stmt.return/p6.cpp b/clang/test/CXX/stmt.stmt/stmt.return/p6.cpp
new file mode 100644
index 000000000000..c192b0c8112a
--- /dev/null
+++ b/clang/test/CXX/stmt.stmt/stmt.return/p6.cpp
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 -std=c++26 -fsyntax-only -verify %s
+
+auto&& f1() {
+ return 42; // expected-error{{returning reference to local temporary object}}
+}
+const double& f2() {
+ static int x = 42;
+ return x; // expected-error{{returning reference to local temporary object}}
+}
+auto&& id(auto&& r) {
+ return static_cast<decltype(r)&&>(r);
+}
+auto&& f3() {
+ return id(42); // OK, but probably a bug
+}
+
+void unevaluated() {
+ using a = decltype ([] () -> const int & {
+ const int &i = 0; // expected-note {{binding reference variable 'i' here}}
+ return i; // expected-error{{returning reference to local temporary object}}
+} ());
+}
+
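+// P2748R5 bans only the *return* of such a binding; the conversion itself is
+// still valid, so these traits keep reporting true.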
+static_assert(__is_convertible(int, const int &));
+static_assert(__is_nothrow_convertible(int, const int &));
diff --git a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
index f50eaf371028..0f2c5b2546fa 100644
--- a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
+++ b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
@@ -426,11 +426,12 @@ bfloat16_t test_vcvth_bf16_f32(float32_t a) {
// CHECK-NEXT: [[__REINT_I:%.*]] = alloca bfloat, align 2
// CHECK-NEXT: [[__REINT1_I:%.*]] = alloca i32, align 4
// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[__REINT_I]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[__REINT_I]], align 2
-// CHECK-NEXT: [[SHL_I:%.*]] = shl i32 [[TMP1]], 16
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[__REINT_I]], align 2
+// CHECK-NEXT: [[CONV_I:%.*]] = sext i16 [[TMP0]] to i32
+// CHECK-NEXT: [[SHL_I:%.*]] = shl i32 [[CONV_I]], 16
// CHECK-NEXT: store i32 [[SHL_I]], ptr [[__REINT1_I]], align 4
-// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[__REINT1_I]], align 4
-// CHECK-NEXT: ret float [[TMP3]]
+// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[__REINT1_I]], align 4
+// CHECK-NEXT: ret float [[TMP1]]
//
float32_t test_vcvtah_f32_bf16(bfloat16_t a) {
return vcvtah_f32_bf16(a);
diff --git a/clang/test/CodeGen/arm64-microsoft-arguments.cpp b/clang/test/CodeGen/arm64-microsoft-arguments.cpp
index e8309888dcfe..85472645acb3 100644
--- a/clang/test/CodeGen/arm64-microsoft-arguments.cpp
+++ b/clang/test/CodeGen/arm64-microsoft-arguments.cpp
@@ -201,3 +201,18 @@ S11 f11() {
S11 x;
return func11(x);
}
+
+// GH86384
+// Pass and return object with template constructor (pass directly,
+// return indirectly).
+// CHECK: define dso_local void @"?f12@@YA?AUS12@@XZ"(ptr dead_on_unwind inreg noalias writable sret(%struct.S12) align 4 {{.*}})
+// CHECK: call void @"?func12@@YA?AUS12@@U1@@Z"(ptr dead_on_unwind inreg writable sret(%struct.S12) align 4 {{.*}}, i64 {{.*}})
+struct S12 {
+ template<typename T> S12(T*) {}
+ int x;
+};
+S12 func12(S12 x);
+S12 f12() {
+ S12 x((int*)0);
+ return func12(x);
+}
diff --git a/clang/test/CodeGen/builtins-reduction-math.c b/clang/test/CodeGen/builtins-reduction-math.c
index 34f39cea5265..acafe9222d59 100644
--- a/clang/test/CodeGen/builtins-reduction-math.c
+++ b/clang/test/CodeGen/builtins-reduction-math.c
@@ -1,5 +1,8 @@
// RUN: %clang_cc1 -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -O1 -triple aarch64 -target-feature +sve %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=SVE %s
+
typedef float float4 __attribute__((ext_vector_type(4)));
typedef short int si8 __attribute__((ext_vector_type(8)));
typedef unsigned int u4 __attribute__((ext_vector_type(4)));
@@ -134,3 +137,53 @@ void test_builtin_reduce_and(si8 vi1, u4 vu1) {
// CHECK-NEXT: call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[VU1]])
unsigned r3 = __builtin_reduce_and(vu1);
}
+
+#if defined(__ARM_FEATURE_SVE)
+#include <arm_sve.h>
+
+void test_builtin_reduce_SVE(int a, unsigned long long b, short c, float d) {
+ // SVE-LABEL: void @test_builtin_reduce_SVE(
+
+ svint32_t vec_a = svdup_s32(a);
+ svuint64_t vec_b = svdup_u64(b);
+ svint16_t vec_c1 = svdup_s16(c);
+ svuint16_t vec_c2 = svdup_u16(c);
+ svfloat32_t vec_d = svdup_f32(d);
+
+ // SVE: [[VF1:%.+]] = load <vscale x 4 x i32>, ptr %vec_a
+ // SVE-NEXT: call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[VF1]])
+ int r1 = __builtin_reduce_add(vec_a);
+
+ // SVE: [[VF2:%.+]] = load <vscale x 4 x i32>, ptr %vec_a
+ // SVE-NEXT: call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> [[VF2]])
+ int r2 = __builtin_reduce_mul(vec_a);
+
+ // SVE: [[VF3:%.+]] = load <vscale x 2 x i64>, ptr %vec_b
+ // SVE-NEXT: call i64 @llvm.vector.reduce.xor.nxv2i64(<vscale x 2 x i64> [[VF3]])
+ long long r3 = __builtin_reduce_xor(vec_b);
+
+ // SVE: [[VF4:%.+]] = load <vscale x 2 x i64>, ptr %vec_b
+ // SVE-NEXT: call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[VF4]])
+ long long r4 = __builtin_reduce_or(vec_b);
+
+ // SVE: [[VF5:%.+]] = load <vscale x 2 x i64>, ptr %vec_b
+ // SVE-NEXT: call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> [[VF5]])
+ long long r5 = __builtin_reduce_and(vec_b);
+
+ // SVE: [[VF6:%.+]] = load <vscale x 8 x i16>, ptr %vec_c1
+ // SVE-NEXT: call i16 @llvm.vector.reduce.smax.nxv8i16(<vscale x 8 x i16> [[VF6]])
+ short r6 = __builtin_reduce_max(vec_c1);
+
+ // SVE: [[VF7:%.+]] = load <vscale x 8 x i16>, ptr %vec_c2
+ // SVE-NEXT: call i16 @llvm.vector.reduce.umin.nxv8i16(<vscale x 8 x i16> [[VF7]])
+ unsigned short r7 = __builtin_reduce_min(vec_c2);
+
+ // SVE: [[VF8:%.+]] = load <vscale x 4 x float>, ptr %vec_d
+ // SVE-NEXT: call float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> [[VF8]])
+ float r8 = __builtin_reduce_max(vec_d);
+
+ // SVE: [[VF9:%.+]] = load <vscale x 4 x float>, ptr %vec_d
+ // SVE-NEXT: call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[VF9]])
+ float r9 = __builtin_reduce_min(vec_d);
+}
+#endif
diff --git a/clang/test/CodeGen/regparm-flag.c b/clang/test/CodeGen/regparm-flag.c
index c35b53cd4e19..d888c1e344c0 100644
--- a/clang/test/CodeGen/regparm-flag.c
+++ b/clang/test/CodeGen/regparm-flag.c
@@ -1,4 +1,8 @@
// RUN: %clang_cc1 -triple i386-unknown-unknown -mregparm 4 %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple i386-unknown-unknown -fsanitize=array-bounds %s -emit-llvm -o - | FileCheck %s --check-prefix=RUNTIME0
+// RUN: %clang_cc1 -triple i386-unknown-unknown -mregparm 1 -fsanitize=array-bounds %s -emit-llvm -o - | FileCheck %s --check-prefix=RUNTIME1
+// RUN: %clang_cc1 -triple i386-unknown-unknown -mregparm 2 -fsanitize=array-bounds %s -emit-llvm -o - | FileCheck %s --check-prefix=RUNTIME2
+// RUN: %clang_cc1 -triple i386-unknown-unknown -mregparm 3 -fsanitize=array-bounds %s -emit-llvm -o - | FileCheck %s --check-prefix=RUNTIME2
void f1(int a, int b, int c, int d,
int e, int f, int g, int h);
@@ -13,7 +17,21 @@ void f0(void) {
f2(1, 2);
}
+struct has_array {
+ int a;
+ int b[4];
+ int c;
+};
+
+int access(struct has_array *p, int index)
+{
+ return p->b[index];
+}
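+
+// The indexed access above is instrumented by -fsanitize=array-bounds; the
+// RUNTIME prefixes below check how -mregparm propagates inreg attributes to
+// the declaration of the ubsan out-of-bounds handler.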
+
// CHECK: declare void @f1(i32 inreg noundef, i32 inreg noundef, i32 inreg noundef, i32 inreg noundef,
// CHECK: i32 noundef, i32 noundef, i32 noundef, i32 noundef)
// CHECK: declare void @f2(i32 noundef, i32 noundef)
+// RUNTIME0: declare void @__ubsan_handle_out_of_bounds_abort(ptr, i32)
+// RUNTIME1: declare void @__ubsan_handle_out_of_bounds_abort(ptr inreg, i32)
+// RUNTIME2: declare void @__ubsan_handle_out_of_bounds_abort(ptr inreg, i32 inreg)
diff --git a/clang/test/CodeGenCXX/blocks.cpp b/clang/test/CodeGenCXX/blocks.cpp
index eaab1890dfc4..afe078890553 100644
--- a/clang/test/CodeGenCXX/blocks.cpp
+++ b/clang/test/CodeGenCXX/blocks.cpp
@@ -149,8 +149,8 @@ namespace test5 {
// CHECK-NEXT: [[X:%.*]] = alloca [[A:%.*]], align 4
// CHECK-NEXT: [[B:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[BLOCK:%.*]] = alloca [[BLOCK_T:.*]], align 8
- // CHECK-NEXT: [[CLEANUP_ACTIVE:%.*]] = alloca i1
// CHECK-NEXT: [[COND_CLEANUP_SAVE:%.*]] = alloca ptr, align 8
+ // CHECK-NEXT: [[CLEANUP_ACTIVE:%.*]] = alloca i1
// CHECK-NEXT: [[T0:%.*]] = zext i1
// CHECK-NEXT: store i8 [[T0]], ptr [[COND]], align 1
// CHECK-NEXT: call void @_ZN5test51AC1Ev(ptr {{[^,]*}} [[X]])
@@ -162,8 +162,8 @@ namespace test5 {
// CHECK-NOT: br
// CHECK: [[CAPTURE:%.*]] = getelementptr inbounds [[BLOCK_T]], ptr [[BLOCK]], i32 0, i32 5
// CHECK-NEXT: call void @_ZN5test51AC1ERKS0_(ptr {{[^,]*}} [[CAPTURE]], ptr noundef nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) [[X]])
- // CHECK-NEXT: store i1 true, ptr [[CLEANUP_ACTIVE]]
// CHECK-NEXT: store ptr [[CAPTURE]], ptr [[COND_CLEANUP_SAVE]], align 8
+ // CHECK-NEXT: store i1 true, ptr [[CLEANUP_ACTIVE]]
// CHECK-NEXT: br label
// CHECK: br label
// CHECK: phi
diff --git a/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp b/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp
new file mode 100644
index 000000000000..ac466ee5bba4
--- /dev/null
+++ b/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp
@@ -0,0 +1,522 @@
+// RUN: %clang_cc1 --std=c++20 -fexceptions -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck -check-prefixes=EH %s
+// RUN: %clang_cc1 --std=c++20 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck -check-prefixes=NOEH,CHECK %s
+
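+// Printy has a non-trivial destructor, so every partially constructed
+// aggregate below needs cleanups; the tests verify the branches emitted when
+// control flow (return/break/goto/continue) escapes a statement expression
+// mid-initialization.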
+struct Printy {
+ Printy(const char *name) : name(name) {}
+ ~Printy() {}
+ const char *name;
+};
+
+int foo() { return 2; }
+
+struct Printies {
+ Printy a;
+ Printy b;
+ Printy c;
+};
+
+void ParenInit() {
+ // CHECK-LABEL: define dso_local void @_Z9ParenInitv()
+ // CHECK: [[CLEANUP_DEST:%.+]] = alloca i32, align 4
+ Printies ps(Printy("a"),
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ ({
+ if (foo()) return;
+ // CHECK: if.then:
+ // CHECK-NEXT: store i32 1, ptr [[CLEANUP_DEST]], align 4
+ // CHECK-NEXT: br label %cleanup
+ Printy("b");
+ // CHECK: if.end:
+ // CHECK-NEXT: call void @_ZN6PrintyC1EPKc
+ }),
+ ({
+ if (foo()) return;
+ // CHECK: if.then{{.*}}:
+ // CHECK-NEXT: store i32 1, ptr [[CLEANUP_DEST]], align 4
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev
+ // CHECK-NEXT: br label %cleanup
+ Printy("c");
+ // CHECK: if.end{{.*}}:
+ // CHECK-NEXT: call void @_ZN6PrintyC1EPKc
+ // CHECK-NEXT: call void @_ZN8PrintiesD1Ev
+ // CHECK-NEXT: br label %return
+ }));
+ // CHECK: cleanup:
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev
+ // CHECK-NEXT: br label %return
+}
+
+void break_in_stmt_expr() {
+ // Verify that the "break" in "if.then".calls dtor before jumping to "for.end".
+
+ // CHECK-LABEL: define dso_local void @_Z18break_in_stmt_exprv()
+ Printies p{Printy("a"),
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ ({
+ for (;;) {
+ Printies ps{
+ Printy("b"),
+ // CHECK: for.cond:
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ ({
+ if (foo()) {
+ break;
+ // CHECK: if.then:
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev
+ // CHECK-NEXT: br label %for.end
+ }
+ Printy("c");
+ // CHECK: if.end:
+ // CHECK-NEXT: call void @_ZN6PrintyC1EPKc
+ }),
+ Printy("d")};
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ // CHECK-NEXT: call void @_ZN8PrintiesD1Ev
+ // CHECK-NEXT: br label %for.cond
+ }
+ Printy("e");
+ // CHECK: for.end:
+ // CHECK-NEXT: call void @_ZN6PrintyC1EPKc
+ }),
+ Printy("f")};
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ // CHECK-NEXT: call void @_ZN8PrintiesD1Ev
+}
+
+void goto_in_stmt_expr() {
+ // Verify that:
+  // - branch fixups for deactivated normal cleanups are generated correctly.
+
+ // CHECK-LABEL: define dso_local void @_Z17goto_in_stmt_exprv()
+ // CHECK: [[CLEANUP_DEST_SLOT:%cleanup.dest.slot.*]] = alloca i32, align 4
+ {
+ Printies p1{Printy("a"), // CHECK: call void @_ZN6PrintyC1EPKc
+ ({
+ {
+ Printies p2{Printy("b"),
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ ({
+ if (foo() == 1) {
+ goto in;
+ // CHECK: if.then:
+ // CHECK-NEXT: store i32 2, ptr [[CLEANUP_DEST_SLOT]], align 4
+ // CHECK-NEXT: br label %[[CLEANUP1:.+]]
+ }
+ if (foo() == 2) {
+ goto out;
+ // CHECK: if.then{{.*}}:
+ // CHECK-NEXT: store i32 3, ptr [[CLEANUP_DEST_SLOT]], align 4
+ // CHECK-NEXT: br label %[[CLEANUP1]]
+ }
+ Printy("c");
+ // CHECK: if.end{{.*}}:
+ // CHECK-NEXT: call void @_ZN6PrintyC1EPKc
+ }),
+ Printy("d")};
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ // CHECK-NEXT: call void @_ZN8PrintiesD1Ev
+ // CHECK-NEXT: br label %in
+
+ }
+ in:
+ Printy("e");
+ // CHECK: in: ; preds = %if.end{{.*}}, %[[CLEANUP1]]
+ // CHECK-NEXT: call void @_ZN6PrintyC1EPKc
+ }),
+ Printy("f")};
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ // CHECK-NEXT: call void @_ZN8PrintiesD1Ev
+ // CHECK-NEXT: br label %out
+ }
+out:
+ return;
+ // CHECK: out:
+ // CHECK-NEXT: ret void
+
+ // CHECK: [[CLEANUP1]]: ; preds = %if.then{{.*}}, %if.then
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev
+ // CHECK-NEXT: %cleanup.dest = load i32, ptr [[CLEANUP_DEST_SLOT]], align 4
+ // CHECK-NEXT: switch i32 %cleanup.dest, label %[[CLEANUP2:.+]] [
+ // CHECK-NEXT: i32 2, label %in
+ // CHECK-NEXT: ]
+
+ // CHECK: [[CLEANUP2]]: ; preds = %[[CLEANUP1]]
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev
+ // CHECK-NEXT: %cleanup.dest{{.*}} = load i32, ptr [[CLEANUP_DEST_SLOT]], align 4
+ // CHECK-NEXT: switch i32 %cleanup.dest{{.*}}, label %unreachable [
+ // CHECK-NEXT: i32 3, label %out
+ // CHECK-NEXT: ]
+}
+
+void ArrayInit() {
+ // Printy arr[4] = {ctorA, ctorB, stmt-exprC, stmt-exprD};
+ // Verify that:
+ // - We do the necessary stores for array cleanups (endOfInit and last constructed element).
+ // - We update the array init element correctly for ctorA, ctorB and stmt-exprC.
+ // - stmt-exprC and stmt-exprD share the array body dtor code (see %cleanup).
+
+ // CHECK-LABEL: define dso_local void @_Z9ArrayInitv()
+ // CHECK: %arrayinit.endOfInit = alloca ptr, align 8
+ // CHECK: %cleanup.dest.slot = alloca i32, align 4
+ // CHECK: %arrayinit.begin = getelementptr inbounds [4 x %struct.Printy], ptr %arr, i64 0, i64 0
+ // CHECK: store ptr %arrayinit.begin, ptr %arrayinit.endOfInit, align 8
+ Printy arr[4] = {
+ Printy("a"),
+ // CHECK: call void @_ZN6PrintyC1EPKc(ptr noundef nonnull align 8 dereferenceable(8) %arrayinit.begin, ptr noundef @.str)
+ // CHECK: [[ARRAYINIT_ELEMENT1:%.+]] = getelementptr inbounds %struct.Printy, ptr %arrayinit.begin, i64 1
+ // CHECK: store ptr [[ARRAYINIT_ELEMENT1]], ptr %arrayinit.endOfInit, align 8
+ Printy("b"),
+ // CHECK: call void @_ZN6PrintyC1EPKc(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_ELEMENT1]], ptr noundef @.str.1)
+ // CHECK: [[ARRAYINIT_ELEMENT2:%.+]] = getelementptr inbounds %struct.Printy, ptr [[ARRAYINIT_ELEMENT1]], i64 1
+ // CHECK: store ptr [[ARRAYINIT_ELEMENT2]], ptr %arrayinit.endOfInit, align 8
+ ({
+ // CHECK: br i1 {{.*}}, label %if.then, label %if.end
+ if (foo()) {
+ return;
+ // CHECK: if.then:
+ // CHECK-NEXT: store i32 1, ptr %cleanup.dest.slot, align 4
+ // CHECK-NEXT: br label %cleanup
+ }
+ // CHECK: if.end:
+ Printy("c");
+ // CHECK-NEXT: call void @_ZN6PrintyC1EPKc
+ // CHECK-NEXT: %arrayinit.element2 = getelementptr inbounds %struct.Printy, ptr %arrayinit.element1, i64 1
+ // CHECK-NEXT: store ptr %arrayinit.element2, ptr %arrayinit.endOfInit, align 8
+ }),
+ ({
+ // CHECK: br i1 {{%.+}} label %[[IF_THEN2:.+]], label %[[IF_END2:.+]]
+ if (foo()) {
+ return;
+ // CHECK: [[IF_THEN2]]:
+ // CHECK-NEXT: store i32 1, ptr %cleanup.dest.slot, align 4
+ // CHECK-NEXT: br label %cleanup
+ }
+ // CHECK: [[IF_END2]]:
+ Printy("d");
+ // CHECK-NEXT: call void @_ZN6PrintyC1EPKc
+ // CHECK-NEXT: %array.begin = getelementptr inbounds [4 x %struct.Printy], ptr %arr, i32 0, i32 0
+ // CHECK-NEXT: %0 = getelementptr inbounds %struct.Printy, ptr %array.begin, i64 4
+ // CHECK-NEXT: br label %[[ARRAY_DESTROY_BODY1:.+]]
+ }),
+ };
+
+ // CHECK: [[ARRAY_DESTROY_BODY1]]:
+ // CHECK-NEXT: %arraydestroy.elementPast{{.*}} = phi ptr [ %0, %[[IF_END2]] ], [ %arraydestroy.element{{.*}}, %[[ARRAY_DESTROY_BODY1]] ]
+ // CHECK-NEXT: %arraydestroy.element{{.*}} = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast{{.*}}, i64 -1
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev
+ // CHECK-NEXT: %arraydestroy.done{{.*}} = icmp eq ptr %arraydestroy.element{{.*}}, %array.begin
+ // CHECK-NEXT: br i1 %arraydestroy.done{{.*}}, label %[[ARRAY_DESTROY_DONE1:.+]], label %[[ARRAY_DESTROY_BODY1]]
+
+ // CHECK: [[ARRAY_DESTROY_DONE1]]:
+ // CHECK-NEXT: ret void
+
+ // CHECK: cleanup:
+ // CHECK-NEXT: %1 = load ptr, ptr %arrayinit.endOfInit, align 8
+ // CHECK-NEXT: %arraydestroy.isempty = icmp eq ptr %arrayinit.begin, %1
+ // CHECK-NEXT: br i1 %arraydestroy.isempty, label %[[ARRAY_DESTROY_DONE2:.+]], label %[[ARRAY_DESTROY_BODY2:.+]]
+
+ // CHECK: [[ARRAY_DESTROY_BODY2]]:
+ // CHECK-NEXT: %arraydestroy.elementPast = phi ptr [ %1, %cleanup ], [ %arraydestroy.element, %[[ARRAY_DESTROY_BODY2]] ]
+ // CHECK-NEXT: %arraydestroy.element = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast, i64 -1
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element)
+ // CHECK-NEXT: %arraydestroy.done = icmp eq ptr %arraydestroy.element, %arrayinit.begin
+ // CHECK-NEXT: br i1 %arraydestroy.done, label %[[ARRAY_DESTROY_DONE2]], label %[[ARRAY_DESTROY_BODY2]]
+
+ // CHECK: [[ARRAY_DESTROY_DONE2]]:
+ // CHECK-NEXT: br label %[[ARRAY_DESTROY_DONE1]]
+}
+
+void ArraySubobjects() {
+ struct S {
+ Printy arr1[2];
+ Printy arr2[2];
+ Printy p;
+ };
+ // CHECK-LABEL: define dso_local void @_Z15ArraySubobjectsv()
+ // CHECK: %arrayinit.endOfInit = alloca ptr, align 8
+ S s{{Printy("a"), Printy("b")},
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ {Printy("a"),
+ // CHECK: [[ARRAYINIT_BEGIN:%.+]] = getelementptr inbounds [2 x %struct.Printy]
+ // CHECK: store ptr [[ARRAYINIT_BEGIN]], ptr %arrayinit.endOfInit, align 8
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ // CHECK: [[ARRAYINIT_ELEMENT:%.+]] = getelementptr inbounds %struct.Printy
+ // CHECK: store ptr [[ARRAYINIT_ELEMENT]], ptr %arrayinit.endOfInit, align 8
+ ({
+ if (foo()) {
+ return;
+ // CHECK: if.then:
+ // CHECK-NEXT: [[V0:%.+]] = load ptr, ptr %arrayinit.endOfInit, align 8
+ // CHECK-NEXT: %arraydestroy.isempty = icmp eq ptr [[ARRAYINIT_BEGIN]], [[V0]]
+ // CHECK-NEXT: br i1 %arraydestroy.isempty, label %[[ARRAY_DESTROY_DONE:.+]], label %[[ARRAY_DESTROY_BODY:.+]]
+ }
+ Printy("b");
+ })
+ },
+ Printy("c")
+ // CHECK: if.end:
+ // CHECK-NEXT: call void @_ZN6PrintyC1EPKc
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ // CHECK-NEXT: call void @_ZZ15ArraySubobjectsvEN1SD1Ev
+ // CHECK-NEXT: br label %return
+ };
+ // CHECK: return:
+ // CHECK-NEXT: ret void
+
+ // CHECK: [[ARRAY_DESTROY_BODY]]:
+ // CHECK-NEXT: %arraydestroy.elementPast = phi ptr [ %0, %if.then ], [ %arraydestroy.element, %[[ARRAY_DESTROY_BODY]] ]
+ // CHECK-NEXT: %arraydestroy.element = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast, i64 -1
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element)
+ // CHECK-NEXT: %arraydestroy.done = icmp eq ptr %arraydestroy.element, [[ARRAYINIT_BEGIN]]
+ // CHECK-NEXT: br i1 %arraydestroy.done, label %[[ARRAY_DESTROY_DONE]], label %[[ARRAY_DESTROY_BODY]]
+
+ // CHECK: [[ARRAY_DESTROY_DONE]]
+ // CHECK-NEXT: [[ARRAY_BEGIN:%.+]] = getelementptr inbounds [2 x %struct.Printy], ptr %arr1, i32 0, i32 0
+ // CHECK-NEXT: [[V1:%.+]] = getelementptr inbounds %struct.Printy, ptr [[ARRAY_BEGIN]], i64 2
+ // CHECK-NEXT: br label %[[ARRAY_DESTROY_BODY2:.+]]
+
+ // CHECK: [[ARRAY_DESTROY_BODY2]]:
+ // CHECK-NEXT: %arraydestroy.elementPast5 = phi ptr [ %1, %[[ARRAY_DESTROY_DONE]] ], [ %arraydestroy.element6, %[[ARRAY_DESTROY_BODY2]] ]
+ // CHECK-NEXT: %arraydestroy.element6 = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast5, i64 -1
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element6)
+ // CHECK-NEXT: %arraydestroy.done7 = icmp eq ptr %arraydestroy.element6, [[ARRAY_BEGIN]]
+ // CHECK-NEXT: br i1 %arraydestroy.done7, label %[[ARRAY_DESTROY_DONE2:.+]], label %[[ARRAY_DESTROY_BODY2]]
+
+
+ // CHECK: [[ARRAY_DESTROY_DONE2]]:
+ // CHECK-NEXT: br label %return
+}
+
+void LambdaInit() {
+ // CHECK-LABEL: define dso_local void @_Z10LambdaInitv()
+ auto S = [a = Printy("a"), b = ({
+ if (foo()) {
+ return;
+ // CHECK: if.then:
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev
+ // CHECK-NEXT: br label %return
+ }
+ Printy("b");
+ })]() { return a; };
+}
+
+struct PrintyRefBind {
+ const Printy &a;
+ const Printy &b;
+};
+
+struct Temp {
+ Temp();
+ ~Temp();
+};
+Temp CreateTemp();
+Printy CreatePrinty();
+Printy CreatePrinty(const Temp&);
+
+void LifetimeExtended() {
+ // CHECK-LABEL: define dso_local void @_Z16LifetimeExtendedv
+ PrintyRefBind ps = {Printy("a"), ({
+ if (foo()) {
+ return;
+ // CHECK: if.then:
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev
+ // CHECK-NEXT: br label %return
+ }
+ Printy("b");
+ })};
+}
+
+void ConditionalLifetimeExtended() {
+ // CHECK-LABEL: @_Z27ConditionalLifetimeExtendedv()
+
+ // Verify that we create two cleanup flags.
+  // 1. First for the cleanup which is deactivated after the full expression.
+ // 2. Second for the life-ext cleanup which is activated if the branch is taken.
+
+  // Note: We use `CreateTemp()` to ensure that the life-ext destroy cleanup is not
+  // at the top of the EHStack on deactivation. This forces the use of active flags.
+
+ Printy* p1 = nullptr;
+ // CHECK: store i1 false, ptr [[BRANCH1_DEFERRED:%cleanup.cond]], align 1
+ // CHECK-NEXT: store i1 false, ptr [[BRANCH1_LIFEEXT:%cleanup.cond.*]], align 1
+ PrintyRefBind ps = {
+ p1 != nullptr ? static_cast<const Printy&>(CreatePrinty())
+ // CHECK: cond.true:
+ // CHECK-NEXT: call void @_Z12CreatePrintyv
+ // CHECK-NEXT: store i1 true, ptr [[BRANCH1_DEFERRED]], align 1
+ // CHECK-NEXT: store i1 true, ptr [[BRANCH1_LIFEEXT]], align 1
+ // CHECK-NEXT: br label %{{.*}}
+ : foo() ? static_cast<const Printy&>(CreatePrinty(CreateTemp()))
+ : *p1,
+ ({
+ if (foo()) return;
+ Printy("c");
+ // CHECK: if.end:
+ // CHECK-NEXT: call void @_ZN6PrintyC1EPKc
+ // CHECK-NEXT: store ptr
+ })};
+ // CHECK-NEXT: store i1 false, ptr [[BRANCH1_DEFERRED]], align 1
+ // CHECK-NEXT: store i32 0, ptr %cleanup.dest.slot, align 4
+ // CHECK-NEXT: br label %cleanup
+
+}
+
+void NewArrayInit() {
+ // CHECK-LABEL: define dso_local void @_Z12NewArrayInitv()
+ // CHECK: %array.init.end = alloca ptr, align 8
+ // CHECK: store ptr %0, ptr %array.init.end, align 8
+ Printy *array = new Printy[3]{
+ "a",
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ // CHECK: store ptr %array.exp.next, ptr %array.init.end, align 8
+ "b",
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ // CHECK: store ptr %array.exp.next1, ptr %array.init.end, align 8
+ ({
+ if (foo()) {
+ return;
+ // CHECK: if.then:
+ // CHECK: br i1 %arraydestroy.isempty, label %arraydestroy.done{{.*}}, label %arraydestroy.body
+ }
+ "b";
+ // CHECK: if.end:
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ })};
+ // CHECK: arraydestroy.body:
+ // CHECK-NEXT: %arraydestroy.elementPast = phi ptr [ %{{.*}}, %if.then ], [ %arraydestroy.element, %arraydestroy.body ]
+ // CHECK-NEXT: %arraydestroy.element = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast, i64 -1
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element)
+ // CHECK-NEXT: %arraydestroy.done = icmp eq ptr %arraydestroy.element, %0
+ // CHECK-NEXT: br i1 %arraydestroy.done, label %arraydestroy.done{{.*}}, label %arraydestroy.body
+
+ // CHECK: arraydestroy.done{{.*}}: ; preds = %arraydestroy.body, %if.then
+ // CHECK-NEXT: br label %return
+}
+
+void DestroyInConditionalCleanup() {
+ // EH-LABEL: DestroyInConditionalCleanupv()
+ // NOEH-LABEL: DestroyInConditionalCleanupv()
+ struct A {
+ A() {}
+ ~A() {}
+ };
+
+ struct Value {
+ Value(A) {}
+ ~Value() {}
+ };
+
+ struct V2 {
+ Value K;
+ Value V;
+ };
+ // Verify we use conditional cleanups.
+ (void)(foo() ? V2{A(), A()} : V2{A(), A()});
+ // NOEH: cond.true:
+ // NOEH: call void @_ZZ27DestroyInConditionalCleanupvEN1AC1Ev
+ // NOEH: store ptr %{{.*}}, ptr %cond-cleanup.save
+
+ // EH: cond.true:
+ // EH: invoke void @_ZZ27DestroyInConditionalCleanupvEN1AC1Ev
+ // EH: store ptr %{{.*}}, ptr %cond-cleanup.save
+}
+
+void ArrayInitWithContinue() {
+ // CHECK-LABEL: @_Z21ArrayInitWithContinuev
+ // Verify that we start to emit the array destructor.
+ // CHECK: %arrayinit.endOfInit = alloca ptr, align 8
+ for (int i = 0; i < 1; ++i) {
+ Printy arr[2] = {"a", ({
+ if (foo()) {
+ continue;
+ }
+ "b";
+ })};
+ }
+}
+
+struct [[clang::trivial_abi]] HasTrivialABI {
+ HasTrivialABI();
+ ~HasTrivialABI();
+};
+void AcceptTrivialABI(HasTrivialABI, int);
+void TrivialABI() {
+ // CHECK-LABEL: define dso_local void @_Z10TrivialABIv()
+ AcceptTrivialABI(HasTrivialABI(), ({
+ if (foo()) return;
+ // CHECK: if.then:
+ // CHECK-NEXT: call void @_ZN13HasTrivialABID1Ev
+ // CHECK-NEXT: br label %return
+ 0;
+ }));
+}
+
+namespace CleanupFlag {
+struct A {
+ A() {}
+ ~A() {}
+};
+
+struct B {
+ B(const A&) {}
+ B() {}
+ ~B() {}
+};
+
+struct S {
+ A a;
+ B b;
+};
+
+int AcceptS(S s);
+
+void Accept2(int x, int y);
+
+void InactiveNormalCleanup() {
+ // CHECK-LABEL: define {{.*}}InactiveNormalCleanupEv()
+
+ // The first A{} below is an inactive normal cleanup which
+ // is not popped from EHStack on deactivation. This needs an
+ // "active" cleanup flag.
+
+ // CHECK: [[ACTIVE:%cleanup.isactive.*]] = alloca i1, align 1
+ // CHECK: call void [[A_CTOR:@.*AC1Ev]]
+ // CHECK: store i1 true, ptr [[ACTIVE]], align 1
+ // CHECK: call void [[A_CTOR]]
+ // CHECK: call void [[B_CTOR:@.*BC1ERKNS_1AE]]
+ // CHECK: store i1 false, ptr [[ACTIVE]], align 1
+ // CHECK: call noundef i32 [[ACCEPTS:@.*AcceptSENS_1SE]]
+ Accept2(AcceptS({.a = A{}, .b = A{}}), ({
+ if (foo()) return;
+ // CHECK: if.then:
+ // CHECK: br label %cleanup
+ 0;
+ // CHECK: if.end:
+ // CHECK: call void [[ACCEPT2:@.*Accept2Eii]]
+ // CHECK: br label %cleanup
+ }));
+ // CHECK: cleanup:
+ // CHECK: call void [[S_DTOR:@.*SD1Ev]]
+ // CHECK: call void [[A_DTOR:@.*AD1Ev]]
+ // CHECK: %cleanup.is_active = load i1, ptr [[ACTIVE]]
+ // CHECK: br i1 %cleanup.is_active, label %cleanup.action, label %cleanup.done
+
+ // CHECK: cleanup.action:
+ // CHECK: call void [[A_DTOR]]
+
+ // The "active" cleanup flag is not required for unused cleanups.
+ Accept2(AcceptS({.a = A{}, .b = A{}}), 0);
+ // CHECK: cleanup.cont:
+ // CHECK: call void [[A_CTOR]]
+ // CHECK-NOT: store i1 true
+ // CHECK: call void [[A_CTOR]]
+ // CHECK: call void [[B_CTOR]]
+ // CHECK-NOT: store i1 false
+ // CHECK: call noundef i32 [[ACCEPTS]]
+ // CHECK: call void [[ACCEPT2]]
+ // CHECK: call void [[S_DTOR]]
+ // CHECK: call void [[A_DTOR]]
+ // CHECK: br label %return
+}
+} // namespace CleanupFlag
diff --git a/clang/test/CodeGenCXX/pragma-gcc-unroll.cpp b/clang/test/CodeGenCXX/pragma-gcc-unroll.cpp
index 8a94a5cc91e2..85f10fcdff14 100644
--- a/clang/test/CodeGenCXX/pragma-gcc-unroll.cpp
+++ b/clang/test/CodeGenCXX/pragma-gcc-unroll.cpp
@@ -116,6 +116,34 @@ void while_unroll_zero_test(int *List, int Length) {
}
}
+using size_t = unsigned long long;
+
+template <bool Flag>
+int value_dependent(int n) {
+ // CHECK: define {{.*}} @_Z15value_dependentILb1EEii
+ constexpr int N = 100;
+ auto init = [=]() { return Flag ? n : 0UL; };
+ auto cond = [=](size_t ix) { return Flag ? ix != 0 : ix < 10; };
+ auto iter = [=](size_t ix) {
+ return Flag ? ix & ~(1ULL << __builtin_clzll(ix)) : ix + 1;
+ };
+#pragma GCC unroll Flag ? 1 : N
+ for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+ // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_16:.*]]
+ n *= n;
+ }
+#pragma GCC unroll Flag ? 0 : N
+ for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+ // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_17:.*]]
+ n *= n;
+ }
+ return n;
+}
+
+void test_value_dependent(int n) {
+ value_dependent<true>(n);
+}
+
// CHECK: ![[LOOP_1]] = distinct !{![[LOOP_1]], [[MP:![0-9]+]], ![[UNROLL_ENABLE:.*]]}
// CHECK: ![[UNROLL_ENABLE]] = !{!"llvm.loop.unroll.enable"}
// CHECK: ![[LOOP_2]] = distinct !{![[LOOP_2:.*]], ![[UNROLL_DISABLE:.*]]}
@@ -129,3 +157,5 @@ void while_unroll_zero_test(int *List, int Length) {
// CHECK: ![[LOOP_7]] = distinct !{![[LOOP_7]], ![[UNROLL_8:.*]]}
// CHECK: ![[LOOP_14]] = distinct !{![[LOOP_14]], [[MP]], ![[UNROLL_DISABLE:.*]]}
// CHECK: ![[LOOP_15]] = distinct !{![[LOOP_15]], [[MP]], ![[UNROLL_DISABLE:.*]]}
+// CHECK: ![[LOOP_16]] = distinct !{![[LOOP_16]], [[MP]], ![[UNROLL_DISABLE:.*]]}
+// CHECK: ![[LOOP_17]] = distinct !{![[LOOP_17]], [[MP]], ![[UNROLL_DISABLE:.*]]}
diff --git a/clang/test/CodeGenCXX/pragma-unroll.cpp b/clang/test/CodeGenCXX/pragma-unroll.cpp
index 02d9bad7148d..6754788b7243 100644
--- a/clang/test/CodeGenCXX/pragma-unroll.cpp
+++ b/clang/test/CodeGenCXX/pragma-unroll.cpp
@@ -96,6 +96,54 @@ void template_test(double *List, int Length) {
for_template_define_test<double>(List, Length, Value);
}
+void for_unroll_zero_test(int *List, int Length) {
+ // CHECK: define {{.*}} @_Z20for_unroll_zero_testPii
+ #pragma unroll 0
+ for (int i = 0; i < Length; i++) {
+ // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_14:.*]]
+ List[i] = i * 2;
+ }
+}
+
+void while_unroll_zero_test(int *List, int Length) {
+ // CHECK: define {{.*}} @_Z22while_unroll_zero_testPii
+ int i = 0;
+#pragma unroll(0)
+ while (i < Length) {
+ // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_15:.*]]
+ List[i] = i * 2;
+ i++;
+ }
+}
+
+using size_t = unsigned long long;
+
+template <bool Flag>
+int value_dependent(int n) {
+ // CHECK: define {{.*}} @_Z15value_dependentILb1EEii
+ constexpr int N = 100;
+ auto init = [=]() { return Flag ? n : 0UL; };
+ auto cond = [=](size_t ix) { return Flag ? ix != 0 : ix < 10; };
+ auto iter = [=](size_t ix) {
+ return Flag ? ix & ~(1ULL << __builtin_clzll(ix)) : ix + 1;
+ };
+#pragma unroll Flag ? 1 : N
+ for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+ // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_16:.*]]
+ n *= n;
+ }
+#pragma unroll Flag ? 0 : N
+ for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+ // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_17:.*]]
+ n *= n;
+ }
+ return n;
+}
+
+void test_value_dependent(int n) {
+ value_dependent<true>(n);
+}
+
// CHECK: ![[LOOP_1]] = distinct !{![[LOOP_1]], [[MP:![0-9]+]], ![[UNROLL_ENABLE:.*]]}
// CHECK: ![[UNROLL_ENABLE]] = !{!"llvm.loop.unroll.enable"}
// CHECK: ![[LOOP_2]] = distinct !{![[LOOP_2:.*]], ![[UNROLL_DISABLE:.*]]}
@@ -107,3 +155,7 @@ void template_test(double *List, int Length) {
// CHECK: ![[LOOP_5]] = distinct !{![[LOOP_5]], ![[UNROLL_8:.*]]}
// CHECK: ![[LOOP_6]] = distinct !{![[LOOP_6]], ![[UNROLL_8:.*]]}
// CHECK: ![[LOOP_7]] = distinct !{![[LOOP_7]], ![[UNROLL_8:.*]]}
+// CHECK: ![[LOOP_14]] = distinct !{![[LOOP_14]], [[MP]], ![[UNROLL_DISABLE:.*]]}
+// CHECK: ![[LOOP_15]] = distinct !{![[LOOP_15]], [[MP]], ![[UNROLL_DISABLE:.*]]}
+// CHECK: ![[LOOP_16]] = distinct !{![[LOOP_16]], [[MP]], ![[UNROLL_DISABLE:.*]]}
+// CHECK: ![[LOOP_17]] = distinct !{![[LOOP_17]], [[MP]], ![[UNROLL_DISABLE:.*]]}
diff --git a/clang/test/CodeGenCoroutines/coro-elide-thinlto.cpp b/clang/test/CodeGenCoroutines/coro-elide-thinlto.cpp
new file mode 100644
index 000000000000..293aef678167
--- /dev/null
+++ b/clang/test/CodeGenCoroutines/coro-elide-thinlto.cpp
@@ -0,0 +1,77 @@
+// This tests that the coroutine elide optimization can happen successfully with ThinLTO.
+// This test is adapted from coro-elide.cpp and splits functions into two files.
+//
+// RUN: split-file %s %t
+// RUN: %clang --target=x86_64-linux -std=c++20 -O2 -flto=thin -I %S -c %t/coro-elide-callee.cpp -o coro-elide-callee.o
+// RUN: %clang --target=x86_64-linux -std=c++20 -O2 -flto=thin -I %S -c %t/coro-elide-caller.cpp -o coro-elide-caller.o
+// RUN: llvm-lto -thinlto coro-elide-callee.o coro-elide-caller.o -o summary
+// RUN: %clang_cc1 -O2 -x ir coro-elide-caller.o -fthinlto-index=summary.thinlto.bc -emit-llvm -o - | FileCheck %s
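+//
+// The llvm-lto step builds the ThinLTO summary, which lets the final
+// %clang_cc1 invocation import task0 into task1 so coroutine elision can fire.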
+
+//--- coro-elide-task.h
+#pragma once
+#include "Inputs/coroutine.h"
+
+struct Task {
+ struct promise_type {
+ struct FinalAwaiter {
+ bool await_ready() const noexcept { return false; }
+ template <typename PromiseType>
+ std::coroutine_handle<> await_suspend(std::coroutine_handle<PromiseType> h) noexcept {
+ if (!h)
+ return std::noop_coroutine();
+ return h.promise().continuation;
+ }
+ void await_resume() noexcept {}
+ };
+ Task get_return_object() noexcept {
+ return std::coroutine_handle<promise_type>::from_promise(*this);
+ }
+ std::suspend_always initial_suspend() noexcept { return {}; }
+ FinalAwaiter final_suspend() noexcept { return {}; }
+ void unhandled_exception() noexcept {}
+ void return_value(int x) noexcept {
+ _value = x;
+ }
+ std::coroutine_handle<> continuation;
+ int _value;
+ };
+
+ Task(std::coroutine_handle<promise_type> handle) : handle(handle) {}
+ ~Task() {
+ if (handle)
+ handle.destroy();
+ }
+
+ struct Awaiter {
+ bool await_ready() const noexcept { return false; }
+ void await_suspend(std::coroutine_handle<void> continuation) noexcept {}
+ int await_resume() noexcept {
+ return 43;
+ }
+ };
+
+ auto operator co_await() {
+ return Awaiter{};
+ }
+
+private:
+ std::coroutine_handle<promise_type> handle;
+};
+
+//--- coro-elide-callee.cpp
+#include "coro-elide-task.h"
+Task task0() {
+ co_return 43;
+}
+
+//--- coro-elide-caller.cpp
+#include "coro-elide-task.h"
+
+Task task0();
+
+Task task1() {
+ co_return co_await task0();
+}
+
+// CHECK-LABEL: define{{.*}} void @_Z5task1v.resume
+// CHECK-NOT: {{.*}}_Znwm
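+// _Znwm is the Itanium mangling of operator new(unsigned long); its absence
+// shows that the frame allocation for the awaited task0 coroutine was elided.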
diff --git a/clang/test/CodeGenCoroutines/coro-suspend-cleanups.cpp b/clang/test/CodeGenCoroutines/coro-suspend-cleanups.cpp
new file mode 100644
index 000000000000..06cc2069dbe9
--- /dev/null
+++ b/clang/test/CodeGenCoroutines/coro-suspend-cleanups.cpp
@@ -0,0 +1,93 @@
+// RUN: %clang_cc1 --std=c++20 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s
+
+#include "Inputs/coroutine.h"
+
+struct Printy {
+ Printy(const char *name) : name(name) {}
+ ~Printy() {}
+ const char *name;
+};
+
+struct coroutine {
+ struct promise_type;
+ std::coroutine_handle<promise_type> handle;
+ ~coroutine() {
+ if (handle) handle.destroy();
+ }
+};
+
+struct coroutine::promise_type {
+ coroutine get_return_object() {
+ return {std::coroutine_handle<promise_type>::from_promise(*this)};
+ }
+ std::suspend_never initial_suspend() noexcept { return {}; }
+ std::suspend_always final_suspend() noexcept { return {}; }
+ void return_void() {}
+ void unhandled_exception() {}
+};
+
+struct Awaiter : std::suspend_always {
+ Printy await_resume() { return {"awaited"}; }
+};
+
+int foo() { return 2; }
+
+coroutine ArrayInitCoro() {
+ // Verify that:
+ // - We do the necessary stores for array cleanups.
+ // - Array cleanups are called by await.cleanup.
+ // - We activate the cleanup after the first element and deactivate it in await.ready (see cleanup.isactive).
+
+ // CHECK-LABEL: define dso_local void @_Z13ArrayInitCorov
+ // CHECK: %arrayinit.endOfInit = alloca ptr, align 8
+ // CHECK: %cleanup.isactive = alloca i1, align 1
+ Printy arr[2] = {
+ Printy("a"),
+ // CHECK: %arrayinit.begin = getelementptr inbounds [2 x %struct.Printy], ptr %arr.reload.addr, i64 0, i64 0
+ // CHECK-NEXT: %arrayinit.begin.spill.addr = getelementptr inbounds %_Z13ArrayInitCorov.Frame, ptr %0, i32 0, i32 10
+ // CHECK-NEXT: store ptr %arrayinit.begin, ptr %arrayinit.begin.spill.addr, align 8
+ // CHECK-NEXT: store i1 true, ptr %cleanup.isactive.reload.addr, align 1
+ // CHECK-NEXT: store ptr %arrayinit.begin, ptr %arrayinit.endOfInit.reload.addr, align 8
+ // CHECK-NEXT: call void @_ZN6PrintyC1EPKc(ptr noundef nonnull align 8 dereferenceable(8) %arrayinit.begin, ptr noundef @.str)
+ // CHECK-NEXT: %arrayinit.element = getelementptr inbounds %struct.Printy, ptr %arrayinit.begin, i64 1
+ // CHECK-NEXT: %arrayinit.element.spill.addr = getelementptr inbounds %_Z13ArrayInitCorov.Frame, ptr %0, i32 0, i32 11
+ // CHECK-NEXT: store ptr %arrayinit.element, ptr %arrayinit.element.spill.addr, align 8
+ // CHECK-NEXT: store ptr %arrayinit.element, ptr %arrayinit.endOfInit.reload.addr, align 8
+ co_await Awaiter{}
+ // CHECK-NEXT: @_ZNSt14suspend_always11await_readyEv
+ // CHECK-NEXT: br i1 %{{.+}}, label %await.ready, label %CoroSave30
+ };
+ // CHECK: await.cleanup: ; preds = %AfterCoroSuspend{{.*}}
+ // CHECK-NEXT: br label %cleanup{{.*}}.from.await.cleanup
+
+ // CHECK: cleanup{{.*}}.from.await.cleanup: ; preds = %await.cleanup
+ // CHECK: br label %cleanup{{.*}}
+
+ // CHECK: await.ready:
+ // CHECK-NEXT: %arrayinit.element.reload.addr = getelementptr inbounds %_Z13ArrayInitCorov.Frame, ptr %0, i32 0, i32 11
+ // CHECK-NEXT: %arrayinit.element.reload = load ptr, ptr %arrayinit.element.reload.addr, align 8
+ // CHECK-NEXT: call void @_ZN7Awaiter12await_resumeEv
+ // CHECK-NEXT: store i1 false, ptr %cleanup.isactive.reload.addr, align 1
+ // CHECK-NEXT: br label %cleanup{{.*}}.from.await.ready
+
+ // CHECK: cleanup{{.*}}: ; preds = %cleanup{{.*}}.from.await.ready, %cleanup{{.*}}.from.await.cleanup
+ // CHECK: %cleanup.is_active = load i1, ptr %cleanup.isactive.reload.addr, align 1
+ // CHECK-NEXT: br i1 %cleanup.is_active, label %cleanup.action, label %cleanup.done
+
+ // CHECK: cleanup.action:
+ // CHECK: %arraydestroy.isempty = icmp eq ptr %arrayinit.begin.reload{{.*}}, %{{.*}}
+ // CHECK-NEXT: br i1 %arraydestroy.isempty, label %arraydestroy.done{{.*}}, label %arraydestroy.body.from.cleanup.action
+  // Ignore the rest of the array cleanup.
+}
+
+coroutine ArrayInitWithCoReturn() {
+ // CHECK-LABEL: define dso_local void @_Z21ArrayInitWithCoReturnv
+ // Verify that we start to emit the array destructor.
+ // CHECK: %arrayinit.endOfInit = alloca ptr, align 8
+ Printy arr[2] = {"a", ({
+ if (foo()) {
+ co_return;
+ }
+ "b";
+ })};
+}
diff --git a/clang/test/CodeGenObjC/arc-blocks-exceptions.m b/clang/test/CodeGenObjC/arc-blocks-exceptions.m
index 821b818d4027..54b043d8ea07 100644
--- a/clang/test/CodeGenObjC/arc-blocks-exceptions.m
+++ b/clang/test/CodeGenObjC/arc-blocks-exceptions.m
@@ -5,17 +5,22 @@ void test1(_Bool c) {
__weak id weakId = 0;
test1_fn(c ? ^{ (void)weakId; } : 0);
- // CHECK: [[CLEANUP_COND:%.*]] = alloca i1
- // CHECK-NEXT: [[CLEANUP_SAVE:%.*]] = alloca ptr
+ // CHECK: [[CLEANUP_SAVE:%cond-cleanup.save.*]] = alloca ptr
+ // CHECK-NEXT: [[CLEANUP_COND:%.*]] = alloca i1
+ // CHECK-NEXT: [[CLEANUP_COND1:%.*]] = alloca i1
- // CHECK: store i1 true, ptr [[CLEANUP_COND]]
- // CHECK-NEXT: store ptr {{.*}}, ptr [[CLEANUP_SAVE]]
+ // CHECK: store i1 false, ptr [[CLEANUP_COND]]
+ // CHECK-NEXT: store i1 false, ptr [[CLEANUP_COND1]]
+
+ // CHECK: store ptr {{.*}}, ptr [[CLEANUP_SAVE]]
+ // CHECK-NEXT: store i1 true, ptr [[CLEANUP_COND]]
+ // CHECK-NEXT: store i1 true, ptr [[CLEANUP_COND1]]
// CHECK: invoke void @test1_fn(
// CHECK-NEXT: to label %[[INVOKE_CONT:.*]] unwind label %[[LANDING_PAD_LAB:.*]]
// CHECK: [[INVOKE_CONT]]:
- // CHECK-NEXT: [[LOAD:%.*]] = load i1, ptr [[CLEANUP_COND]]
+ // CHECK-NEXT: [[LOAD:%.*]] = load i1, ptr [[CLEANUP_COND1]]
// CHECK-NEXT: br i1 [[LOAD]], label %[[END_OF_SCOPE_LAB:.*]], label
// CHECK: [[END_OF_SCOPE_LAB]]:
diff --git a/clang/test/CodeGenObjC/arc-blocks.m b/clang/test/CodeGenObjC/arc-blocks.m
index 105a72b4af1e..f718e8bbf9a6 100644
--- a/clang/test/CodeGenObjC/arc-blocks.m
+++ b/clang/test/CodeGenObjC/arc-blocks.m
@@ -445,8 +445,8 @@ void test13(id x) {
// CHECK: [[X:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[B:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[BLOCK:%.*]] = alloca [[BLOCK_T:.*]], align 8
- // CHECK-NEXT: [[CLEANUP_ACTIVE:%.*]] = alloca i1
// CHECK-NEXT: [[COND_CLEANUP_SAVE:%.*]] = alloca ptr,
+ // CHECK-NEXT: [[CLEANUP_ACTIVE:%.*]] = alloca i1
// CHECK-NEXT: [[T0:%.*]] = call ptr @llvm.objc.retain(ptr {{%.*}})
// CHECK-NEXT: store ptr [[T0]], ptr [[X]], align 8
// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[B]])
@@ -460,8 +460,8 @@ void test13(id x) {
// CHECK-NEXT: [[T0:%.*]] = load ptr, ptr [[X]], align 8
// CHECK-NEXT: [[T1:%.*]] = call ptr @llvm.objc.retain(ptr [[T0]])
// CHECK-NEXT: store ptr [[T1]], ptr [[CAPTURE]], align 8
- // CHECK-NEXT: store i1 true, ptr [[CLEANUP_ACTIVE]]
// CHECK-NEXT: store ptr [[CAPTURE]], ptr [[COND_CLEANUP_SAVE]], align 8
+ // CHECK-NEXT: store i1 true, ptr [[CLEANUP_ACTIVE]]
// CHECK-NEXT: br label
// CHECK: br label
// CHECK: [[T0:%.*]] = phi ptr
diff --git a/clang/test/Driver/fp-model.c b/clang/test/Driver/fp-model.c
index a464729edb45..9d1245239911 100644
--- a/clang/test/Driver/fp-model.c
+++ b/clang/test/Driver/fp-model.c
@@ -64,7 +64,8 @@
// RUN: %clang -### -ffp-model=strict -fdenormal-fp-math=preserve-sign,preserve-sign -c %s 2>&1 \
// RUN: | FileCheck --check-prefix=WARN10 %s
-// WARN10: warning: overriding '-ffp-model=strict' option with '-fdenormal-fp-math=preserve-sign,preserve-sign' [-Woverriding-option]
+// WARN10: "-cc1"
+// WARN10-NOT: warning: overriding '-ffp-model=strict' option with '-fdenormal-fp-math=preserve-sign,preserve-sign' [-Woverriding-option]
// RUN: %clang -### -ffp-model=fast -ffp-model=strict -c %s 2>&1 | FileCheck \
// RUN: --check-prefix=WARN11 %s
@@ -128,6 +129,7 @@
// RUN: | FileCheck --check-prefix=CHECK-NO-EXCEPT %s
// RUN: %clang -### -nostdinc -ffp-model=strict -Ofast -c %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-NO-EXCEPT %s
+// CHECK-NO-EXCEPT: "-cc1"
// CHECK-NO-EXCEPT-NOT: "-ffp-exception-behavior=strict"
// RUN: %clang -### -nostdinc -ffp-exception-behavior=strict -c %s 2>&1 \
diff --git a/clang/test/Driver/linux-ld.c b/clang/test/Driver/linux-ld.c
index e2043ab22afc..28fb075a80db 100644
--- a/clang/test/Driver/linux-ld.c
+++ b/clang/test/Driver/linux-ld.c
@@ -1406,6 +1406,9 @@
// RUN: %clang --target=x86_64-unknown-linux -no-pie -### %s -funsafe-math-optimizations\
// RUN: --sysroot=%S/Inputs/basic_linux_tree 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-CRTFASTMATH %s
+// RUN: %clang --target=x86_64-unknown-linux -no-pie -### %s -ffp-model=fast \
+// RUN: --sysroot=%S/Inputs/basic_linux_tree 2>&1 \
+// RUN: | FileCheck --check-prefix=CHECK-CRTFASTMATH %s
// RUN: %clang --target=x86_64-unknown-linux -no-pie -### %s -Ofast\
// RUN: --sysroot=%S/Inputs/basic_linux_tree 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-CRTFASTMATH %s
diff --git a/clang/test/Driver/riscv-arch.c b/clang/test/Driver/riscv-arch.c
index 8399b4e97f86..abbe8612b378 100644
--- a/clang/test/Driver/riscv-arch.c
+++ b/clang/test/Driver/riscv-arch.c
@@ -209,7 +209,7 @@
// RUN: not %clang --target=riscv32-unknown-elf -march=rv32q -### %s \
// RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-LETTER %s
// RV32-LETTER: error: invalid arch name 'rv32q',
-// RV32-LETTER: first letter should be 'e', 'i' or 'g'
+// RV32-LETTER: first letter after 'rv32' should be 'e', 'i' or 'g'
// RUN: not %clang --target=riscv32-unknown-elf -march=rv32imcq -### %s \
// RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-ORDER %s
@@ -239,12 +239,12 @@
// RUN: not %clang --target=riscv32-unknown-elf -march=rv32xabc -### %s \
// RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32X %s
// RV32X: error: invalid arch name 'rv32xabc',
-// RV32X: first letter should be 'e', 'i' or 'g'
+// RV32X: first letter after 'rv32' should be 'e', 'i' or 'g'
// RUN: not %clang --target=riscv32-unknown-elf -march=rv32sabc -### %s \
// RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32S %s
// RV32S: error: invalid arch name 'rv32sabc',
-// RV32S: first letter should be 'e', 'i' or 'g'
+// RV32S: first letter after 'rv32' should be 'e', 'i' or 'g'
// RUN: not %clang --target=riscv32-unknown-elf -march=rv32ix -### %s \
// RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32X-NAME %s
diff --git a/clang/test/Driver/solaris-ld.c b/clang/test/Driver/solaris-ld.c
index 6d74389e8922..ce0728d392bf 100644
--- a/clang/test/Driver/solaris-ld.c
+++ b/clang/test/Driver/solaris-ld.c
@@ -193,6 +193,9 @@
// RUN: %clang --target=sparc-sun-solaris2.11 -### %s -ffast-math \
// RUN: --sysroot=%S/Inputs/solaris_sparc_tree 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-CRTFASTMATH-SPARC32 %s
+// RUN: %clang --target=sparc-sun-solaris2.11 -### %s -ffp-model=fast \
+// RUN: --sysroot=%S/Inputs/solaris_sparc_tree 2>&1 \
+// RUN: | FileCheck --check-prefix=CHECK-CRTFASTMATH-SPARC32 %s
// CHECK-CRTFASTMATH-SPARC32: "-isysroot" "[[SYSROOT:[^"]+]]"
// CHECK-CRTFASTMATH-SPARC32: "[[SYSROOT]]/usr/gcc/4.8/lib/gcc/sparc-sun-solaris2.11/4.8.2{{/|\\\\}}crtfastmath.o"
// CHECK-NOCRTFASTMATH-SPARC32-NOT: crtfastmath.o
diff --git a/clang/test/Parser/pragma-unroll.cpp b/clang/test/Parser/pragma-unroll.cpp
index f41bd7a18d5a..19066acddcef 100644
--- a/clang/test/Parser/pragma-unroll.cpp
+++ b/clang/test/Parser/pragma-unroll.cpp
@@ -124,3 +124,32 @@ void test(int *List, int Length) {
#pragma unroll
/* expected-error {{expected statement}} */ }
+
+using size_t = unsigned long long;
+
+template <bool Flag>
+int FailToBuild(int n) {
+ constexpr int N = 100;
+ auto init = [=]() { return Flag ? n : 0UL; };
+ auto cond = [=](size_t ix) { return Flag ? ix != 0 : ix < 10; };
+ auto iter = [=](size_t ix) {
+ return Flag ? ix & ~(1ULL << __builtin_clzll(ix)) : ix + 1;
+ };
+#pragma unroll Flag ? 0 : N // Ok, allow 0.
+ for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+ n *= n;
+ }
+#pragma GCC unroll Flag ? 0 : N // Ok, allow 0.
+ for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+ n *= n;
+ }
+ return n;
+}
+
+int foo(int n) {
+ return FailToBuild<true>(n);
+}
+
+int bar(int n) {
+ return FailToBuild<false>(n);
+}
diff --git a/clang/test/Preprocessor/wasm-target-features.c b/clang/test/Preprocessor/wasm-target-features.c
index 32e24ad1b716..19bd918543df 100644
--- a/clang/test/Preprocessor/wasm-target-features.c
+++ b/clang/test/Preprocessor/wasm-target-features.c
@@ -152,7 +152,9 @@
// RUN: -target wasm64-unknown-unknown -mcpu=generic \
// RUN: | FileCheck %s -check-prefix=GENERIC-INCLUDE
//
+// GENERIC-INCLUDE-DAG: #define __wasm_multivalue__ 1{{$}}
// GENERIC-INCLUDE-DAG: #define __wasm_mutable_globals__ 1{{$}}
+// GENERIC-INCLUDE-DAG: #define __wasm_reference_types__ 1{{$}}
// GENERIC-INCLUDE-DAG: #define __wasm_sign_ext__ 1{{$}}
//
// RUN: %clang -E -dM %s -o - 2>&1 \
@@ -167,9 +169,7 @@
// GENERIC-NOT: #define __wasm_exception_handling__ 1{{$}}
// GENERIC-NOT: #define __wasm_extended_const__ 1{{$}}
// GENERIC-NOT: #define __wasm_multimemory__ 1{{$}}
-// GENERIC-NOT: #define __wasm_multivalue__ 1{{$}}
// GENERIC-NOT: #define __wasm_nontrapping_fptoint__ 1{{$}}
-// GENERIC-NOT: #define __wasm_reference_types__ 1{{$}}
// GENERIC-NOT: #define __wasm_relaxed_simd__ 1{{$}}
// GENERIC-NOT: #define __wasm_simd128__ 1{{$}}
// GENERIC-NOT: #define __wasm_tail_call__ 1{{$}}
@@ -184,6 +184,7 @@
// BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_atomics__ 1{{$}}
// BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_bulk_memory__ 1{{$}}
// BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_multimemory__ 1{{$}}
+// BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_multivalue__ 1{{$}}
// BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_mutable_globals__ 1{{$}}
// BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_nontrapping_fptoint__ 1{{$}}
// BLEEDING-EDGE-INCLUDE-DAG: #define __wasm_reference_types__ 1{{$}}
@@ -200,7 +201,6 @@
//
// BLEEDING-EDGE-NOT: #define __wasm_exception_handling__ 1{{$}}
// BLEEDING-EDGE-NOT: #define __wasm_extended_const__ 1{{$}}
-// BLEEDING-EDGE-NOT: #define __wasm_multivalue__ 1{{$}}
// BLEEDING-EDGE-NOT: #define __wasm_relaxed_simd__ 1{{$}}
// RUN: %clang -E -dM %s -o - 2>&1 \
diff --git a/clang/test/Sema/constant_builtins_vector.cpp b/clang/test/Sema/constant_builtins_vector.cpp
new file mode 100644
index 000000000000..ddb78696ce62
--- /dev/null
+++ b/clang/test/Sema/constant_builtins_vector.cpp
@@ -0,0 +1,725 @@
+// RUN: %clang_cc1 -verify -std=c++2a -fsyntax-only -Wno-bit-int-extension %s
+// RUN: %clang_cc1 -verify -std=c++2a -fsyntax-only -Wno-bit-int-extension -triple ppc64-unknown-linux %s
+// RUN: %clang_cc1 -verify -std=c++2a -fsyntax-only -Wno-bit-int-extension -triple ppc64le-unknown-linux %s
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define LITTLE_END 1
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define LITTLE_END 0
+#else
+#error "huh?"
+#endif
+
+// _BitInt element types are also supported, as long as the bit width is >= 8
+// and a power of 2.
+typedef _BitInt(8) BitInt8;
+typedef _BitInt(32) BitInt32;
+typedef _BitInt(128) BitInt128;
+
+typedef double vector4double __attribute__((__vector_size__(32)));
+typedef float vector4float __attribute__((__vector_size__(16)));
+typedef long long vector4long __attribute__((__vector_size__(32)));
+typedef int vector4int __attribute__((__vector_size__(16)));
+typedef short vector4short __attribute__((__vector_size__(8)));
+typedef char vector4char __attribute__((__vector_size__(4)));
+typedef BitInt8 vector4BitInt8 __attribute__((__vector_size__(4)));
+typedef BitInt32 vector4BitInt32 __attribute__((__vector_size__(16)));
+typedef BitInt128 vector4BitInt128 __attribute__((__vector_size__(64)));
+typedef double vector8double __attribute__((__vector_size__(64)));
+typedef float vector8float __attribute__((__vector_size__(32)));
+typedef long long vector8long __attribute__((__vector_size__(64)));
+typedef int vector8int __attribute__((__vector_size__(32)));
+typedef short vector8short __attribute__((__vector_size__(16)));
+typedef char vector8char __attribute__((__vector_size__(8)));
+typedef BitInt8 vector8BitInt8 __attribute__((__vector_size__(8)));
+typedef BitInt32 vector8BitInt32 __attribute__((__vector_size__(32)));
+typedef BitInt128 vector8BitInt128 __attribute__((__vector_size__(128)));
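+// Note that __vector_size__ is in bytes: e.g. vector4BitInt128 is four
+// 16-byte lanes (64 bytes) and vector8BitInt128 is eight (128 bytes).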
+
+#define CHECK_NUM(__size, __typeFrom, __typeTo, ...) \
+ constexpr vector##__size##__typeTo \
+ from_##vector##__size##__typeFrom##_to_##vector##__size##__typeTo##_var = \
+ __builtin_convertvector((vector##__size##__typeFrom){__VA_ARGS__}, \
+ vector##__size##__typeTo);
+#define CHECK_TO_ALL_TYPES(__size, __typeFrom, ...) \
+ CHECK_NUM(__size, __typeFrom, double, __VA_ARGS__) \
+ CHECK_NUM(__size, __typeFrom, float, __VA_ARGS__) \
+ CHECK_NUM(__size, __typeFrom, long, __VA_ARGS__) \
+ CHECK_NUM(__size, __typeFrom, int, __VA_ARGS__) \
+ CHECK_NUM(__size, __typeFrom, short, __VA_ARGS__) \
+ CHECK_NUM(__size, __typeFrom, char, __VA_ARGS__) \
+ CHECK_NUM(__size, __typeFrom, BitInt8, __VA_ARGS__) \
+ CHECK_NUM(__size, __typeFrom, BitInt32, __VA_ARGS__) \
+ CHECK_NUM(__size, __typeFrom, BitInt128, __VA_ARGS__) \
+ static_assert( \
+ __builtin_bit_cast( \
+ unsigned, \
+ __builtin_shufflevector( \
+ from_vector##__size##__typeFrom##_to_vector##__size##char_var, \
+ from_vector##__size##__typeFrom##_to_vector##__size##char_var, \
+ 0, 1, 2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203)); \
+ static_assert( \
+ __builtin_bit_cast( \
+ unsigned long long, \
+ __builtin_shufflevector( \
+ from_vector##__size##__typeFrom##_to_vector##__size##short_var, \
+ from_vector##__size##__typeFrom##_to_vector##__size##short_var, \
+ 0, 1, 2, 3)) == \
+ (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+
+#define CHECK_ALL_COMBINATIONS(__size, ...) \
+ CHECK_TO_ALL_TYPES(__size, double, __VA_ARGS__) \
+ CHECK_TO_ALL_TYPES(__size, float, __VA_ARGS__) \
+ CHECK_TO_ALL_TYPES(__size, long, __VA_ARGS__) \
+ CHECK_TO_ALL_TYPES(__size, int, __VA_ARGS__) \
+ CHECK_TO_ALL_TYPES(__size, short, __VA_ARGS__) \
+ CHECK_TO_ALL_TYPES(__size, char, __VA_ARGS__) \
+ CHECK_TO_ALL_TYPES(__size, BitInt8, __VA_ARGS__) \
+ CHECK_TO_ALL_TYPES(__size, BitInt32, __VA_ARGS__) \
+ CHECK_TO_ALL_TYPES(__size, BitInt128, __VA_ARGS__)
+
+// The test cases below are the expansion of these macros. Use the macros to
+// regenerate the cases if they need to change.
+// CHECK_ALL_COMBINATIONS(4, 0, 1, 2, 3);
+// CHECK_ALL_COMBINATIONS(8, 0, 1, 2, 3, 4, 5, 6, 7);
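+//
+// For example, CHECK_NUM(4, double, float, 0, 1, 2, 3) expands to:
+//   constexpr vector4float from_vector4double_to_vector4float_var =
+//       __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4float);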
+
+constexpr vector4double from_vector4double_to_vector4double_var =
+ __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4double_to_vector4float_var =
+ __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4double_to_vector4long_var =
+ __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4double_to_vector4int_var =
+ __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4double_to_vector4short_var =
+ __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4double_to_vector4char_var =
+ __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4double_to_vector4BitInt8_var =
+ __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4double_to_vector4BitInt32_var =
+ __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4double_to_vector4BitInt128_var =
+ __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(
+ unsigned,
+ __builtin_shufflevector(from_vector4double_to_vector4char_var,
+ from_vector4double_to_vector4char_var,
+ 0, 1, 2, 3)) ==
+ (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+ __builtin_shufflevector(
+ from_vector4double_to_vector4short_var,
+ from_vector4double_to_vector4short_var, 0,
+ 1, 2, 3)) ==
+ (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4float_to_vector4double_var =
+ __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4float_to_vector4float_var =
+ __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4float_to_vector4long_var =
+ __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4float_to_vector4int_var =
+ __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4float_to_vector4short_var =
+ __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4float_to_vector4char_var =
+ __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4float_to_vector4BitInt8_var =
+ __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4float_to_vector4BitInt32_var =
+ __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4float_to_vector4BitInt128_var =
+ __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector4float_to_vector4char_var,
+ from_vector4float_to_vector4char_var, 0, 1,
+ 2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+ unsigned long long,
+ __builtin_shufflevector(from_vector4float_to_vector4short_var,
+ from_vector4float_to_vector4short_var,
+ 0, 1, 2, 3)) ==
+ (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4long_to_vector4double_var =
+ __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4long_to_vector4float_var =
+ __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4long_to_vector4long_var =
+ __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4long_to_vector4int_var =
+ __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4long_to_vector4short_var =
+ __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4long_to_vector4char_var =
+ __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4long_to_vector4BitInt8_var =
+ __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4long_to_vector4BitInt32_var =
+ __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4long_to_vector4BitInt128_var =
+ __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector4long_to_vector4char_var,
+ from_vector4long_to_vector4char_var, 0, 1,
+ 2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+ unsigned long long,
+ __builtin_shufflevector(from_vector4long_to_vector4short_var,
+ from_vector4long_to_vector4short_var,
+ 0, 1, 2, 3)) ==
+ (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4int_to_vector4double_var =
+ __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4int_to_vector4float_var =
+ __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4int_to_vector4long_var =
+ __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4int_to_vector4int_var =
+ __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4int_to_vector4short_var =
+ __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4int_to_vector4char_var =
+ __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4int_to_vector4BitInt8_var =
+ __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4int_to_vector4BitInt32_var =
+ __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4int_to_vector4BitInt128_var =
+ __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector4int_to_vector4char_var,
+ from_vector4int_to_vector4char_var, 0, 1,
+ 2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+ unsigned long long,
+ __builtin_shufflevector(from_vector4int_to_vector4short_var,
+ from_vector4int_to_vector4short_var,
+ 0, 1, 2, 3)) ==
+ (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4short_to_vector4double_var =
+ __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4short_to_vector4float_var =
+ __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4short_to_vector4long_var =
+ __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4short_to_vector4int_var =
+ __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4short_to_vector4short_var =
+ __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4short_to_vector4char_var =
+ __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4short_to_vector4BitInt8_var =
+ __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4short_to_vector4BitInt32_var =
+ __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4short_to_vector4BitInt128_var =
+ __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector4short_to_vector4char_var,
+ from_vector4short_to_vector4char_var, 0, 1,
+ 2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+ unsigned long long,
+ __builtin_shufflevector(from_vector4short_to_vector4short_var,
+ from_vector4short_to_vector4short_var,
+ 0, 1, 2, 3)) ==
+ (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4char_to_vector4double_var =
+ __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4char_to_vector4float_var =
+ __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4char_to_vector4long_var =
+ __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4char_to_vector4int_var =
+ __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4char_to_vector4short_var =
+ __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4char_to_vector4char_var =
+ __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4char_to_vector4BitInt8_var =
+ __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4char_to_vector4BitInt32_var =
+ __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4char_to_vector4BitInt128_var =
+ __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector4char_to_vector4char_var,
+ from_vector4char_to_vector4char_var, 0, 1,
+ 2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+ unsigned long long,
+ __builtin_shufflevector(from_vector4char_to_vector4short_var,
+ from_vector4char_to_vector4short_var,
+ 0, 1, 2, 3)) ==
+ (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4BitInt8_to_vector4double_var =
+ __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4BitInt8_to_vector4float_var =
+ __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4BitInt8_to_vector4long_var =
+ __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4BitInt8_to_vector4int_var =
+ __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4BitInt8_to_vector4short_var =
+ __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4BitInt8_to_vector4char_var =
+ __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4BitInt8_to_vector4BitInt8_var =
+ __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4BitInt8_to_vector4BitInt32_var =
+ __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4BitInt8_to_vector4BitInt128_var =
+ __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector4BitInt8_to_vector4char_var,
+ from_vector4BitInt8_to_vector4char_var, 0,
+ 1, 2, 3)) ==
+ (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+ __builtin_shufflevector(
+ from_vector4BitInt8_to_vector4short_var,
+ from_vector4BitInt8_to_vector4short_var, 0,
+ 1, 2, 3)) ==
+ (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4BitInt32_to_vector4double_var =
+ __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4BitInt32_to_vector4float_var =
+ __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4BitInt32_to_vector4long_var =
+ __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4BitInt32_to_vector4int_var =
+ __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4BitInt32_to_vector4short_var =
+ __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4BitInt32_to_vector4char_var =
+ __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4BitInt32_to_vector4BitInt8_var =
+ __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4BitInt32_to_vector4BitInt32_var =
+ __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4BitInt32_to_vector4BitInt128_var =
+ __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector4BitInt32_to_vector4char_var,
+ from_vector4BitInt32_to_vector4char_var, 0,
+ 1, 2, 3)) ==
+ (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+ __builtin_shufflevector(
+ from_vector4BitInt32_to_vector4short_var,
+ from_vector4BitInt32_to_vector4short_var,
+ 0, 1, 2, 3)) ==
+ (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4BitInt128_to_vector4double_var =
+ __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4BitInt128_to_vector4float_var =
+ __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4BitInt128_to_vector4long_var =
+ __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4BitInt128_to_vector4int_var =
+ __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4BitInt128_to_vector4short_var =
+ __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4BitInt128_to_vector4char_var =
+ __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4BitInt128_to_vector4BitInt8_var =
+ __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4BitInt128_to_vector4BitInt32_var =
+ __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4BitInt128_to_vector4BitInt128_var =
+ __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector4BitInt128_to_vector4char_var,
+ from_vector4BitInt128_to_vector4char_var,
+ 0, 1, 2, 3)) ==
+ (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+ __builtin_shufflevector(
+ from_vector4BitInt128_to_vector4short_var,
+ from_vector4BitInt128_to_vector4short_var,
+ 0, 1, 2, 3)) ==
+ (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+;
+constexpr vector8double from_vector8double_to_vector8double_var =
+ __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8double);
+constexpr vector8float from_vector8double_to_vector8float_var =
+ __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8float);
+constexpr vector8long from_vector8double_to_vector8long_var =
+ __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8long);
+constexpr vector8int from_vector8double_to_vector8int_var =
+ __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8int);
+constexpr vector8short from_vector8double_to_vector8short_var =
+ __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8short);
+constexpr vector8char from_vector8double_to_vector8char_var =
+ __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8char);
+constexpr vector8BitInt8 from_vector8double_to_vector8BitInt8_var =
+ __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt8);
+constexpr vector8BitInt32 from_vector8double_to_vector8BitInt32_var =
+ __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt32);
+constexpr vector8BitInt128 from_vector8double_to_vector8BitInt128_var =
+ __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt128);
+static_assert(__builtin_bit_cast(
+ unsigned,
+ __builtin_shufflevector(from_vector8double_to_vector8char_var,
+ from_vector8double_to_vector8char_var,
+ 0, 1, 2, 3)) ==
+ (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+ __builtin_shufflevector(
+ from_vector8double_to_vector8short_var,
+ from_vector8double_to_vector8short_var, 0,
+ 1, 2, 3)) ==
+ (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8float_to_vector8double_var =
+ __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8double);
+constexpr vector8float from_vector8float_to_vector8float_var =
+ __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8float);
+constexpr vector8long from_vector8float_to_vector8long_var =
+ __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8long);
+constexpr vector8int from_vector8float_to_vector8int_var =
+ __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7}, vector8int);
+constexpr vector8short from_vector8float_to_vector8short_var =
+ __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8short);
+constexpr vector8char from_vector8float_to_vector8char_var =
+ __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8char);
+constexpr vector8BitInt8 from_vector8float_to_vector8BitInt8_var =
+ __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt8);
+constexpr vector8BitInt32 from_vector8float_to_vector8BitInt32_var =
+ __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt32);
+constexpr vector8BitInt128 from_vector8float_to_vector8BitInt128_var =
+ __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector8float_to_vector8char_var,
+ from_vector8float_to_vector8char_var, 0, 1,
+ 2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+ unsigned long long,
+ __builtin_shufflevector(from_vector8float_to_vector8short_var,
+ from_vector8float_to_vector8short_var,
+ 0, 1, 2, 3)) ==
+ (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8long_to_vector8double_var =
+ __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8double);
+constexpr vector8float from_vector8long_to_vector8float_var =
+ __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8float);
+constexpr vector8long from_vector8long_to_vector8long_var =
+ __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7}, vector8long);
+constexpr vector8int from_vector8long_to_vector8int_var =
+ __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7}, vector8int);
+constexpr vector8short from_vector8long_to_vector8short_var =
+ __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8short);
+constexpr vector8char from_vector8long_to_vector8char_var =
+ __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7}, vector8char);
+constexpr vector8BitInt8 from_vector8long_to_vector8BitInt8_var =
+ __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt8);
+constexpr vector8BitInt32 from_vector8long_to_vector8BitInt32_var =
+ __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt32);
+constexpr vector8BitInt128 from_vector8long_to_vector8BitInt128_var =
+ __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector8long_to_vector8char_var,
+ from_vector8long_to_vector8char_var, 0, 1,
+ 2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+ unsigned long long,
+ __builtin_shufflevector(from_vector8long_to_vector8short_var,
+ from_vector8long_to_vector8short_var,
+ 0, 1, 2, 3)) ==
+ (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8int_to_vector8double_var =
+ __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8double);
+constexpr vector8float from_vector8int_to_vector8float_var =
+ __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8float);
+constexpr vector8long from_vector8int_to_vector8long_var =
+ __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8long);
+constexpr vector8int from_vector8int_to_vector8int_var =
+ __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8int);
+constexpr vector8short from_vector8int_to_vector8short_var =
+ __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8short);
+constexpr vector8char from_vector8int_to_vector8char_var =
+ __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8char);
+constexpr vector8BitInt8 from_vector8int_to_vector8BitInt8_var =
+ __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt8);
+constexpr vector8BitInt32 from_vector8int_to_vector8BitInt32_var =
+ __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt32);
+constexpr vector8BitInt128 from_vector8int_to_vector8BitInt128_var =
+ __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector8int_to_vector8char_var,
+ from_vector8int_to_vector8char_var, 0, 1,
+ 2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+ unsigned long long,
+ __builtin_shufflevector(from_vector8int_to_vector8short_var,
+ from_vector8int_to_vector8short_var,
+ 0, 1, 2, 3)) ==
+ (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8short_to_vector8double_var =
+ __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8double);
+constexpr vector8float from_vector8short_to_vector8float_var =
+ __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8float);
+constexpr vector8long from_vector8short_to_vector8long_var =
+ __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8long);
+constexpr vector8int from_vector8short_to_vector8int_var =
+ __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7}, vector8int);
+constexpr vector8short from_vector8short_to_vector8short_var =
+ __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8short);
+constexpr vector8char from_vector8short_to_vector8char_var =
+ __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8char);
+constexpr vector8BitInt8 from_vector8short_to_vector8BitInt8_var =
+ __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt8);
+constexpr vector8BitInt32 from_vector8short_to_vector8BitInt32_var =
+ __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt32);
+constexpr vector8BitInt128 from_vector8short_to_vector8BitInt128_var =
+ __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector8short_to_vector8char_var,
+ from_vector8short_to_vector8char_var, 0, 1,
+ 2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+ unsigned long long,
+ __builtin_shufflevector(from_vector8short_to_vector8short_var,
+ from_vector8short_to_vector8short_var,
+ 0, 1, 2, 3)) ==
+ (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8char_to_vector8double_var =
+ __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8double);
+constexpr vector8float from_vector8char_to_vector8float_var =
+ __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8float);
+constexpr vector8long from_vector8char_to_vector8long_var =
+ __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7}, vector8long);
+constexpr vector8int from_vector8char_to_vector8int_var =
+ __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7}, vector8int);
+constexpr vector8short from_vector8char_to_vector8short_var =
+ __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8short);
+constexpr vector8char from_vector8char_to_vector8char_var =
+ __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7}, vector8char);
+constexpr vector8BitInt8 from_vector8char_to_vector8BitInt8_var =
+ __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt8);
+constexpr vector8BitInt32 from_vector8char_to_vector8BitInt32_var =
+ __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt32);
+constexpr vector8BitInt128 from_vector8char_to_vector8BitInt128_var =
+ __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector8char_to_vector8char_var,
+ from_vector8char_to_vector8char_var, 0, 1,
+ 2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+ unsigned long long,
+ __builtin_shufflevector(from_vector8char_to_vector8short_var,
+ from_vector8char_to_vector8short_var,
+ 0, 1, 2, 3)) ==
+ (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8BitInt8_to_vector8double_var =
+ __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8double);
+constexpr vector8float from_vector8BitInt8_to_vector8float_var =
+ __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8float);
+constexpr vector8long from_vector8BitInt8_to_vector8long_var =
+ __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8long);
+constexpr vector8int from_vector8BitInt8_to_vector8int_var =
+ __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8int);
+constexpr vector8short from_vector8BitInt8_to_vector8short_var =
+ __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8short);
+constexpr vector8char from_vector8BitInt8_to_vector8char_var =
+ __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8char);
+constexpr vector8BitInt8 from_vector8BitInt8_to_vector8BitInt8_var =
+ __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt8);
+constexpr vector8BitInt32 from_vector8BitInt8_to_vector8BitInt32_var =
+ __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt32);
+constexpr vector8BitInt128 from_vector8BitInt8_to_vector8BitInt128_var =
+ __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector8BitInt8_to_vector8char_var,
+ from_vector8BitInt8_to_vector8char_var, 0,
+ 1, 2, 3)) ==
+ (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+ __builtin_shufflevector(
+ from_vector8BitInt8_to_vector8short_var,
+ from_vector8BitInt8_to_vector8short_var, 0,
+ 1, 2, 3)) ==
+ (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8BitInt32_to_vector8double_var =
+ __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8double);
+constexpr vector8float from_vector8BitInt32_to_vector8float_var =
+ __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8float);
+constexpr vector8long from_vector8BitInt32_to_vector8long_var =
+ __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8long);
+constexpr vector8int from_vector8BitInt32_to_vector8int_var =
+ __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8int);
+constexpr vector8short from_vector8BitInt32_to_vector8short_var =
+ __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8short);
+constexpr vector8char from_vector8BitInt32_to_vector8char_var =
+ __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8char);
+constexpr vector8BitInt8 from_vector8BitInt32_to_vector8BitInt8_var =
+ __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt8);
+constexpr vector8BitInt32 from_vector8BitInt32_to_vector8BitInt32_var =
+ __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt32);
+constexpr vector8BitInt128 from_vector8BitInt32_to_vector8BitInt128_var =
+ __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector8BitInt32_to_vector8char_var,
+ from_vector8BitInt32_to_vector8char_var, 0,
+ 1, 2, 3)) ==
+ (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+ __builtin_shufflevector(
+ from_vector8BitInt32_to_vector8short_var,
+ from_vector8BitInt32_to_vector8short_var,
+ 0, 1, 2, 3)) ==
+ (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8BitInt128_to_vector8double_var =
+ __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8double);
+constexpr vector8float from_vector8BitInt128_to_vector8float_var =
+ __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8float);
+constexpr vector8long from_vector8BitInt128_to_vector8long_var =
+ __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8long);
+constexpr vector8int from_vector8BitInt128_to_vector8int_var =
+ __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8int);
+constexpr vector8short from_vector8BitInt128_to_vector8short_var =
+ __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8short);
+constexpr vector8char from_vector8BitInt128_to_vector8char_var =
+ __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8char);
+constexpr vector8BitInt8 from_vector8BitInt128_to_vector8BitInt8_var =
+ __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt8);
+constexpr vector8BitInt32 from_vector8BitInt128_to_vector8BitInt32_var =
+ __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt32);
+constexpr vector8BitInt128 from_vector8BitInt128_to_vector8BitInt128_var =
+ __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector8BitInt128_to_vector8char_var,
+ from_vector8BitInt128_to_vector8char_var,
+ 0, 1, 2, 3)) ==
+ (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+ __builtin_shufflevector(
+ from_vector8BitInt128_to_vector8short_var,
+ from_vector8BitInt128_to_vector8short_var,
+ 0, 1, 2, 3)) ==
+ (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+;
+#undef CHECK_ALL_COMBINATIONS
+#undef CHECK_TO_ALL_TYPES
+#undef CHECK_NUM
+
+// Shuffle vector
+constexpr vector4char vector4charConst1 = {0, 1, 2, 3};
+constexpr vector4char vector4charConst2 = {4, 5, 6, 7};
+constexpr vector8char vector8charConst = {8, 9, 10, 11, 12, 13, 14, 15};
+
+constexpr vector4char vectorShuffle1 =
+ __builtin_shufflevector(vector4charConst1, vector4charConst2, 0, 1, 2, 3);
+static_assert(__builtin_bit_cast(unsigned, vectorShuffle1) ==
+ (LITTLE_END ? 0x03020100 : 0x00010203));
+constexpr vector4char vectorShuffle2 =
+ __builtin_shufflevector(vector4charConst1, vector4charConst2, 4, 5, 6, 7);
+static_assert(__builtin_bit_cast(unsigned, vectorShuffle2) ==
+ (LITTLE_END ? 0x07060504 : 0x04050607));
+constexpr vector4char vectorShuffle3 =
+ __builtin_shufflevector(vector4charConst1, vector4charConst2, 0, 2, 4, 6);
+static_assert(__builtin_bit_cast(unsigned, vectorShuffle3) ==
+ (LITTLE_END ? 0x06040200 : 0x00020406));
+constexpr vector8char vectorShuffle4 = __builtin_shufflevector(
+    vector8charConst, vector8charConst, 0, 2, 4, 6, 8, 10, 12, 14);
+static_assert(__builtin_bit_cast(unsigned long long, vectorShuffle4) ==
+ (LITTLE_END ? 0x0E0C0A080E0C0A08 : 0x080A0C0E080A0C0E));
+constexpr vector4char vectorShuffle5 =
+    __builtin_shufflevector(vector8charConst, vector8charConst, 0, 2, 4, 6);
+static_assert(__builtin_bit_cast(unsigned, vectorShuffle5) ==
+ (LITTLE_END ? 0x0E0C0A08 : 0x080A0C0E));
+constexpr vector8char vectorShuffle6 = __builtin_shufflevector(
+ vector4charConst1, vector4charConst2, 0, 2, 4, 6, 1, 3, 5, 7);
+static_assert(__builtin_bit_cast(unsigned long long, vectorShuffle6) ==
+ (LITTLE_END ? 0x0705030106040200 : 0x0002040601030507));
+
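+// An index of -1 is not permitted in a constexpr context: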
+constexpr vector4char
+ vectorShuffleFail1 = // expected-error {{constexpr variable 'vectorShuffleFail1'\
+ must be initialized by a constant expression}}
+ __builtin_shufflevector( // expected-error {{index for __builtin_shufflevector \
+not within the bounds of the input vectors; index of -1 found at position 0 not \
+permitted in a constexpr context.}}
+ vector4charConst1,
+ vector4charConst2, -1, -1, -1, -1);
diff --git a/clang/test/Sema/convertvector.c b/clang/test/Sema/convertvector.c
index 8ae43c3ba3d4..1ff04af90981 100644
--- a/clang/test/Sema/convertvector.c
+++ b/clang/test/Sema/convertvector.c
@@ -15,3 +15,6 @@ vector8float foo3(double x) {
return __builtin_convertvector(x, vector8float); // expected-error {{must be a vector}}
}
+float foo4(float x) {
+ return __builtin_convertvector(x, float); // expected-error {{first argument to __builtin_convertvector must be a vector}}
+}
diff --git a/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp b/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp
index 2f067ea53a50..90404f115c75 100644
--- a/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp
+++ b/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp
@@ -12,14 +12,19 @@ namespace std {
size_t n;
initializer_list();
};
- // FIXME: This should probably not be necessary.
- template<typename T> initializer_list(initializer_list<T>) -> initializer_list<T>;
}
template<typename T> constexpr bool has_type(...) { return false; }
template<typename T> constexpr bool has_type(T&) { return true; }
-std::initializer_list il = {1, 2, 3, 4, 5};
+std::initializer_list il1 = {1, 2, 3, 4, 5};
+auto il2 = std::initializer_list{1, 2, 3, 4};
+auto il3 = std::initializer_list{il1};
+auto il4 = std::initializer_list{il1, il1, il1};
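+// A single initializer_list argument (il3) deduces a copy of the same type;
+// several such arguments (il4) deduce a nested initializer_list.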
+static_assert(has_type<std::initializer_list<int>>(il1));
+static_assert(has_type<std::initializer_list<int>>(il2));
+static_assert(has_type<std::initializer_list<int>>(il3));
+static_assert(has_type<std::initializer_list<std::initializer_list<int>>>(il4));
template<typename T> struct vector {
template<typename Iter> vector(Iter, Iter);
diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp
index dee4a29bd2bf..01991887b284 100644
--- a/clang/test/SemaCXX/type-traits.cpp
+++ b/clang/test/SemaCXX/type-traits.cpp
@@ -2509,6 +2509,20 @@ void is_convertible()
static_assert(__is_convertible(FloatWrapper, IntWrapper));
static_assert(__is_convertible(FloatWrapper, float));
static_assert(__is_convertible(float, FloatWrapper));
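+  // The same conversions with reference (including rvalue reference) targets: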
+ static_assert(__is_convertible(IntWrapper, IntWrapper&&));
+ static_assert(__is_convertible(IntWrapper, const IntWrapper&));
+ static_assert(__is_convertible(IntWrapper, int&&));
+ static_assert(__is_convertible(IntWrapper, const int&));
+ static_assert(__is_convertible(int, IntWrapper&&));
+ static_assert(__is_convertible(int, const IntWrapper&));
+ static_assert(__is_convertible(IntWrapper, FloatWrapper&&));
+ static_assert(__is_convertible(IntWrapper, const FloatWrapper&));
+ static_assert(__is_convertible(FloatWrapper, IntWrapper&&));
+ static_assert(__is_convertible(FloatWrapper, const IntWrapper&&));
+ static_assert(__is_convertible(FloatWrapper, float&&));
+ static_assert(__is_convertible(FloatWrapper, const float&));
+ static_assert(__is_convertible(float, FloatWrapper&&));
+ static_assert(__is_convertible(float, const FloatWrapper&));
}
void is_nothrow_convertible()
@@ -2521,6 +2535,20 @@ void is_nothrow_convertible()
static_assert(!__is_nothrow_convertible(FloatWrapper, IntWrapper));
static_assert(!__is_nothrow_convertible(FloatWrapper, float));
static_assert(__is_nothrow_convertible(float, FloatWrapper));
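+  // With reference targets, the result is nothrow only when the required
+  // conversion itself is non-throwing: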
+ static_assert(__is_nothrow_convertible(IntWrapper, IntWrapper&&));
+ static_assert(__is_nothrow_convertible(IntWrapper, const IntWrapper&));
+ static_assert(__is_nothrow_convertible(IntWrapper, int&&));
+ static_assert(__is_nothrow_convertible(IntWrapper, const int&));
+ static_assert(!__is_nothrow_convertible(int, IntWrapper&&));
+ static_assert(!__is_nothrow_convertible(int, const IntWrapper&));
+ static_assert(!__is_nothrow_convertible(IntWrapper, FloatWrapper&&));
+ static_assert(!__is_nothrow_convertible(IntWrapper, const FloatWrapper&));
+ static_assert(!__is_nothrow_convertible(FloatWrapper, IntWrapper&&));
+ static_assert(!__is_nothrow_convertible(FloatWrapper, const IntWrapper&));
+ static_assert(!__is_nothrow_convertible(FloatWrapper, float&&));
+ static_assert(!__is_nothrow_convertible(FloatWrapper, const float&));
+ static_assert(__is_nothrow_convertible(float, FloatWrapper&&));
+ static_assert(__is_nothrow_convertible(float, const FloatWrapper&));
}
struct FromInt { FromInt(int); };
diff --git a/clang/test/SemaObjC/format-strings-oslog.m b/clang/test/SemaObjC/format-strings-oslog.m
index 20fec93b653b..af5aef3d6179 100644
--- a/clang/test/SemaObjC/format-strings-oslog.m
+++ b/clang/test/SemaObjC/format-strings-oslog.m
@@ -44,15 +44,18 @@ void test_os_log_format(const char *pc, int i, void *p, void *buf) {
}
// Test os_log_format primitive with ObjC string literal format argument.
-void test_objc(const char *pc, int i, void *p, void *buf, NSString *nss) {
+void test_objc(const char *pc, int i, void *p, void *buf, NSString *nss, id obj) {
__builtin_os_log_format(buf, @"");
__builtin_os_log_format(buf, @"%d"); // expected-warning {{more '%' conversions than data arguments}}
__builtin_os_log_format(buf, @"%d", i);
+
__builtin_os_log_format(buf, @"%P", p); // expected-warning {{using '%P' format specifier without precision}}
__builtin_os_log_format(buf, @"%.10P", p);
__builtin_os_log_format(buf, @"%.*P", p); // expected-warning {{field precision should have type 'int', but argument has type 'void *'}}
__builtin_os_log_format(buf, @"%.*P", i, p);
__builtin_os_log_format(buf, @"%.*P", i, i); // expected-warning {{format specifies type 'void *' but the argument has type 'int'}}
+ __builtin_os_log_format(buf, @"%.8P", nss); // expected-warning {{using '%P' format specifier with an Objective-C pointer results in dumping runtime object structure, not object value}}
+ __builtin_os_log_format(buf, @"%.*P", i, obj); // expected-warning {{using '%P' format specifier with an Objective-C pointer results in dumping runtime object structure, not object value}}
__builtin_os_log_format(buf, @"%{private}s", pc);
__builtin_os_log_format(buf, @"%@", nss);
diff --git a/clang/test/SemaOpenCL/vec_step.cl b/clang/test/SemaOpenCL/vec_step.cl
index afb6dc94d92e..c116f09b351f 100644
--- a/clang/test/SemaOpenCL/vec_step.cl
+++ b/clang/test/SemaOpenCL/vec_step.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -pedantic -verify %s
+// RUN: %clang_cc1 -fsyntax-only -pedantic -verify %s -fexperimental-new-constant-interpreter
typedef int int2 __attribute__((ext_vector_type(2)));
typedef int int3 __attribute__((ext_vector_type(3)));
diff --git a/clang/test/Unit/lit.cfg.py b/clang/test/Unit/lit.cfg.py
index 475069e630d7..37e91d0f8629 100644
--- a/clang/test/Unit/lit.cfg.py
+++ b/clang/test/Unit/lit.cfg.py
@@ -25,13 +25,9 @@ config.test_format = lit.formats.GoogleTest(config.llvm_build_mode, "Tests")
# Propagate the temp directory. Windows requires this because it uses \Windows\
# if none of these are present.
-if "TMP" in os.environ:
- config.environment["TMP"] = os.environ["TMP"]
-if "TEMP" in os.environ:
- config.environment["TEMP"] = os.environ["TEMP"]
-
-if "HOME" in os.environ:
- config.environment["HOME"] = os.environ["HOME"]
+for v in ["TMP", "TEMP", "HOME", "SystemDrive"]:
+ if v in os.environ:
+ config.environment[v] = os.environ[v]
# Propagate sanitizer options.
for var in [
diff --git a/clang/utils/creduce-clang-crash.py b/clang/utils/creduce-clang-crash.py
index 27361bb88505..4d0c8224d8b4 100755
--- a/clang/utils/creduce-clang-crash.py
+++ b/clang/utils/creduce-clang-crash.py
@@ -15,7 +15,6 @@ import shutil
import stat
import sys
import subprocess
-import pipes
import shlex
import tempfile
import shutil
@@ -61,7 +60,7 @@ def check_cmd(cmd_name, cmd_dir, cmd_path=None):
def quote_cmd(cmd):
- return " ".join(pipes.quote(arg) for arg in cmd)
+ return " ".join(shlex.quote(arg) for arg in cmd)
def write_to_script(text, filename):
@@ -220,7 +219,7 @@ fi
)
for msg in self.expected_output:
- output += "grep -F %s t.log || exit 1\n" % pipes.quote(msg)
+ output += "grep -F %s t.log || exit 1\n" % shlex.quote(msg)
write_to_script(output, self.testfile)
self.check_interestingness()
@@ -318,9 +317,17 @@ fi
interestingness test takes to run.
"""
print("\nSimplifying the clang command...")
+ new_args = self.clang_args
+
+ # Remove the color diagnostics flag to make it easier to match error
+ # text.
+ new_args = self.try_remove_args(
+ new_args,
+ msg="Removed -fcolor-diagnostics",
+ opts_equal=["-fcolor-diagnostics"],
+ )
# Remove some clang arguments to speed up the interestingness test
- new_args = self.clang_args
new_args = self.try_remove_args(
new_args,
msg="Removed debug info options",
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index ea8872c91be6..875521bd505d 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -1435,7 +1435,7 @@ accessible?</td>
</tr>
<tr class="open" id="233">
<td><a href="https://cplusplus.github.io/CWG/issues/233.html">233</a></td>
- <td>drafting</td>
+ <td>tentatively ready</td>
<td>References vs pointers in UDC overload resolution</td>
<td align="center">Not resolved</td>
</tr>
@@ -2756,7 +2756,7 @@ of class templates</td>
</tr>
<tr id="453">
<td><a href="https://cplusplus.github.io/CWG/issues/453.html">453</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>References may only bind to &#8220;valid&#8221; objects</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -5812,7 +5812,7 @@ and <I>POD class</I></td>
</tr>
<tr class="open" id="1001">
<td><a href="https://cplusplus.github.io/CWG/issues/1001.html">1001</a></td>
- <td>drafting</td>
+ <td>review</td>
<td>Parameter type adjustment in dependent parameter types</td>
<td align="center">Not resolved</td>
</tr>
@@ -6034,7 +6034,7 @@ and <I>POD class</I></td>
</tr>
<tr id="1038">
<td><a href="https://cplusplus.github.io/CWG/issues/1038.html">1038</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Overload resolution of <TT>&amp;x.static_func</TT></td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -9994,7 +9994,7 @@ and <I>POD class</I></td>
</tr>
<tr id="1698">
<td><a href="https://cplusplus.github.io/CWG/issues/1698.html">1698</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Files ending in <TT>\</TT></td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -10132,7 +10132,7 @@ and <I>POD class</I></td>
</tr>
<tr class="open" id="1721">
<td><a href="https://cplusplus.github.io/CWG/issues/1721.html">1721</a></td>
- <td>drafting</td>
+ <td>review</td>
<td>Diagnosing ODR violations for static data members</td>
<td align="center">Not resolved</td>
</tr>
@@ -11312,11 +11312,11 @@ and <I>POD class</I></td>
<td>decltype-qualified enumeration names</td>
<td class="unknown" align="center">Unknown</td>
</tr>
- <tr class="open" id="1918">
+ <tr id="1918">
<td><a href="https://cplusplus.github.io/CWG/issues/1918.html">1918</a></td>
- <td>open</td>
+ <td>CD5</td>
<td><TT>friend</TT> templates with dependent scopes</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr class="open" id="1919">
<td><a href="https://cplusplus.github.io/CWG/issues/1919.html">1919</a></td>
@@ -11474,11 +11474,11 @@ and <I>POD class</I></td>
<td>New C incompatibilities</td>
<td align="center">Not resolved</td>
</tr>
- <tr class="open" id="1945">
+ <tr id="1945">
<td><a href="https://cplusplus.github.io/CWG/issues/1945.html">1945</a></td>
- <td>open</td>
+ <td>CD5</td>
<td>Friend declarations naming members of class templates in non-templates</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr id="1946">
<td><a href="https://cplusplus.github.io/CWG/issues/1946.html">1946</a></td>
@@ -11530,7 +11530,7 @@ and <I>POD class</I></td>
</tr>
<tr id="1954">
<td><a href="https://cplusplus.github.io/CWG/issues/1954.html">1954</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td><TT>typeid</TT> null dereference check in subexpressions</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -12098,11 +12098,11 @@ and <I>POD class</I></td>
<td>C-style casts that cast away constness vs <TT>static_cast</TT></td>
<td align="center">Not resolved</td>
</tr>
- <tr class="open" id="2049">
+ <tr id="2049">
<td><a href="https://cplusplus.github.io/CWG/issues/2049.html">2049</a></td>
- <td>drafting</td>
+ <td>DRWP</td>
<td>List initializer in non-type template default argument</td>
- <td title="Clang 18 implements P2308R1 resolution" align="center">Not Resolved*</td>
+ <td class="full" align="center">Clang 18</td>
</tr>
<tr id="2050">
<td><a href="https://cplusplus.github.io/CWG/issues/2050.html">2050</a></td>
@@ -12130,7 +12130,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2054">
<td><a href="https://cplusplus.github.io/CWG/issues/2054.html">2054</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Missing description of class SFINAE</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -12418,7 +12418,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2102">
<td><a href="https://cplusplus.github.io/CWG/issues/2102.html">2102</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Constructor checking in <I>new-expression</I></td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -12698,11 +12698,11 @@ and <I>POD class</I></td>
<td>Thread storage duration and order of initialization</td>
<td align="center">Not resolved</td>
</tr>
- <tr class="open" id="2149">
+ <tr id="2149">
<td><a href="https://cplusplus.github.io/CWG/issues/2149.html">2149</a></td>
- <td>drafting</td>
+ <td>DR</td>
<td>Brace elision and array length deduction</td>
- <td title="Clang 3.1 implements 2024-04 resolution" align="center">Not Resolved*</td>
+ <td class="full" align="center">Clang 3.1</td>
</tr>
<tr id="2150">
<td><a href="https://cplusplus.github.io/CWG/issues/2150.html">2150</a></td>
@@ -13318,7 +13318,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2252">
<td><a href="https://cplusplus.github.io/CWG/issues/2252.html">2252</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Enumeration list-initialization from the same type</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -14410,7 +14410,7 @@ and <I>POD class</I></td>
</tr>
<tr class="open" id="2434">
<td><a href="https://cplusplus.github.io/CWG/issues/2434.html">2434</a></td>
- <td>open</td>
+ <td>review</td>
<td>Mandatory copy elision vs non-class objects</td>
<td align="center">Not resolved</td>
</tr>
@@ -14504,11 +14504,11 @@ and <I>POD class</I></td>
<td>Thunks as an implementation technique for pointers to virtual functions</td>
<td align="center">Extension</td>
</tr>
- <tr class="open" id="2450">
+ <tr id="2450">
<td><a href="https://cplusplus.github.io/CWG/issues/2450.html">2450</a></td>
- <td>review</td>
+ <td>DRWP</td>
<td><I>braced-init-list</I> as a <I>template-argument</I></td>
- <td title="Clang 18 implements P2308R1 resolution" align="center">Not Resolved*</td>
+ <td class="full" align="center">Clang 18</td>
</tr>
<tr id="2451">
<td><a href="https://cplusplus.github.io/CWG/issues/2451.html">2451</a></td>
@@ -14558,11 +14558,11 @@ and <I>POD class</I></td>
<td>Value category of expressions denoting non-static member functions</td>
<td class="unknown" align="center">Unknown</td>
</tr>
- <tr class="open" id="2459">
+ <tr id="2459">
<td><a href="https://cplusplus.github.io/CWG/issues/2459.html">2459</a></td>
- <td>drafting</td>
+ <td>DRWP</td>
<td>Template parameter initialization</td>
- <td title="Clang 18 implements P2308R1 resolution" align="center">Not Resolved*</td>
+ <td class="full" align="center">Clang 18</td>
</tr>
<tr id="2460">
<td><a href="https://cplusplus.github.io/CWG/issues/2460.html">2460</a></td>
@@ -14662,7 +14662,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2476">
<td><a href="https://cplusplus.github.io/CWG/issues/2476.html">2476</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td><I>placeholder-type-specifier</I>s and function declarators</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -14830,7 +14830,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2504">
<td><a href="https://cplusplus.github.io/CWG/issues/2504.html">2504</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Inheriting constructors from virtual base classes</td>
<td class="none" align="center">No</td>
</tr>
@@ -14992,7 +14992,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2531">
<td><a href="https://cplusplus.github.io/CWG/issues/2531.html">2531</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Static data members redeclared as constexpr</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15002,11 +15002,11 @@ and <I>POD class</I></td>
<td>Kind of pointer value returned by <TT>new T[0]</TT></td>
<td align="center">Not resolved</td>
</tr>
- <tr class="open" id="2533">
+ <tr id="2533">
<td><a href="https://cplusplus.github.io/CWG/issues/2533.html">2533</a></td>
- <td>review</td>
+ <td>DR</td>
<td>Storage duration of implicitly created objects</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2534">
<td><a href="https://cplusplus.github.io/CWG/issues/2534.html">2534</a></td>
@@ -15082,13 +15082,13 @@ and <I>POD class</I></td>
</tr>
<tr id="2546">
<td><a href="https://cplusplus.github.io/CWG/issues/2546.html">2546</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Defaulted secondary comparison operators defined as deleted</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2547">
<td><a href="https://cplusplus.github.io/CWG/issues/2547.html">2547</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Defaulted comparison operator function for non-classes</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15142,7 +15142,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2556">
<td><a href="https://cplusplus.github.io/CWG/issues/2556.html">2556</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Unusable <TT>promise::return_void</TT></td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15166,15 +15166,15 @@ and <I>POD class</I></td>
</tr>
<tr id="2560">
<td><a href="https://cplusplus.github.io/CWG/issues/2560.html">2560</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Parameter type determination in a <I>requirement-parameter-list</I></td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr class="open" id="2561">
<td><a href="https://cplusplus.github.io/CWG/issues/2561.html">2561</a></td>
- <td>review</td>
+ <td>tentatively ready</td>
<td>Conversion to function pointer for lambda with explicit object parameter</td>
- <td title="Clang 18 implements 2023-11-09 resolution" align="center">Not Resolved*</td>
+ <td title="Clang does not implement 2024-03-18 resolution" align="center">Not Resolved*</td>
</tr>
<tr class="open" id="2562">
<td><a href="https://cplusplus.github.io/CWG/issues/2562.html">2562</a></td>
@@ -15214,7 +15214,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2568">
<td><a href="https://cplusplus.github.io/CWG/issues/2568.html">2568</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Access checking during synthesis of defaulted comparison operator</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15226,7 +15226,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2570">
<td><a href="https://cplusplus.github.io/CWG/issues/2570.html">2570</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Clarify constexpr for defaulted functions</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15334,7 +15334,7 @@ and <I>POD class</I></td>
</tr>
<tr class="open" id="2588">
<td><a href="https://cplusplus.github.io/CWG/issues/2588.html">2588</a></td>
- <td>drafting</td>
+ <td>tentatively ready</td>
<td>friend declarations and module linkage</td>
<td align="center">Not resolved</td>
</tr>
@@ -15352,7 +15352,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2591">
<td><a href="https://cplusplus.github.io/CWG/issues/2591.html">2591</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Implicit change of active union member for anonymous union in union</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15376,7 +15376,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2595">
<td><a href="https://cplusplus.github.io/CWG/issues/2595.html">2595</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>"More constrained" for eligible special member functions</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15406,7 +15406,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2600">
<td><a href="https://cplusplus.github.io/CWG/issues/2600.html">2600</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Type dependency of placeholder types</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15574,7 +15574,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2628">
<td><a href="https://cplusplus.github.io/CWG/issues/2628.html">2628</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Implicit deduction guides should propagate constraints</td>
<td class="none" align="center">No</td>
</tr>
@@ -15610,7 +15610,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2634">
<td><a href="https://cplusplus.github.io/CWG/issues/2634.html">2634</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Avoid circularity in specification of scope for friend class declarations</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15628,13 +15628,13 @@ and <I>POD class</I></td>
</tr>
<tr id="2637">
<td><a href="https://cplusplus.github.io/CWG/issues/2637.html">2637</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Injected-class-name as a <I>simple-template-id</I></td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2638">
<td><a href="https://cplusplus.github.io/CWG/issues/2638.html">2638</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Improve the example for initializing by initializer list</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15748,7 +15748,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2657">
<td><a href="https://cplusplus.github.io/CWG/issues/2657.html">2657</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Cv-qualification adjustment when binding reference to temporary</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15770,11 +15770,11 @@ and <I>POD class</I></td>
<td>Confusing term "this parameter"</td>
<td align="center">Not resolved</td>
</tr>
- <tr class="open" id="2661">
+ <tr id="2661">
<td><a href="https://cplusplus.github.io/CWG/issues/2661.html">2661</a></td>
- <td>open</td>
+ <td>DR</td>
<td>Missing disambiguation rule for <I>pure-specifier</I> vs. <I>brace-or-equal-initializer</I></td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2662">
<td><a href="https://cplusplus.github.io/CWG/issues/2662.html">2662</a></td>
@@ -15814,7 +15814,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2668">
<td><a href="https://cplusplus.github.io/CWG/issues/2668.html">2668</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td><TT>co_await</TT> in a <I>lambda-expression</I></td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15838,7 +15838,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2672">
<td><a href="https://cplusplus.github.io/CWG/issues/2672.html">2672</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Lambda body SFINAE is still required, contrary to intent and note</td>
<td class="full" align="center">Clang 18</td>
</tr>
@@ -15940,7 +15940,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2689">
<td><a href="https://cplusplus.github.io/CWG/issues/2689.html">2689</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Are cv-qualified <TT>std::nullptr_t</TT> fundamental types?</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16004,11 +16004,11 @@ and <I>POD class</I></td>
<td>Inconsistency of <I>throw-expression</I> specification</td>
<td class="unknown" align="center">Unknown</td>
</tr>
- <tr class="open" id="2700">
+ <tr id="2700">
<td><a href="https://cplusplus.github.io/CWG/issues/2700.html">2700</a></td>
- <td>review</td>
+ <td>DR</td>
<td><TT>#error</TT> disallows existing implementation practice</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr class="open" id="2701">
<td><a href="https://cplusplus.github.io/CWG/issues/2701.html">2701</a></td>
@@ -16048,7 +16048,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2707">
<td><a href="https://cplusplus.github.io/CWG/issues/2707.html">2707</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Deduction guides cannot have a trailing <I>requires-clause</I></td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16088,11 +16088,11 @@ and <I>POD class</I></td>
<td>Initialization of reference-to-aggregate from designated initializer list</td>
<td class="unknown" align="center">Unknown</td>
</tr>
- <tr class="open" id="2714">
+ <tr id="2714">
<td><a href="https://cplusplus.github.io/CWG/issues/2714.html">2714</a></td>
- <td>open</td>
+ <td>DR</td>
<td>Implicit deduction guides omit properties from the parameter-declaration-clause of a constructor</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2715">
<td><a href="https://cplusplus.github.io/CWG/issues/2715.html">2715</a></td>
@@ -16156,7 +16156,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2725">
<td><a href="https://cplusplus.github.io/CWG/issues/2725.html">2725</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Overload resolution for non-call of class member access</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16174,7 +16174,7 @@ and <I>POD class</I></td>
</tr>
<tr class="open" id="2728">
<td><a href="https://cplusplus.github.io/CWG/issues/2728.html">2728</a></td>
- <td>open</td>
+ <td>tentatively ready</td>
<td>Evaluation of conversions in a <I>delete-expression</I></td>
<td align="center">Not resolved</td>
</tr>
@@ -16204,7 +16204,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2733">
<td><a href="https://cplusplus.github.io/CWG/issues/2733.html">2733</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Applying <TT>[[maybe_unused]]</TT> to a label</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16228,7 +16228,7 @@ and <I>POD class</I></td>
</tr>
<tr class="open" id="2737">
<td><a href="https://cplusplus.github.io/CWG/issues/2737.html">2737</a></td>
- <td>open</td>
+ <td>review</td>
<td>Temporary lifetime extension for reference init-captures</td>
<td align="center">Not resolved</td>
</tr>
@@ -16258,7 +16258,7 @@ and <I>POD class</I></td>
</tr>
<tr class="open" id="2742">
<td><a href="https://cplusplus.github.io/CWG/issues/2742.html">2742</a></td>
- <td>open</td>
+ <td>drafting</td>
<td>Guaranteed copy elision for brace-initialization from prvalue</td>
<td align="center">Not resolved</td>
</tr>
@@ -16274,33 +16274,33 @@ and <I>POD class</I></td>
<td>Multiple objects of the same type at the same address</td>
<td align="center">Not resolved</td>
</tr>
- <tr class="open" id="2745">
+ <tr id="2745">
<td><a href="https://cplusplus.github.io/CWG/issues/2745.html">2745</a></td>
- <td>open</td>
+ <td>DR</td>
<td>Dependent odr-use in generic lambdas</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
- <tr class="open" id="2746">
+ <tr id="2746">
<td><a href="https://cplusplus.github.io/CWG/issues/2746.html">2746</a></td>
- <td>open</td>
+ <td>DR</td>
<td>Checking of default template arguments</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2747">
<td><a href="https://cplusplus.github.io/CWG/issues/2747.html">2747</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Cannot depend on an already-deleted splice</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2748">
<td><a href="https://cplusplus.github.io/CWG/issues/2748.html">2748</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Accessing static data members via null pointer</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2749">
<td><a href="https://cplusplus.github.io/CWG/issues/2749.html">2749</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Treatment of "pointer to void" for relational comparisons</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16324,19 +16324,19 @@ and <I>POD class</I></td>
</tr>
<tr id="2753">
<td><a href="https://cplusplus.github.io/CWG/issues/2753.html">2753</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Storage reuse for string literal objects and backing arrays</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2754">
<td><a href="https://cplusplus.github.io/CWG/issues/2754.html">2754</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Using *this in explicit object member functions that are coroutines</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2755">
<td><a href="https://cplusplus.github.io/CWG/issues/2755.html">2755</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Incorrect wording applied by P2738R1</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16354,43 +16354,43 @@ and <I>POD class</I></td>
</tr>
<tr id="2758">
<td><a href="https://cplusplus.github.io/CWG/issues/2758.html">2758</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>What is "access and ambiguity control"?</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2759">
<td><a href="https://cplusplus.github.io/CWG/issues/2759.html">2759</a></td>
- <td>DR</td>
+ <td>DRWP</td>
    <td>[[no_unique_address]] and common initial sequence</td>
<td class="unreleased" align="center">Clang 19</td>
</tr>
<tr id="2760">
<td><a href="https://cplusplus.github.io/CWG/issues/2760.html">2760</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Defaulted constructor that is an immediate function</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2761">
<td><a href="https://cplusplus.github.io/CWG/issues/2761.html">2761</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Implicitly invoking the deleted destructor of an anonymous union member</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2762">
<td><a href="https://cplusplus.github.io/CWG/issues/2762.html">2762</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Type of implicit object parameter</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2763">
<td><a href="https://cplusplus.github.io/CWG/issues/2763.html">2763</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Ignorability of [[noreturn]] during constant evaluation</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2764">
<td><a href="https://cplusplus.github.io/CWG/issues/2764.html">2764</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Use of placeholders affecting name mangling</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16403,7 +16403,8 @@ and <I>POD class</I></td>
<tr class="open" id="2766">
<td><a href="https://cplusplus.github.io/CWG/issues/2766.html">2766</a></td>
<td>open</td>
- <td>Repeated evaluation of a <I>string-literal</I> may yield different objects</td>
+ <td>Repeated evaluation of a <I>string-literal</I> may yield different
+objects</td>
<td align="center">Not resolved</td>
</tr>
<tr class="open" id="2767">
@@ -16414,7 +16415,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2768">
<td><a href="https://cplusplus.github.io/CWG/issues/2768.html">2768</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Assignment to enumeration variable with a <I>braced-init-list</I></td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16430,15 +16431,15 @@ and <I>POD class</I></td>
<td>Trailing <I>requires-clause</I> can refer to function parameters before they are substituted into</td>
<td align="center">Not resolved</td>
</tr>
- <tr class="open" id="2771">
+ <tr id="2771">
<td><a href="https://cplusplus.github.io/CWG/issues/2771.html">2771</a></td>
- <td>open</td>
+ <td>DR</td>
<td>Transformation for <I>unqualified-id</I>s in address operator</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2772">
<td><a href="https://cplusplus.github.io/CWG/issues/2772.html">2772</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Missing Annex C entry for linkage effects of <I>linkage-specification</I></td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16456,7 +16457,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2775">
<td><a href="https://cplusplus.github.io/CWG/issues/2775.html">2775</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Unclear argument type for copy of exception object</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16466,15 +16467,15 @@ and <I>POD class</I></td>
<td>Substitution failure and implementation limits</td>
<td align="center">Not resolved</td>
</tr>
- <tr class="open" id="2777">
+ <tr id="2777">
<td><a href="https://cplusplus.github.io/CWG/issues/2777.html">2777</a></td>
- <td>open</td>
+ <td>DR</td>
<td>Type of <I>id-expression</I> denoting a template parameter object</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr class="open" id="2778">
<td><a href="https://cplusplus.github.io/CWG/issues/2778.html">2778</a></td>
- <td>open</td>
+ <td>review</td>
<td>Trivial destructor does not imply constant destruction</td>
<td align="center">Not resolved</td>
</tr>
@@ -16486,7 +16487,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2780">
<td><a href="https://cplusplus.github.io/CWG/issues/2780.html">2780</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td><TT>reinterpret_cast</TT> to reference to function types</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16504,7 +16505,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2783">
<td><a href="https://cplusplus.github.io/CWG/issues/2783.html">2783</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Handling of deduction guides in <I>global-module-fragment</I></td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16516,7 +16517,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2785">
<td><a href="https://cplusplus.github.io/CWG/issues/2785.html">2785</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Type-dependence of <I>requires-expression</I></td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16540,7 +16541,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2789">
<td><a href="https://cplusplus.github.io/CWG/issues/2789.html">2789</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Overload resolution with implicit and explicit object member functions</td>
<td class="full" align="center">Clang 18</td>
</tr>
@@ -16552,19 +16553,19 @@ and <I>POD class</I></td>
</tr>
<tr id="2791">
<td><a href="https://cplusplus.github.io/CWG/issues/2791.html">2791</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Unclear phrasing about "returning to the caller"</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2792">
<td><a href="https://cplusplus.github.io/CWG/issues/2792.html">2792</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Clean up specification of <TT>noexcept</TT> operator</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2793">
<td><a href="https://cplusplus.github.io/CWG/issues/2793.html">2793</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Block-scope declaration conflicting with parameter name</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16576,25 +16577,25 @@ and <I>POD class</I></td>
</tr>
<tr id="2795">
<td><a href="https://cplusplus.github.io/CWG/issues/2795.html">2795</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Overlapping empty subobjects with different cv-qualification</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2796">
<td><a href="https://cplusplus.github.io/CWG/issues/2796.html">2796</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Function pointer conversions for relational operators</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr class="open" id="2797">
<td><a href="https://cplusplus.github.io/CWG/issues/2797.html">2797</a></td>
- <td>open</td>
+ <td>review</td>
<td>Meaning of "corresponds" for rewritten operator candidates</td>
<td align="center">Not resolved</td>
</tr>
<tr id="2798">
<td><a href="https://cplusplus.github.io/CWG/issues/2798.html">2798</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Manifestly constant evaluation of the <TT>static_assert</TT> message</td>
<td class="full" align="center">Clang 17</td>
</tr>
@@ -16612,7 +16613,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2801">
<td><a href="https://cplusplus.github.io/CWG/issues/2801.html">2801</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Reference binding with reference-related types</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16624,7 +16625,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2803">
<td><a href="https://cplusplus.github.io/CWG/issues/2803.html">2803</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Overload resolution for reference binding of similar types</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16642,13 +16643,13 @@ and <I>POD class</I></td>
</tr>
<tr id="2806">
<td><a href="https://cplusplus.github.io/CWG/issues/2806.html">2806</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Make a <I>type-requirement</I> a type-only context</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2807">
<td><a href="https://cplusplus.github.io/CWG/issues/2807.html">2807</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Destructors declared <TT>consteval</TT></td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16660,19 +16661,19 @@ and <I>POD class</I></td>
</tr>
<tr id="2809">
<td><a href="https://cplusplus.github.io/CWG/issues/2809.html">2809</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>An implicit definition does not redeclare a function</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2810">
<td><a href="https://cplusplus.github.io/CWG/issues/2810.html">2810</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Requiring the absence of diagnostics for templates</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2811">
<td><a href="https://cplusplus.github.io/CWG/issues/2811.html">2811</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Clarify "use" of main</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16682,11 +16683,11 @@ and <I>POD class</I></td>
<td>Allocation with explicit alignment</td>
<td align="center">Not resolved</td>
</tr>
- <tr class="open" id="2813">
+ <tr id="2813">
<td><a href="https://cplusplus.github.io/CWG/issues/2813.html">2813</a></td>
- <td>review</td>
+ <td>DR</td>
<td>Class member access with prvalues</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr class="open" id="2814">
<td><a href="https://cplusplus.github.io/CWG/issues/2814.html">2814</a></td>
@@ -16714,57 +16715,57 @@ and <I>POD class</I></td>
</tr>
<tr class="open" id="2818">
<td><a href="https://cplusplus.github.io/CWG/issues/2818.html">2818</a></td>
- <td>review</td>
+ <td>tentatively ready</td>
<td>Use of predefined reserved identifiers</td>
<td align="center">Not resolved</td>
</tr>
<tr class="open" id="2819">
<td><a href="https://cplusplus.github.io/CWG/issues/2819.html">2819</a></td>
- <td>review</td>
+ <td>tentatively ready</td>
<td>Cast from null pointer value in a constant expression</td>
- <td align="center">Not resolved</td>
+ <td title="Clang 19 implements 2023-12-01 resolution" align="center">Not Resolved*</td>
</tr>
- <tr class="open" id="2820">
+ <tr id="2820">
<td><a href="https://cplusplus.github.io/CWG/issues/2820.html">2820</a></td>
- <td>open</td>
+ <td>DR</td>
<td>Value-initialization and default constructors</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr class="open" id="2821">
<td><a href="https://cplusplus.github.io/CWG/issues/2821.html">2821</a></td>
- <td>open</td>
+ <td>review</td>
<td>Lifetime, zero-initialization, and dynamic initialization</td>
<td align="center">Not resolved</td>
</tr>
<tr id="2822">
<td><a href="https://cplusplus.github.io/CWG/issues/2822.html">2822</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Side-effect-free pointer zap</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2823">
<td><a href="https://cplusplus.github.io/CWG/issues/2823.html">2823</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Implicit undefined behavior when dereferencing pointers</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2824">
<td><a href="https://cplusplus.github.io/CWG/issues/2824.html">2824</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Copy-initialization of arrays</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2825">
<td><a href="https://cplusplus.github.io/CWG/issues/2825.html">2825</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Range-based for statement using a <I>braced-init-list</I></td>
<td class="unknown" align="center">Unknown</td>
</tr>
- <tr id="2826">
+ <tr class="open" id="2826">
<td><a href="https://cplusplus.github.io/CWG/issues/2826.html">2826</a></td>
- <td>tentatively ready</td>
+ <td>drafting</td>
<td>Missing definition of "temporary expression"</td>
- <td class="unknown" align="center">Unknown</td>
+ <td align="center">Not resolved</td>
</tr>
<tr class="open" id="2827">
<td><a href="https://cplusplus.github.io/CWG/issues/2827.html">2827</a></td>
@@ -16772,11 +16773,11 @@ and <I>POD class</I></td>
<td>Representation of unsigned integral types</td>
<td align="center">Not resolved</td>
</tr>
- <tr class="open" id="2828">
+ <tr id="2828">
<td><a href="https://cplusplus.github.io/CWG/issues/2828.html">2828</a></td>
- <td>review</td>
+ <td>DR</td>
<td>Ambiguous interpretation of C-style cast</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr class="open" id="2829">
<td><a href="https://cplusplus.github.io/CWG/issues/2829.html">2829</a></td>
@@ -16784,17 +16785,17 @@ and <I>POD class</I></td>
<td>Redundant case in restricting user-defined conversion sequences</td>
<td align="center">Not resolved</td>
</tr>
- <tr class="open" id="2830">
+ <tr id="2830">
<td><a href="https://cplusplus.github.io/CWG/issues/2830.html">2830</a></td>
- <td>open</td>
+ <td>DR</td>
<td>Top-level cv-qualification should be ignored for list-initialization</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
- <tr class="open" id="2831">
+ <tr id="2831">
<td><a href="https://cplusplus.github.io/CWG/issues/2831.html">2831</a></td>
- <td>open</td>
+ <td>DR</td>
<td>Non-templated function definitions and <I>requires-clause</I>s</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr class="open" id="2832">
<td><a href="https://cplusplus.github.io/CWG/issues/2832.html">2832</a></td>
@@ -16810,7 +16811,7 @@ and <I>POD class</I></td>
</tr>
<tr class="open" id="2834">
<td><a href="https://cplusplus.github.io/CWG/issues/2834.html">2834</a></td>
- <td>open</td>
+ <td>review</td>
<td>Partial ordering and explicit object parameters</td>
<td align="center">Not resolved</td>
</tr>
@@ -16822,7 +16823,7 @@ and <I>POD class</I></td>
</tr>
<tr class="open" id="2836">
<td><a href="https://cplusplus.github.io/CWG/issues/2836.html">2836</a></td>
- <td>open</td>
+ <td>review</td>
<td>Conversion rank of <TT>long double</TT> and extended floating-point types</td>
<td align="center">Not resolved</td>
</tr>
@@ -16855,6 +16856,276 @@ and <I>POD class</I></td>
<td>open</td>
<td>When do const objects start being const?</td>
<td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2842">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2842.html">2842</a></td>
+ <td>open</td>
+ <td>Preferring an <TT>initializer_list</TT> over a single value</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2843">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2843.html">2843</a></td>
+ <td>review</td>
+ <td>Undated reference to Unicode makes C++ a moving target</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2844">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2844.html">2844</a></td>
+ <td>open</td>
+ <td>Enumerating a finite set of built-in candidates</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr id="2845">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2845.html">2845</a></td>
+ <td>DR</td>
+ <td>Make the closure type of a captureless lambda a structural type</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr id="2846">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2846.html">2846</a></td>
+ <td>DR</td>
+ <td>Out-of-class definitions of explicit object member functions</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr class="open" id="2847">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2847.html">2847</a></td>
+ <td>review</td>
+ <td>Constrained explicit specializations of function templates at class scope</td>
+ <td title="Clang 19 implements 2024-03-01 resolution" align="center">Not Resolved*</td>
+ </tr>
+ <tr id="2848">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2848.html">2848</a></td>
+ <td>DR</td>
+ <td>Omitting an empty template argument list for explicit instantiation</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr id="2849">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2849.html">2849</a></td>
+ <td>DR</td>
+ <td>Parameter objects are not temporary objects</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr id="2850">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2850.html">2850</a></td>
+ <td>DR</td>
+ <td>Unclear storage duration for function parameter objects</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr id="2851">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2851.html">2851</a></td>
+ <td>DR</td>
+ <td>Allow floating-point conversions in converted constant expressions</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr class="open" id="2852">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2852.html">2852</a></td>
+ <td>open</td>
+ <td>Complete-class contexts and class-scope lambdas</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr id="2853">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2853.html">2853</a></td>
+ <td>DR</td>
+ <td>Pointer arithmetic with pointer to hypothetical element</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr id="2854">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2854.html">2854</a></td>
+ <td>DR</td>
+ <td>Storage duration of exception objects</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr id="2855">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2855.html">2855</a></td>
+ <td>DR</td>
+ <td>Undefined behavior in postfix increment</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr id="2856">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2856.html">2856</a></td>
+ <td>DR</td>
+ <td>Copy-list-initialization with explicit default constructors</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr id="2857">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2857.html">2857</a></td>
+ <td>DR</td>
+ <td>Argument-dependent lookup with incomplete class types</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr class="open" id="2858">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2858.html">2858</a></td>
+ <td>tentatively ready</td>
+ <td>Declarative <I>nested-name-specifier</I>s and <I>pack-index-specifier</I>s</td>
+ <td title="Clang 19 implements 2024-04-05 resolution" align="center">Not Resolved*</td>
+ </tr>
+ <tr class="open" id="2859">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2859.html">2859</a></td>
+ <td>tentatively ready</td>
+ <td>Value-initialization with multiple default constructors</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr id="2860">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2860.html">2860</a></td>
+ <td>dup</td>
+ <td>Remove and fix the term "vacuous initialization"</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr class="open" id="2861">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2861.html">2861</a></td>
+ <td>tentatively ready</td>
+ <td><TT>dynamic_cast</TT> on bad pointer value</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2862">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2862.html">2862</a></td>
+ <td>tentatively ready</td>
+ <td>Unclear boundaries of template declarations</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2863">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2863.html">2863</a></td>
+ <td>tentatively ready</td>
+ <td>Unclear synchronization requirements for object lifetime rules</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2864">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2864.html">2864</a></td>
+ <td>tentatively ready</td>
+ <td>Narrowing floating-point conversions</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2865">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2865.html">2865</a></td>
+ <td>open</td>
+ <td>Regression on result of conditional operator</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2866">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2866.html">2866</a></td>
+ <td>open</td>
+ <td>Observing the effects of <TT>[[no_unique_address]]</TT></td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2867">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2867.html">2867</a></td>
+ <td>open</td>
+ <td>Order of initialization for structured bindings</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2868">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2868.html">2868</a></td>
+ <td>open</td>
+ <td>Self-references in trivially copyable objects as function return values</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2869">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2869.html">2869</a></td>
+ <td>open</td>
+ <td><TT>this</TT> in local classes</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2870">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2870.html">2870</a></td>
+ <td>open</td>
+ <td>Combining absent <I>encoding-prefix</I>es</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2871">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2871.html">2871</a></td>
+ <td>tentatively ready</td>
+ <td>User-declared constructor templates inhibiting default constructors</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2872">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2872.html">2872</a></td>
+ <td>open</td>
+ <td>Linkage and unclear "can be referred to"</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2873">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2873.html">2873</a></td>
+ <td>open</td>
+ <td>Taking the address of a function involving template argument deduction</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2874">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2874.html">2874</a></td>
+ <td>open</td>
+ <td>Qualified declarations of partial specializations</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2875">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2875.html">2875</a></td>
+ <td>open</td>
+ <td>Missing support for round-tripping nullptr through indirection/address operators</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2876">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2876.html">2876</a></td>
+ <td>open</td>
+ <td>Disambiguation of <TT>T x = delete("text")</TT></td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2877">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2877.html">2877</a></td>
+ <td>open</td>
+ <td>Type-only lookup for <I>using-enum-declarator</I></td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2878">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2878.html">2878</a></td>
+ <td>open</td>
+ <td>C-style casts to reference types</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2879">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2879.html">2879</a></td>
+ <td>open</td>
+ <td>Undesired outcomes with <TT>const_cast</TT></td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2880">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2880.html">2880</a></td>
+ <td>open</td>
+ <td>Accessibility check for destructor of incomplete class type</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2881">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2881.html">2881</a></td>
+ <td>open</td>
+ <td>Type restrictions for the explicit object parameter of a lambda</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2882">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2882.html">2882</a></td>
+ <td>open</td>
+ <td>Unclear treatment of conversion to <TT>void</TT></td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2883">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2883.html">2883</a></td>
+ <td>open</td>
+ <td>Definition of "odr-usable" ignores lambda scopes</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2884">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2884.html">2884</a></td>
+ <td>open</td>
+ <td>Qualified declarations of partial specializations</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2885">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2885.html">2885</a></td>
+ <td>open</td>
+ <td>Non-eligible trivial default constructors</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2886">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2886.html">2886</a></td>
+ <td>open</td>
+ <td>Temporaries and trivial potentially-throwing special member functions</td>
+ <td align="center">Not resolved</td>
</tr></table>
</div>
diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html
index 0d796597d05c..0996abc24058 100755
--- a/clang/www/cxx_status.html
+++ b/clang/www/cxx_status.html
@@ -167,7 +167,7 @@ C++23, informally referred to as C++26.</p>
<tr>
<td>Disallow Binding a Returned Glvalue to a Temporary</td>
<td><a href="https://wg21.link/P2748R5">P2748R5</a></td>
- <td class="none" align="center">No</td>
+ <td class="full" align="center">Clang 19</td>
</tr>
<tr>
<td>Clarifying rules for brace elision in aggregate initialization</td>
@@ -177,7 +177,7 @@ C++23, informally referred to as C++26.</p>
<tr>
<td>Attributes for Structured Bindings</td>
<td><a href="https://wg21.link/P0609R3">P0609R3</a></td>
- <td class="none" align="center">No</td>
+ <td class="none" align="center">Clang 19</td>
</tr>
<tr>
<td>Module Declarations Shouldn’t be Macros</td>
@@ -187,7 +187,7 @@ C++23, informally referred to as C++26.</p>
<tr>
<td>Trivial infinite loops are not Undefined Behavior</td>
<td><a href="https://wg21.link/P2809R3">P2809R3</a> (<a href="#dr">DR</a>)</td>
- <td class="unreleased" align="center">Clang 19</td>
+ <td class="unreleased" align="center">No</td>
</tr>
<tr>
<td>Erroneous behaviour for uninitialized reads</td>
diff --git a/clang/www/make_cxx_dr_status b/clang/www/make_cxx_dr_status
index 7c0cf77a1524..47c8b3bae4a1 100755
--- a/clang/www/make_cxx_dr_status
+++ b/clang/www/make_cxx_dr_status
@@ -5,7 +5,7 @@ latest_release = 18
clang_www_dir = os.path.dirname(__file__)
default_issue_list_path = os.path.join(clang_www_dir, 'cwg_index.html')
-issue_list_url = "https://www.open-std.org/jtc1/sc22/wg21/docs/cwg_index.html"
+issue_list_url = "https://raw.githubusercontent.com/cplusplus/CWG/gh-pages/issues/cwg_index.html"
output = os.path.join(clang_www_dir, 'cxx_dr_status.html')
dr_test_dir = os.path.join(clang_www_dir, '../test/CXX/drs')
@@ -138,10 +138,10 @@ def availability(issue):
unresolved_status = ''
proposed_resolution = ''
- unresolved_status_match = re.search(r' (open|drafting|review)', status)
+ unresolved_status_match = re.search(r' (open|drafting|review|tentatively ready)', status)
if unresolved_status_match:
unresolved_status = unresolved_status_match.group(1)
- proposed_resolution_match = re.search(r' (open|drafting|review) (\d{4}-\d{2}(?:-\d{2})?|P\d{4}R\d+)$', status)
+ proposed_resolution_match = re.search(r' (open|drafting|review|tentatively ready) (\d{4}-\d{2}(?:-\d{2})?|P\d{4}R\d+)$', status)
if proposed_resolution_match is None:
raise AvailabilityError('Issue {}: \'{}\' status should be followed by a paper number (P1234R5) or proposed resolution in YYYY-MM-DD format'.format(dr.issue, unresolved_status))
proposed_resolution = proposed_resolution_match.group(2)
@@ -236,7 +236,7 @@ for dr in drs:
avail = 'Extension'
avail_style = ''
- elif dr.status in ('open', 'drafting', 'review'):
+ elif dr.status in ('open', 'drafting', 'review', 'tentatively ready'):
row_style = ' class="open"'
try:
avail, avail_style, unresolved_status = availability(dr.issue)
diff --git a/compiler-rt/lib/scudo/standalone/allocator_config.def b/compiler-rt/lib/scudo/standalone/allocator_config.def
index 9691a007eed5..dcd130ac449a 100644
--- a/compiler-rt/lib/scudo/standalone/allocator_config.def
+++ b/compiler-rt/lib/scudo/standalone/allocator_config.def
@@ -89,6 +89,7 @@ PRIMARY_REQUIRED(const s32, MaxReleaseToOsIntervalMs)
// Indicates support for offsetting the start of a region by a random number of
// pages. This is only used if `EnableContiguousRegions` is enabled.
PRIMARY_OPTIONAL(const bool, EnableRandomOffset, false)
+PRIMARY_OPTIONAL(const s32, DefaultReleaseToOsIntervalMs, INT32_MIN)
// When `EnableContiguousRegions` is true, all regions will be arranged in
// adjacency. This will reduce the fragmentation caused by region allocations
@@ -118,6 +119,7 @@ SECONDARY_CACHE_OPTIONAL(const u32, DefaultMaxEntriesCount, 0)
SECONDARY_CACHE_OPTIONAL(const uptr, DefaultMaxEntrySize, 0)
SECONDARY_CACHE_OPTIONAL(const s32, MinReleaseToOsIntervalMs, INT32_MIN)
SECONDARY_CACHE_OPTIONAL(const s32, MaxReleaseToOsIntervalMs, INT32_MAX)
+SECONDARY_CACHE_OPTIONAL(const s32, DefaultReleaseToOsIntervalMs, INT32_MIN)
#undef SECONDARY_CACHE_OPTIONAL
#undef SECONDARY_REQUIRED_TEMPLATE_TYPE
diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
index e7bc90cd0960..927513dea92d 100644
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -173,6 +173,9 @@ public:
static_cast<u32>(getFlags()->quarantine_max_chunk_size);
Stats.init();
+ // TODO(chiahungduan): Given that we support setting the default value in
+ // the PrimaryConfig and CacheConfig, consider deprecating the
+ // `release_to_os_interval_ms` flag.
const s32 ReleaseToOsIntervalMs = getFlags()->release_to_os_interval_ms;
Primary.init(ReleaseToOsIntervalMs);
Secondary.init(&Stats, ReleaseToOsIntervalMs);
diff --git a/compiler-rt/lib/scudo/standalone/flags.inc b/compiler-rt/lib/scudo/standalone/flags.inc
index f5a2bab5057a..ff0c28e1db7c 100644
--- a/compiler-rt/lib/scudo/standalone/flags.inc
+++ b/compiler-rt/lib/scudo/standalone/flags.inc
@@ -42,7 +42,7 @@ SCUDO_FLAG(bool, may_return_null, true,
"returning NULL in otherwise non-fatal error scenarios, eg: OOM, "
"invalid allocation alignments, etc.")
-SCUDO_FLAG(int, release_to_os_interval_ms, SCUDO_ANDROID ? INT32_MIN : 5000,
+SCUDO_FLAG(int, release_to_os_interval_ms, 5000,
"Interval (in milliseconds) at which to attempt release of unused "
"memory to the OS. Negative values disable the feature.")
diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h
index 1d8a77b73e5c..ebfb8dfe0a31 100644
--- a/compiler-rt/lib/scudo/standalone/primary32.h
+++ b/compiler-rt/lib/scudo/standalone/primary32.h
@@ -88,6 +88,10 @@ public:
Sci->MinRegionIndex = NumRegions;
Sci->ReleaseInfo.LastReleaseAtNs = Time;
}
+
+ // The default value in the primary config has the higher priority.
+ if (Config::getDefaultReleaseToOsIntervalMs() != INT32_MIN)
+ ReleaseToOsInterval = Config::getDefaultReleaseToOsIntervalMs();
setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval));
}
diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h
index d6119051b162..bed2ccb8b992 100644
--- a/compiler-rt/lib/scudo/standalone/primary64.h
+++ b/compiler-rt/lib/scudo/standalone/primary64.h
@@ -147,6 +147,9 @@ public:
for (uptr I = 0; I < NumClasses; I++)
getRegionInfo(I)->FLLockCV.bindTestOnly(getRegionInfo(I)->FLLock);
+ // The default value in the primary config has the higher priority.
+ if (Config::getDefaultReleaseToOsIntervalMs() != INT32_MIN)
+ ReleaseToOsInterval = Config::getDefaultReleaseToOsIntervalMs();
setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval));
}
diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h
index 674af5071775..d8c9f5bcfcaf 100644
--- a/compiler-rt/lib/scudo/standalone/secondary.h
+++ b/compiler-rt/lib/scudo/standalone/secondary.h
@@ -209,6 +209,9 @@ public:
static_cast<sptr>(Config::getDefaultMaxEntriesCount()));
setOption(Option::MaxCacheEntrySize,
static_cast<sptr>(Config::getDefaultMaxEntrySize()));
+ // The default value in the cache config has the higher priority.
+ if (Config::getDefaultReleaseToOsIntervalMs() != INT32_MIN)
+ ReleaseToOsInterval = Config::getDefaultReleaseToOsIntervalMs();
setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval));
}
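
The three hunks above (primary32.h, primary64.h, secondary.h) apply the same precedence rule: a `DefaultReleaseToOsIntervalMs` supplied by the allocator config overrides the `release_to_os_interval_ms` flag value unless it is left at the `INT32_MIN` sentinel. A minimal standalone sketch of that resolution logic (the names here are illustrative, not scudo's API):

```
#include <cstdint>

// Sketch of the precedence introduced by this patch: a config-level default
// (INT32_MIN meaning "unset") wins over the flag value passed to init().
constexpr int32_t kIntervalUnset = INT32_MIN;

int32_t resolveReleaseInterval(int32_t FlagMs, int32_t ConfigDefaultMs) {
  return ConfigDefaultMs != kIntervalUnset ? ConfigDefaultMs : FlagMs;
}
```

Together with the flags.inc change above, a config that leaves the option unset keeps the flag's new unconditional 5000 ms default.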
diff --git a/flang/cmake/modules/AddFlangOffloadRuntime.cmake b/flang/cmake/modules/AddFlangOffloadRuntime.cmake
index e34d3851187a..0af12c8cfd54 100644
--- a/flang/cmake/modules/AddFlangOffloadRuntime.cmake
+++ b/flang/cmake/modules/AddFlangOffloadRuntime.cmake
@@ -2,6 +2,10 @@ option(FLANG_EXPERIMENTAL_CUDA_RUNTIME
"Compile Fortran runtime as CUDA sources (experimental)" OFF
)
+option(FLANG_CUDA_RUNTIME_PTX_WITHOUT_GLOBAL_VARS
+ "Do not compile global variables' definitions when producing PTX library" OFF
+ )
+
set(FLANG_LIBCUDACXX_PATH "" CACHE PATH "Path to libcu++ package installation")
set(FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD "off" CACHE STRING
@@ -56,6 +60,11 @@ macro(enable_cuda_compilation name files)
# Add an OBJECT library consisting of CUDA PTX.
llvm_add_library(${name}PTX OBJECT PARTIAL_SOURCES_INTENDED ${files})
set_property(TARGET obj.${name}PTX PROPERTY CUDA_PTX_COMPILATION ON)
+ if (FLANG_CUDA_RUNTIME_PTX_WITHOUT_GLOBAL_VARS)
+ target_compile_definitions(obj.${name}PTX
+ PRIVATE FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
+ )
+ endif()
endif()
endmacro()
diff --git a/flang/docs/FlangDriver.md b/flang/docs/FlangDriver.md
index ac120b4ff09b..351595ac0afd 100644
--- a/flang/docs/FlangDriver.md
+++ b/flang/docs/FlangDriver.md
@@ -179,46 +179,20 @@ like this:
```
$ flang -v -o example example.o
-"/usr/bin/ld" [...] example.o [...] "--whole-archive" "-lFortran_main"
-"--no-whole-archive" "-lFortranRuntime" "-lFortranDecimal" [...]
+"/usr/bin/ld" [...] example.o [...] "-lFortranRuntime" "-lFortranDecimal" [...]
```
The automatically added libraries are:
-* `Fortran_main`: Provides the main entry point `main` that then invokes
- `_QQmain` with the Fortran program unit. This library has a dependency to
- the `FortranRuntime` library.
* `FortranRuntime`: Provides most of the Flang runtime library.
* `FortranDecimal`: Provides operations for decimal numbers.
-The default is that, when using Flang as the linker, one of the Fortran
-translation units provides the program unit and therefore it is assumed that
-Fortran is the main code part (calling into C/C++ routines via `BIND (C)`
-interfaces). When composing the linker commandline, Flang uses
-`--whole-archive` and `--no-whole-archive` (Windows: `/WHOLEARCHIVE:`,
-Darwin & AIX: *not implemented yet*) to make sure that all for `Fortran_main`
-is processed by the linker. This is done to issue a proper error message when
-multiple definitions of `main` occur. This happens, for instance, when linking
-a code that has a Fortran program unit with a C/C++ code that also defines a
-`main` function. A user may be required to explicitly provide the C++ runtime
-libraries at link time (e.g., via `-lstdc++` for STL)
-
If the code is C/C++ based and invokes Fortran routines, one can either use Clang
or Flang as the linker driver. If Clang is used, it will automatically add all
required runtime libraries needed by C++ (e.g., for STL) to the linker invocation.
In this case, one has to explicitly provide the Fortran runtime libraries
-`FortranRuntime` and/or `FortranDecimal`. An alternative is to use Flang to link
-and use the `-fno-fortran-main` flag. This flag removes
-`Fortran_main` from the linker stage and hence requires one of the C/C++
-translation units to provide a definition of the `main` function. In this case,
-it may be required to explicitly supply C++ runtime libraries as mentioned above.
-
-When creating shared or static libraries using Flang with `-shared` or `-static`
-flag, Fortran_main is automatically removed from the linker stage (i.e.,
-`-fno-fortran-main` is on by default). It is assumed that when creating a
-static or shared library, the generated library does not need a `main`
-function, as a final link stage will occur that will provide the `Fortran_main`
-library when creating the final executable.
+`FortranRuntime` and/or `FortranDecimal`. An alternative is to use Flang to link.
+In this case, it may be required to explicitly supply C++ runtime libraries.
On Darwin, the logical root where the system libraries are located (sysroot)
must be specified. This can be done with the CMake build flag `DEFAULT_SYSROOT`
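
The mixed-language scenario this doc change describes can be sketched as follows; the subroutine name and link line are illustrative, not taken from the patch:

```
// main.cpp -- C++ driver calling a Fortran routine exposed via BIND(C).
// Linked as the updated doc suggests, e.g.:
//   clang++ main.cpp fortran_part.o -lFortranRuntime -lFortranDecimal
extern "C" void compute_in_fortran(int *result); // hypothetical BIND(C) routine

int main() {
  int result = 0;
  compute_in_fortran(&result);
  return result;
}
```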
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/EnvironmentDefaults.h b/flang/include/flang/Optimizer/Builder/Runtime/EnvironmentDefaults.h
index 18a24bad3960..216d3bcec137 100755
--- a/flang/include/flang/Optimizer/Builder/Runtime/EnvironmentDefaults.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/EnvironmentDefaults.h
@@ -22,6 +22,7 @@
namespace fir {
class FirOpBuilder;
+class GlobalOp;
} // namespace fir
namespace mlir {
@@ -37,7 +38,7 @@ namespace fir::runtime {
/// Create the list of environment variable defaults for the runtime to set. The
/// form of the generated list is defined in the runtime header file
/// environment-default-list.h
-void genEnvironmentDefaults(
+fir::GlobalOp genEnvironmentDefaults(
fir::FirOpBuilder &builder, mlir::Location loc,
const std::vector<Fortran::lower::EnvironmentDefault> &envDefaults);
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Main.h b/flang/include/flang/Optimizer/Builder/Runtime/Main.h
new file mode 100644
index 000000000000..62faf46e1fc7
--- /dev/null
+++ b/flang/include/flang/Optimizer/Builder/Runtime/Main.h
@@ -0,0 +1,28 @@
+//===-- Main.h - generate main runtime API calls ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_BUILDER_RUNTIME_MAIN_H
+#define FORTRAN_OPTIMIZER_BUILDER_RUNTIME_MAIN_H
+
+namespace mlir {
+class Location;
+} // namespace mlir
+
+namespace fir {
+class FirOpBuilder;
+class GlobalOp;
+} // namespace fir
+
+namespace fir::runtime {
+
+void genMain(fir::FirOpBuilder &builder, mlir::Location loc,
+ fir::GlobalOp &env);
+
+}
+
+#endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_MAIN_H
diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc
index 8f197038f2ba..34af9f1c21f8 100644
--- a/flang/include/flang/Tools/CLOptions.inc
+++ b/flang/include/flang/Tools/CLOptions.inc
@@ -104,7 +104,7 @@ void addNestedPassToOps(mlir::PassManager &pm, PassConstructor ctor) {
void addNestedPassToAllTopLevelOperations(
mlir::PassManager &pm, PassConstructor ctor) {
addNestedPassToOps<mlir::func::FuncOp, mlir::omp::DeclareReductionOp,
- fir::GlobalOp>(pm, ctor);
+ mlir::omp::PrivateClauseOp, fir::GlobalOp>(pm, ctor);
}
void addNestedPassToAllTopLevelOperationsConditionally(mlir::PassManager &pm,
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index c05bf010b2bd..b42909eaaacc 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -36,6 +36,7 @@
#include "flang/Optimizer/Builder/Runtime/Character.h"
#include "flang/Optimizer/Builder/Runtime/Derived.h"
#include "flang/Optimizer/Builder/Runtime/EnvironmentDefaults.h"
+#include "flang/Optimizer/Builder/Runtime/Main.h"
#include "flang/Optimizer/Builder/Runtime/Ragged.h"
#include "flang/Optimizer/Builder/Runtime/Stop.h"
#include "flang/Optimizer/Builder/Todo.h"
@@ -359,8 +360,10 @@ public:
// not need to be generated even if no defaults are specified.
// However, generating main or changing when the runtime reads
// environment variables is required to do so.
- fir::runtime::genEnvironmentDefaults(*builder, toLocation(),
- bridge.getEnvironmentDefaults());
+ auto env = fir::runtime::genEnvironmentDefaults(
+ *builder, toLocation(), bridge.getEnvironmentDefaults());
+
+ fir::runtime::genMain(*builder, toLocation(), env);
});
finalizeOpenACCLowering();
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index edf1f24a08e5..413563fe95ca 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -496,8 +496,8 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
if (mlir::isa<fir::SequenceType>(symTy) &&
!Fortran::semantics::IsAllocatableOrPointer(sym)) {
mlir::Type eleTy = mlir::cast<fir::SequenceType>(symTy).getEleTy();
- if (eleTy.isa<mlir::IntegerType, mlir::FloatType, fir::ComplexType,
- fir::LogicalType>()) {
+ if (mlir::isa<mlir::IntegerType, mlir::FloatType, fir::ComplexType,
+ fir::LogicalType>(eleTy)) {
const auto *details =
sym.detailsIf<Fortran::semantics::ObjectEntityDetails>();
if (details->init()) {
diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt
index 06339b116cd8..6d0aeb429d35 100644
--- a/flang/lib/Optimizer/Builder/CMakeLists.txt
+++ b/flang/lib/Optimizer/Builder/CMakeLists.txt
@@ -23,6 +23,7 @@ add_flang_library(FIRBuilder
Runtime/Execute.cpp
Runtime/Inquiry.cpp
Runtime/Intrinsics.cpp
+ Runtime/Main.cpp
Runtime/Numeric.cpp
Runtime/Pointer.cpp
Runtime/Ragged.cpp
diff --git a/flang/lib/Optimizer/Builder/Runtime/EnvironmentDefaults.cpp b/flang/lib/Optimizer/Builder/Runtime/EnvironmentDefaults.cpp
index a11b9339681e..6e280ac0c06c 100755
--- a/flang/lib/Optimizer/Builder/Runtime/EnvironmentDefaults.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/EnvironmentDefaults.cpp
@@ -13,7 +13,7 @@
#include "flang/Optimizer/Support/InternalNames.h"
#include "llvm/ADT/ArrayRef.h"
-void fir::runtime::genEnvironmentDefaults(
+fir::GlobalOp fir::runtime::genEnvironmentDefaults(
fir::FirOpBuilder &builder, mlir::Location loc,
const std::vector<Fortran::lower::EnvironmentDefault> &envDefaults) {
std::string envDefaultListPtrName =
@@ -34,14 +34,13 @@ void fir::runtime::genEnvironmentDefaults(
// If no defaults were specified, initialize with a null pointer.
if (envDefaults.empty()) {
- builder.createGlobalConstant(
+ return builder.createGlobalConstant(
loc, envDefaultListRefTy, envDefaultListPtrName,
[&](fir::FirOpBuilder &builder) {
mlir::Value nullVal =
builder.createNullConstant(loc, envDefaultListRefTy);
builder.create<fir::HasValueOp>(loc, nullVal);
});
- return;
}
// Create the Item list.
@@ -99,7 +98,7 @@ void fir::runtime::genEnvironmentDefaults(
envDefaultListBuilder, linkOnce);
// Define the pointer to the list used by the runtime.
- builder.createGlobalConstant(
+ return builder.createGlobalConstant(
loc, envDefaultListRefTy, envDefaultListPtrName,
[&](fir::FirOpBuilder &builder) {
mlir::Value addr = builder.create<fir::AddrOfOp>(
diff --git a/flang/lib/Optimizer/Builder/Runtime/Main.cpp b/flang/lib/Optimizer/Builder/Runtime/Main.cpp
new file mode 100644
index 000000000000..3b24fbca9cdb
--- /dev/null
+++ b/flang/lib/Optimizer/Builder/Runtime/Main.cpp
@@ -0,0 +1,62 @@
+//===-- Main.cpp - generate main runtime API calls --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/Runtime/Main.h"
+#include "flang/Optimizer/Builder/BoxValue.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Runtime/main.h"
+#include "flang/Runtime/stop.h"
+
+using namespace Fortran::runtime;
+
+/// Create an `int main(...)` that calls the Fortran entry point
+void fir::runtime::genMain(fir::FirOpBuilder &builder, mlir::Location loc,
+ fir::GlobalOp &env) {
+ auto *context = builder.getContext();
+ auto argcTy = builder.getDefaultIntegerType();
+ auto ptrTy = mlir::LLVM::LLVMPointerType::get(context);
+
+ // void ProgramStart(int argc, char** argv, char** envp,
+ // _QQEnvironmentDefaults* env)
+ auto startFn = builder.createFunction(
+ loc, RTNAME_STRING(ProgramStart),
+ mlir::FunctionType::get(context, {argcTy, ptrTy, ptrTy, ptrTy}, {}));
+ // void ProgramStop()
+ auto stopFn =
+ builder.createFunction(loc, RTNAME_STRING(ProgramEndStatement),
+ mlir::FunctionType::get(context, {}, {}));
+
+ // int main(int argc, char** argv, char** envp)
+ auto mainFn = builder.createFunction(
+ loc, "main",
+ mlir::FunctionType::get(context, {argcTy, ptrTy, ptrTy}, argcTy));
+ // void _QQmain()
+ auto qqMainFn = builder.createFunction(
+ loc, "_QQmain", mlir::FunctionType::get(context, {}, {}));
+
+ mainFn.setPublic();
+
+ auto *block = mainFn.addEntryBlock();
+ mlir::OpBuilder::InsertionGuard insertGuard(builder);
+ builder.setInsertionPointToStart(block);
+
+ llvm::SmallVector<mlir::Value, 4> args(block->getArguments());
+ auto envAddr =
+ builder.create<fir::AddrOfOp>(loc, env.getType(), env.getSymbol());
+ args.push_back(envAddr);
+
+ builder.create<fir::CallOp>(loc, startFn, args);
+ builder.create<fir::CallOp>(loc, qqMainFn);
+ builder.create<fir::CallOp>(loc, stopFn);
+
+ mlir::Value ret = builder.createIntegerConstant(loc, argcTy, 0);
+ builder.create<mlir::func::ReturnOp>(loc, ret);
+}
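
In C++ terms, the entry point emitted by `genMain` corresponds roughly to the sketch below; the `_FortranA` names follow the `RTNAME_STRING` calls above, while the signature and the environment-defaults global are simplified to opaque pointers:

```
extern "C" {
void _FortranAProgramStart(int argc, char **argv, char **envp,
                           const void *envDefaults); // simplified signature
void _FortranAProgramEndStatement();
void _QQmain(); // the lowered Fortran program unit
}

// Stand-in for the list pointer created by genEnvironmentDefaults.
extern "C" const void *_QQEnvironmentDefaults;

int main(int argc, char **argv, char **envp) {
  _FortranAProgramStart(argc, argv, envp, &_QQEnvironmentDefaults);
  _QQmain();
  _FortranAProgramEndStatement();
  return 0;
}
```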
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 19628ac71b0b..b4705aa47992 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -1964,12 +1964,12 @@ struct ValueOpCommon {
mlir::ArrayAttr arrAttr) {
llvm::SmallVector<int64_t> indices;
for (auto i = arrAttr.begin(), e = arrAttr.end(); i != e; ++i) {
- if (auto intAttr = i->dyn_cast<mlir::IntegerAttr>()) {
+ if (auto intAttr = mlir::dyn_cast<mlir::IntegerAttr>(*i)) {
indices.push_back(intAttr.getInt());
} else {
- auto fieldName = i->cast<mlir::StringAttr>().getValue();
+ auto fieldName = mlir::cast<mlir::StringAttr>(*i).getValue();
++i;
- auto ty = i->cast<mlir::TypeAttr>().getValue();
+ auto ty = mlir::cast<mlir::TypeAttr>(*i).getValue();
auto index = mlir::cast<fir::RecordType>(ty).getFieldIndex(fieldName);
indices.push_back(index);
}
@@ -3014,7 +3014,7 @@ static void selectMatchAndRewrite(const fir::LLVMTypeConverter &lowering,
caseValues.push_back(intAttr.getInt());
continue;
}
- assert(attr.template dyn_cast_or_null<mlir::UnitAttr>());
+ assert(mlir::dyn_cast_or_null<mlir::UnitAttr>(attr));
assert((t + 1 == conds) && "unit must be last");
defaultDestination = dest;
defaultOperands = destOps ? *destOps : mlir::ValueRange{};
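
These hunks, like the FIROpPatterns.cpp, FIROps.cpp, and FIRType.cpp changes below, are a mechanical migration from MLIR's deprecated member-function casts to the free-function form; the pattern in isolation, with an arbitrary type list:

```
#include "mlir/IR/BuiltinTypes.h" // assumes an MLIR build tree

// Free functions mlir::isa/mlir::dyn_cast/mlir::cast replace the deprecated
// Type/Attribute member functions.
bool isIntegerLike(mlir::Type ty) {
  // Before: ty.isa<mlir::IntegerType, mlir::IndexType>()
  return mlir::isa<mlir::IntegerType, mlir::IndexType>(ty);
}
```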
diff --git a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp
index 00c5f77cde7c..d6dac4998fdc 100644
--- a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp
+++ b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp
@@ -243,6 +243,9 @@ ConvertFIRToLLVMPattern::getBlockForAllocaInsert(mlir::Operation *op) const {
return iface.getAllocaBlock();
if (auto llvmFuncOp = mlir::dyn_cast<mlir::LLVM::LLVMFuncOp>(op))
return &llvmFuncOp.front();
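+  // omp.private ops keep their privatization allocas in a dedicated alloc region.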
+ if (auto ompPrivateOp = mlir::dyn_cast<mlir::omp::PrivateClauseOp>(op))
+ return &ompPrivateOp.getAllocRegion().front();
+
return getBlockForAllocaInsert(op->getParentOp());
}
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index a39087aeb358..6773d0adced0 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -2498,10 +2498,8 @@ static constexpr llvm::StringRef getTargetOffsetAttr() {
template <typename OpT>
static mlir::LogicalResult verifyIntegralSwitchTerminator(OpT op) {
- if (!op.getSelector()
- .getType()
- .template isa<mlir::IntegerType, mlir::IndexType,
- fir::IntegerType>())
+ if (!mlir::isa<mlir::IntegerType, mlir::IndexType, fir::IntegerType>(
+ op.getSelector().getType()))
return op.emitOpError("must be an integer");
auto cases =
op->template getAttrOfType<mlir::ArrayAttr>(op.getCasesAttr()).getValue();
@@ -2576,7 +2574,7 @@ static void printIntegralSwitchTerminator(OpT op, mlir::OpAsmPrinter &p) {
if (i)
p << ", ";
auto &attr = cases[i];
- if (auto intAttr = attr.template dyn_cast_or_null<mlir::IntegerAttr>())
+ if (auto intAttr = mlir::dyn_cast_or_null<mlir::IntegerAttr>(attr))
p << intAttr.getValue();
else
p.printAttribute(attr);
diff --git a/flang/lib/Optimizer/Dialect/FIRType.cpp b/flang/lib/Optimizer/Dialect/FIRType.cpp
index 38a6a09d1808..d9c387ad950e 100644
--- a/flang/lib/Optimizer/Dialect/FIRType.cpp
+++ b/flang/lib/Optimizer/Dialect/FIRType.cpp
@@ -695,9 +695,9 @@ BoxProcType::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
}
static bool cannotBePointerOrHeapElementType(mlir::Type eleTy) {
- return eleTy.isa<BoxType, BoxCharType, BoxProcType, ShapeType, ShapeShiftType,
+ return mlir::isa<BoxType, BoxCharType, BoxProcType, ShapeType, ShapeShiftType,
SliceType, FieldType, LenType, HeapType, PointerType,
- ReferenceType, TypeDescType>();
+ ReferenceType, TypeDescType>(eleTy);
}
//===----------------------------------------------------------------------===//
@@ -776,10 +776,10 @@ void fir::CharacterType::print(mlir::AsmPrinter &printer) const {
mlir::LogicalResult
fir::ClassType::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
mlir::Type eleTy) {
- if (eleTy.isa<fir::RecordType, fir::SequenceType, fir::HeapType,
+ if (mlir::isa<fir::RecordType, fir::SequenceType, fir::HeapType,
fir::PointerType, mlir::NoneType, mlir::IntegerType,
mlir::FloatType, fir::CharacterType, fir::LogicalType,
- fir::ComplexType, mlir::ComplexType>())
+ fir::ComplexType, mlir::ComplexType>(eleTy))
return mlir::success();
return emitError() << "invalid element type\n";
}
@@ -1050,8 +1050,8 @@ void fir::ReferenceType::print(mlir::AsmPrinter &printer) const {
mlir::LogicalResult fir::ReferenceType::verify(
llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
mlir::Type eleTy) {
- if (eleTy.isa<ShapeType, ShapeShiftType, SliceType, FieldType, LenType,
- ReferenceType, TypeDescType>())
+ if (mlir::isa<ShapeType, ShapeShiftType, SliceType, FieldType, LenType,
+ ReferenceType, TypeDescType>(eleTy))
return emitError() << "cannot build a reference to type: " << eleTy << '\n';
return mlir::success();
}
@@ -1126,9 +1126,9 @@ mlir::LogicalResult fir::SequenceType::verify(
llvm::ArrayRef<int64_t> shape, mlir::Type eleTy,
mlir::AffineMapAttr layoutMap) {
// DIMENSION attribute can only be applied to an intrinsic or record type
- if (eleTy.isa<BoxType, BoxCharType, BoxProcType, ShapeType, ShapeShiftType,
+ if (mlir::isa<BoxType, BoxCharType, BoxProcType, ShapeType, ShapeShiftType,
ShiftType, SliceType, FieldType, LenType, HeapType, PointerType,
- ReferenceType, TypeDescType, SequenceType>())
+ ReferenceType, TypeDescType, SequenceType>(eleTy))
return emitError() << "cannot build an array of this element type: "
<< eleTy << '\n';
return mlir::success();
@@ -1199,9 +1199,9 @@ void fir::TypeDescType::print(mlir::AsmPrinter &printer) const {
mlir::LogicalResult fir::TypeDescType::verify(
llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
mlir::Type eleTy) {
- if (eleTy.isa<BoxType, BoxCharType, BoxProcType, ShapeType, ShapeShiftType,
+ if (mlir::isa<BoxType, BoxCharType, BoxProcType, ShapeType, ShapeShiftType,
ShiftType, SliceType, FieldType, LenType, ReferenceType,
- TypeDescType>())
+ TypeDescType>(eleTy))
return emitError() << "cannot build a type descriptor of type: " << eleTy
<< '\n';
return mlir::success();
diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
index 601bf04ce5e9..c61179a7460e 100644
--- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
+++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
@@ -1318,11 +1318,9 @@ void SimplifyIntrinsicsPass::runOnOperation() {
// Support only floating point and integer arguments
// now (e.g. logical is skipped here).
- if (!arg1Type->isa<mlir::FloatType>() &&
- !arg1Type->isa<mlir::IntegerType>())
+ if (!mlir::isa<mlir::FloatType, mlir::IntegerType>(*arg1Type))
return;
- if (!arg2Type->isa<mlir::FloatType>() &&
- !arg2Type->isa<mlir::IntegerType>())
+ if (!mlir::isa<mlir::FloatType, mlir::IntegerType>(*arg2Type))
return;
auto typeGenerator = [&type](fir::FirOpBuilder &builder) {
diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt
index bdd0e07bbfd4..bc81e1b1887b 100644
--- a/flang/runtime/CMakeLists.txt
+++ b/flang/runtime/CMakeLists.txt
@@ -103,7 +103,6 @@ append(${NO_LTO_FLAGS} CMAKE_CXX_FLAGS)
add_definitions(-U_GLIBCXX_ASSERTIONS)
add_definitions(-U_LIBCPP_ENABLE_ASSERTIONS)
-add_subdirectory(FortranMain)
add_subdirectory(Float128Math)
set(sources
@@ -193,6 +192,7 @@ set(supported_files
environment.cpp
extrema.cpp
external-unit.cpp
+ file.cpp
findloc.cpp
format.cpp
inquiry.cpp
diff --git a/flang/runtime/FortranMain/CMakeLists.txt b/flang/runtime/FortranMain/CMakeLists.txt
deleted file mode 100644
index deb7bd10acf5..000000000000
--- a/flang/runtime/FortranMain/CMakeLists.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-add_flang_library(Fortran_main STATIC INSTALL_WITH_TOOLCHAIN
- Fortran_main.c
-)
-if (DEFINED MSVC)
- set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded)
- add_flang_library(Fortran_main.static STATIC INSTALL_WITH_TOOLCHAIN
- Fortran_main.c
- )
- set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDLL)
- add_flang_library(Fortran_main.dynamic STATIC INSTALL_WITH_TOOLCHAIN
- Fortran_main.c
- )
- set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDebug)
- add_flang_library(Fortran_main.static_dbg STATIC INSTALL_WITH_TOOLCHAIN
- Fortran_main.c
- )
- set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDebugDLL)
- add_flang_library(Fortran_main.dynamic_dbg STATIC INSTALL_WITH_TOOLCHAIN
- Fortran_main.c
- )
- add_dependencies(Fortran_main Fortran_main.static Fortran_main.dynamic
- Fortran_main.static_dbg Fortran_main.dynamic_dbg)
-endif()
diff --git a/flang/runtime/FortranMain/Fortran_main.c b/flang/runtime/FortranMain/Fortran_main.c
deleted file mode 100644
index 5d3eaced001e..000000000000
--- a/flang/runtime/FortranMain/Fortran_main.c
+++ /dev/null
@@ -1,23 +0,0 @@
-//===-- runtime/FortranMain/Fortran_main.c --------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "flang/Runtime/main.h"
-#include "flang/Runtime/stop.h"
-
-/* main entry into PROGRAM */
-void _QQmain(void);
-
-extern const struct EnvironmentDefaultList *_QQEnvironmentDefaults;
-
-/* C main stub */
-int main(int argc, const char *argv[], const char *envp[]) {
- RTNAME(ProgramStart)(argc, argv, envp, _QQEnvironmentDefaults);
- _QQmain();
- RTNAME(ProgramEndStatement)();
- return 0;
-}
diff --git a/flang/runtime/environment.cpp b/flang/runtime/environment.cpp
index b2c9665a28df..52b1d99ba536 100644
--- a/flang/runtime/environment.cpp
+++ b/flang/runtime/environment.cpp
@@ -23,9 +23,11 @@ extern char **environ;
namespace Fortran::runtime {
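+// With FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS set, these globals are expected to be defined elsewhere.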
+#ifndef FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
RT_OFFLOAD_VAR_GROUP_BEGIN
RT_VAR_ATTRS ExecutionEnvironment executionEnvironment;
RT_OFFLOAD_VAR_GROUP_END
+#endif // FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
static void SetEnvironmentDefaults(const EnvironmentDefaultList *envDefaults) {
if (!envDefaults) {
diff --git a/flang/runtime/file.cpp b/flang/runtime/file.cpp
index acd5d33d4bb8..79db17e70acd 100644
--- a/flang/runtime/file.cpp
+++ b/flang/runtime/file.cpp
@@ -457,22 +457,22 @@ std::int64_t SizeInBytes(const char *path) {
return -1;
}
#else // defined(RT_DEVICE_COMPILATION)
-bool IsATerminal(int fd) {
+RT_API_ATTRS bool IsATerminal(int fd) {
Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
}
-bool IsExtant(const char *path) {
+RT_API_ATTRS bool IsExtant(const char *path) {
Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
}
-bool MayRead(const char *path) {
+RT_API_ATTRS bool MayRead(const char *path) {
Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
}
-bool MayWrite(const char *path) {
+RT_API_ATTRS bool MayWrite(const char *path) {
Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
}
-bool MayReadAndWrite(const char *path) {
+RT_API_ATTRS bool MayReadAndWrite(const char *path) {
Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
}
-std::int64_t SizeInBytes(const char *path) {
+RT_API_ATTRS std::int64_t SizeInBytes(const char *path) {
Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
}
#endif // defined(RT_DEVICE_COMPILATION)
diff --git a/flang/runtime/namelist.cpp b/flang/runtime/namelist.cpp
index b9eed2101ecf..af092de70f78 100644
--- a/flang/runtime/namelist.cpp
+++ b/flang/runtime/namelist.cpp
@@ -596,7 +596,7 @@ bool IODEF(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
return true;
}
-bool IsNamelistNameOrSlash(IoStatementState &io) {
+RT_API_ATTRS bool IsNamelistNameOrSlash(IoStatementState &io) {
if (auto *listInput{
io.get_if<ListDirectedStatementState<Direction::Input>>()}) {
if (listInput->inNamelistSequence()) {
diff --git a/flang/runtime/unit.cpp b/flang/runtime/unit.cpp
index 0e38cffdf907..3b42f45d5588 100644
--- a/flang/runtime/unit.cpp
+++ b/flang/runtime/unit.cpp
@@ -19,11 +19,13 @@
namespace Fortran::runtime::io {
+#ifndef FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
RT_OFFLOAD_VAR_GROUP_BEGIN
RT_VAR_ATTRS ExternalFileUnit *defaultInput{nullptr}; // unit 5
RT_VAR_ATTRS ExternalFileUnit *defaultOutput{nullptr}; // unit 6
RT_VAR_ATTRS ExternalFileUnit *errorOutput{nullptr}; // unit 0 extension
RT_OFFLOAD_VAR_GROUP_END
+#endif // FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
RT_OFFLOAD_API_GROUP_BEGIN
diff --git a/flang/runtime/utf.cpp b/flang/runtime/utf.cpp
index 9945dc6509ec..f4b38d5225ce 100644
--- a/flang/runtime/utf.cpp
+++ b/flang/runtime/utf.cpp
@@ -10,6 +10,7 @@
namespace Fortran::runtime {
+#ifndef FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
// clang-format off
RT_OFFLOAD_VAR_GROUP_BEGIN
const RT_CONST_VAR_ATTRS std::uint8_t UTF8FirstByteTable[256]{
@@ -40,6 +41,7 @@ const RT_CONST_VAR_ATTRS std::uint8_t UTF8FirstByteTable[256]{
};
RT_OFFLOAD_VAR_GROUP_END
// clang-format on
+#endif // FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
RT_OFFLOAD_API_GROUP_BEGIN
// Non-minimal encodings are accepted.
diff --git a/flang/test/CMakeLists.txt b/flang/test/CMakeLists.txt
index 7d96a72e5f36..7e036ad539df 100644
--- a/flang/test/CMakeLists.txt
+++ b/flang/test/CMakeLists.txt
@@ -62,7 +62,6 @@ set(FLANG_TEST_DEPENDS
llvm-readobj
split-file
FortranRuntime
- Fortran_main
FortranDecimal
)
if (LLVM_ENABLE_PLUGINS AND NOT WIN32)
diff --git a/flang/test/Driver/bbc-mlir-pass-pipeline.f90 b/flang/test/Driver/bbc-mlir-pass-pipeline.f90
index 7a35e26dc478..caa86e66e62b 100644
--- a/flang/test/Driver/bbc-mlir-pass-pipeline.f90
+++ b/flang/test/Driver/bbc-mlir-pass-pipeline.f90
@@ -17,7 +17,7 @@ end program
! CHECK-NEXT: (S) 0 num-cse'd - Number of operations CSE'd
! CHECK-NEXT: (S) 0 num-dce'd - Number of operations DCE'd
-! CHECK-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+! CHECK-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
! CHECK-NEXT: 'fir.global' Pipeline
! CHECK-NEXT: CharacterConversion
! CHECK-NEXT: 'func.func' Pipeline
@@ -25,6 +25,8 @@ end program
! CHECK-NEXT: CharacterConversion
! CHECK-NEXT: 'omp.declare_reduction' Pipeline
! CHECK-NEXT: CharacterConversion
+! CHECK-NEXT: 'omp.private' Pipeline
+! CHECK-NEXT: CharacterConversion
! CHECK-NEXT: Canonicalizer
! CHECK-NEXT: SimplifyRegionLite
@@ -43,7 +45,7 @@ end program
! CHECK-NEXT: (S) 0 num-cse'd - Number of operations CSE'd
! CHECK-NEXT: (S) 0 num-dce'd - Number of operations DCE'd
-! CHECK-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+! CHECK-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
! CHECK-NEXT: 'fir.global' Pipeline
! CHECK-NEXT: CFGConversion
! CHECK-NEXT: 'func.func' Pipeline
@@ -51,6 +53,8 @@ end program
! CHECK-NEXT: CFGConversion
! CHECK-NEXT: 'omp.declare_reduction' Pipeline
! CHECK-NEXT: CFGConversion
+! CHECK-NEXT: 'omp.private' Pipeline
+! CHECK-NEXT: CFGConversion
! CHECK-NEXT: SCFToControlFlow
! CHECK-NEXT: Canonicalizer
diff --git a/flang/test/Driver/driver-help-hidden.f90 b/flang/test/Driver/driver-help-hidden.f90
new file mode 100644
index 000000000000..706b2cb6c245
--- /dev/null
+++ b/flang/test/Driver/driver-help-hidden.f90
@@ -0,0 +1,172 @@
+
+!--------------------------
+! FLANG DRIVER (flang-new)
+!--------------------------
+! RUN: %flang --help-hidden 2>&1 | FileCheck %s
+! RUN: not %flang -help-hidden 2>&1 | FileCheck %s --check-prefix=ERROR-FLANG
+
+!----------------------------------------
+! FLANG FRONTEND DRIVER (flang-new -fc1)
+!----------------------------------------
+! RUN: not %flang_fc1 --help-hidden 2>&1 | FileCheck %s --check-prefix=ERROR-FLANG-FC1
+! RUN: not %flang_fc1 -help-hidden 2>&1 | FileCheck %s --check-prefix=ERROR-FLANG-FC1
+
+! CHECK:USAGE: flang-new
+! CHECK-EMPTY:
+! CHECK-NEXT: DRIVER OPTIONS:
+! CHECK-NEXT: --driver-mode=<value> Set the driver mode to either 'gcc', 'g++', 'cpp', 'cl' or 'flang'
+! CHECK-EMPTY:
+! CHECK-NEXT:OPTIONS:
+! CHECK-NEXT: -### Print (but do not run) the commands to run for this compilation
+! CHECK-NEXT: -ccc-print-phases Dump list of actions to perform
+! CHECK-NEXT: -cpp Enable predefined and command line preprocessor macros
+! CHECK-NEXT: -c Only run preprocess, compile, and assemble steps
+! CHECK-NEXT: -dM Print macro definitions in -E mode instead of normal output
+! CHECK-NEXT: -dumpmachine Display the compiler's target processor
+! CHECK-NEXT: -dumpversion Display the version of the compiler
+! CHECK-NEXT: -D <macro>=<value> Define <macro> to <value> (or 1 if <value> omitted)
+! CHECK-NEXT: -emit-llvm Use the LLVM representation for assembler and object files
+! CHECK-NEXT: -E Only run the preprocessor
+! CHECK-NEXT: -falternative-parameter-statement
+! CHECK-NEXT: Enable the old style PARAMETER statement
+! CHECK-NEXT: -fapprox-func Allow certain math function calls to be replaced with an approximately equivalent calculation
+! CHECK-NEXT: -fbackslash Specify that backslash in string introduces an escape character
+! CHECK-NEXT: -fcolor-diagnostics Enable colors in diagnostics
+! CHECK-NEXT: -fconvert=<value> Set endian conversion of data for unformatted files
+! CHECK-NEXT: -fdefault-double-8 Set the default double precision kind to an 8 byte wide type
+! CHECK-NEXT: -fdefault-integer-8 Set the default integer and logical kind to an 8 byte wide type
+! CHECK-NEXT: -fdefault-real-8 Set the default real kind to an 8 byte wide type
+! CHECK-NEXT: -ffast-math Allow aggressive, lossy floating-point optimizations
+! CHECK-NEXT: -ffixed-form Process source files in fixed form
+! CHECK-NEXT: -ffixed-line-length=<value>
+! CHECK-NEXT: Use <value> as character line width in fixed mode
+! CHECK-NEXT: -ffp-contract=<value> Form fused FP ops (e.g. FMAs)
+! CHECK-NEXT: -ffree-form Process source files in free form
+! CHECK-NEXT: -fhonor-infinities Specify that floating-point optimizations are not allowed that assume arguments and results are not +-inf.
+! CHECK-NEXT: -fhonor-nans Specify that floating-point optimizations are not allowed that assume arguments and results are not NANs.
+! CHECK-NEXT: -fimplicit-none No implicit typing allowed unless overridden by IMPLICIT statements
+! CHECK-NEXT: -finput-charset=<value> Specify the default character set for source files
+! CHECK-NEXT: -fintegrated-as Enable the integrated assembler
+! CHECK-NEXT: -fintrinsic-modules-path <dir>
+! CHECK-NEXT: Specify where to find the compiled intrinsic modules
+! CHECK-NEXT: -flang-deprecated-no-hlfir
+! CHECK-NEXT: Do not use HLFIR lowering (deprecated)
+! CHECK-NEXT: -flang-experimental-hlfir
+! CHECK-NEXT: Use HLFIR lowering (experimental)
+! CHECK-NEXT: -flarge-sizes Use INTEGER(KIND=8) for the result type in size-related intrinsics
+! CHECK-NEXT: -flogical-abbreviations Enable logical abbreviations
+! CHECK-NEXT: -flto=auto Enable LTO in 'full' mode
+! CHECK-NEXT: -flto=jobserver Enable LTO in 'full' mode
+! CHECK-NEXT: -flto=<value> Set LTO mode
+! CHECK-NEXT: -flto Enable LTO in 'full' mode
+! CHECK-NEXT: -fms-runtime-lib=<value>
+! CHECK-NEXT: Select Windows run-time library
+! CHECK-NEXT: -fno-automatic Implies the SAVE attribute for non-automatic local objects in subprograms unless RECURSIVE
+! CHECK-NEXT: -fno-color-diagnostics Disable colors in diagnostics
+! CHECK-NEXT: -fno-integrated-as Disable the integrated assembler
+! CHECK-NEXT: -fno-lto Disable LTO mode (default)
+! CHECK-NEXT: -fno-ppc-native-vector-element-order
+! CHECK-NEXT: Specifies PowerPC non-native vector element order
+! CHECK-NEXT: -fno-rtlib-add-rpath Do not add -rpath with architecture-specific resource directory to the linker flags. When --hip-link is specified, do not add -rpath with HIP runtime library directory to the linker flags
+! CHECK-NEXT: -fno-signed-zeros Allow optimizations that ignore the sign of floating point zeros
+! CHECK-NEXT: -fno-stack-arrays Allocate array temporaries on the heap (default)
+! CHECK-NEXT: -fno-version-loops-for-stride
+! CHECK-NEXT: Do not create unit-strided loops (default)
+! CHECK-NEXT: -fomit-frame-pointer Omit the frame pointer from functions that don't need it. Some stack unwinding cases, such as profilers and sanitizers, may prefer specifying -fno-omit-frame-pointer. On many targets, -O1 and higher omit the frame pointer by default. -m[no-]omit-leaf-frame-pointer takes precedence for leaf functions
+! CHECK-NEXT: -fopenacc Enable OpenACC
+! CHECK-NEXT: -fopenmp-assume-no-nested-parallelism
+! CHECK-NEXT: Assert no nested parallel regions in the GPU
+! CHECK-NEXT: -fopenmp-assume-no-thread-state
+! CHECK-NEXT: Assert no thread in a parallel region modifies an ICV
+! CHECK-NEXT: -fopenmp-target-debug Enable debugging in the OpenMP offloading device RTL
+! CHECK-NEXT: -fopenmp-targets=<value>
+! CHECK-NEXT: Specify comma-separated list of triples OpenMP offloading targets to be supported
+! CHECK-NEXT: -fopenmp-version=<value>
+! CHECK-NEXT: Set OpenMP version (e.g. 45 for OpenMP 4.5, 51 for OpenMP 5.1). Default value is 11 for Flang
+! CHECK-NEXT: -fopenmp Parse OpenMP pragmas and generate parallel code.
+! CHECK-NEXT: -foptimization-record-file=<file>
+! CHECK-NEXT: Specify the output name of the file containing the optimization remarks. Implies -fsave-optimization-record. On Darwin platforms, this cannot be used with multiple -arch <arch> options.
+! CHECK-NEXT: -foptimization-record-passes=<regex>
+! CHECK-NEXT: Only include passes which match a specified regular expression in the generated optimization record (by default, include all passes)
+! CHECK-NEXT: -fpass-plugin=<dsopath> Load pass plugin from a dynamic shared object file (only with new pass manager).
+! CHECK-NEXT: -fppc-native-vector-element-order
+! CHECK-NEXT: Specifies PowerPC native vector element order (default)
+! CHECK-NEXT: -freciprocal-math Allow division operations to be reassociated
+! CHECK-NEXT: -fropi Generate read-only position independent code (ARM only)
+! CHECK-NEXT: -frtlib-add-rpath Add -rpath with architecture-specific resource directory to the linker flags. When --hip-link is specified, also add -rpath with HIP runtime library directory to the linker flags
+! CHECK-NEXT: -frwpi Generate read-write position independent code (ARM only)
+! CHECK-NEXT: -fsave-optimization-record=<format>
+! CHECK-NEXT: Generate an optimization record file in a specific format
+! CHECK-NEXT: -fsave-optimization-record
+! CHECK-NEXT: Generate a YAML optimization record file
+! CHECK-NEXT: -fstack-arrays Attempt to allocate array temporaries on the stack, no matter their size
+! CHECK-NEXT: -fsyntax-only Run the preprocessor, parser and semantic analysis stages
+! CHECK-NEXT: -funderscoring Appends one trailing underscore to external names
+! CHECK-NEXT: -fveclib=<value> Use the given vector functions library
+! CHECK-NEXT: -fversion-loops-for-stride
+! CHECK-NEXT: Create unit-strided versions of loops
+! CHECK-NEXT: -fxor-operator Enable .XOR. as a synonym of .NEQV.
+! CHECK-NEXT: --gcc-install-dir=<value>
+! CHECK-NEXT: Use GCC installation in the specified directory. The directory ends with path components like 'lib{,32,64}/gcc{,-cross}/$triple/$version'. Note: executables (e.g. ld) used by the compiler are not overridden by the selected GCC installation
+! CHECK-NEXT: --gcc-toolchain=<value> Specify a directory where Flang can find 'lib{,32,64}/gcc{,-cross}/$triple/$version'. Flang will use the GCC installation with the largest version
+! CHECK-NEXT: -gline-directives-only Emit debug line info directives only
+! CHECK-NEXT: -gline-tables-only Emit debug line number tables only
+! CHECK-NEXT: -gpulibc Link the LLVM C Library for GPUs
+! CHECK-NEXT: -g Generate source-level debug information
+! CHECK-NEXT: --help-hidden Display help for hidden options
+! CHECK-NEXT: -help Display available options
+! CHECK-NEXT: -isysroot <dir> Set the system root directory (usually /)
+! CHECK-NEXT: -I <dir> Add directory to the end of the list of include search paths
+! CHECK-NEXT: -L <dir> Add directory to library search path
+! CHECK-NEXT: -march=<value> For a list of available architectures for the target use '-mcpu=help'
+! CHECK-NEXT: -mcode-object-version=<value>
+! CHECK-NEXT: Specify code object ABI version. Defaults to 5. (AMDGPU only)
+! CHECK-NEXT: -mcpu=<value> For a list of available CPUs for the target use '-mcpu=help'
+! CHECK-NEXT: -mllvm=<arg> Alias for -mllvm
+! CHECK-NEXT: -mllvm <value> Additional arguments to forward to LLVM's option processing
+! CHECK-NEXT: -mmlir <value> Additional arguments to forward to MLIR's option processing
+! CHECK-NEXT: -mno-outline-atomics Don't generate local calls to out-of-line atomic operations
+! CHECK-NEXT: -module-dir <dir> Put MODULE files in <dir>
+! CHECK-NEXT: -moutline-atomics Generate local calls to out-of-line atomic operations
+! CHECK-NEXT: -mrvv-vector-bits=<value>
+! CHECK-NEXT: Specify the size in bits of an RVV vector register
+! CHECK-NEXT: -msve-vector-bits=<value>
+! CHECK-NEXT: Specify the size in bits of an SVE vector register. Defaults to the vector length agnostic value of "scalable". (AArch64 only)
+! CHECK-NEXT: --no-offload-arch=<value>
+! CHECK-NEXT: Remove CUDA/HIP offloading device architecture (e.g. sm_35, gfx906) from the list of devices to compile for. 'all' resets the list to its default value.
+! CHECK-NEXT: -nocpp Disable predefined and command line preprocessor macros
+! CHECK-NEXT: -nogpulib Do not link device library for CUDA/HIP device compilation
+! CHECK-NEXT: --offload-arch=<value> Specify an offloading device architecture for CUDA, HIP, or OpenMP. (e.g. sm_35). If 'native' is used the compiler will detect locally installed architectures. For HIP offloading, the device architecture can be followed by target ID features delimited by a colon (e.g. gfx908:xnack+:sramecc-). May be specified more than once.
+! CHECK-NEXT: --offload-device-only Only compile for the offloading device.
+! CHECK-NEXT: --offload-host-device Compile for both the offloading host and device (default).
+! CHECK-NEXT: --offload-host-only Only compile for the offloading host.
+! CHECK-NEXT: -o <file> Write output to <file>
+! CHECK-NEXT: -pedantic Warn on language extensions
+! CHECK-NEXT: -print-effective-triple Print the effective target triple
+! CHECK-NEXT: -print-target-triple Print the normalized target triple
+! CHECK-NEXT: -pthread Support POSIX threads in generated code
+! CHECK-NEXT: -P Disable linemarker output in -E mode
+! CHECK-NEXT: -resource-dir <value> The directory which holds the compiler resource files
+! CHECK-NEXT: --rocm-path=<value> ROCm installation path, used for finding and automatically linking required bitcode libraries.
+! CHECK-NEXT: -Rpass-analysis=<value> Report transformation analysis from optimization passes whose name matches the given POSIX regular expression
+! CHECK-NEXT: -Rpass-missed=<value> Report missed transformations by optimization passes whose name matches the given POSIX regular expression
+! CHECK-NEXT: -Rpass=<value> Report transformations performed by optimization passes whose name matches the given POSIX regular expression
+! CHECK-NEXT: -R<remark> Enable the specified remark
+! CHECK-NEXT: -save-temps=<value> Save intermediate compilation results.
+! CHECK-NEXT: -save-temps Alias for --save-temps=cwd
+! CHECK-NEXT: -std=<value> Language standard to compile for
+! CHECK-NEXT: -S Only run preprocess and compilation steps
+! CHECK-NEXT: --target=<value> Generate code for the given target
+! CHECK-NEXT: -U <macro> Undefine macro <macro>
+! CHECK-NEXT: --version Print version information
+! CHECK-NEXT: -v Show commands to run and use verbose output
+! CHECK-NEXT: -Wl,<arg> Pass the comma separated arguments in <arg> to the linker
+! CHECK-NEXT: -W<warning> Enable the specified warning
+! CHECK-NEXT: -Xflang <arg> Pass <arg> to the flang compiler
+! CHECK-NEXT: -x <language> Treat subsequent input files as having type <language>
+
+
+! ERROR-FLANG: error: unknown argument '-help-hidden'; did you mean '--help-hidden'?
+
+! Frontend driver -help-hidden is not supported
+! ERROR-FLANG-FC1: error: unknown argument: '{{.*}}'
diff --git a/flang/test/Driver/dynamic-linker.f90 b/flang/test/Driver/dynamic-linker.f90
index 7c3f1b5a53fe..6d5c443ab75c 100644
--- a/flang/test/Driver/dynamic-linker.f90
+++ b/flang/test/Driver/dynamic-linker.f90
@@ -16,7 +16,6 @@
! GNU-LINKER-OPTIONS-SAME: "-shared"
! GNU-LINKER-OPTIONS-SAME: "-static"
! GNU-LINKER-OPTIONS-SAME: "-rpath" "/path/to/dir"
-! GNU-LINKER-OPTIONS-NOT: "-lFortran_main.a"
! RDYNAMIC-LINKER-OPTION: "{{.*}}ld"
! RDYNAMIC-LINKER-OPTION-SAME: "-export-dynamic"
@@ -25,4 +24,3 @@
! MSVC-LINKER-OPTIONS: "{{.*}}link{{(.exe)?}}"
! MSVC-LINKER-OPTIONS-SAME: "-dll"
! MSVC-LINKER-OPTIONS-SAME: "-rpath" "/path/to/dir"
-! MSVC-LINKER-OPTIONS-NOT: "/WHOLEARCHIVE:Fortran_main"
diff --git a/flang/test/Driver/emit-mlir.f90 b/flang/test/Driver/emit-mlir.f90
index 191ee13396ef..83bb8fc1eddc 100644
--- a/flang/test/Driver/emit-mlir.f90
+++ b/flang/test/Driver/emit-mlir.f90
@@ -19,6 +19,16 @@
! CHECK-NEXT: %[[VAL_0:.*]] = fir.zero_bits !fir.ref<tuple<i[[int_size]], !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>
! CHECK-NEXT: fir.has_value %[[VAL_0]] : !fir.ref<tuple<i[[int_size]], !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>
! CHECK-NEXT: }
+! CHECK-NEXT: func.func private @_FortranAProgramStart(i32, !llvm.ptr, !llvm.ptr, !llvm.ptr)
+! CHECK-NEXT: func.func private @_FortranAProgramEndStatement()
+! CHECK-NEXT: func.func @main(%arg0: i32, %arg1: !llvm.ptr, %arg2: !llvm.ptr) -> i32 {
+! CHECK-NEXT: %c0_i32 = arith.constant 0 : i32
+! CHECK-NEXT: %0 = fir.address_of(@_QQEnvironmentDefaults) : !fir.ref<tuple<i32, !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>
+! CHECK-NEXT: fir.call @_FortranAProgramStart(%arg0, %arg1, %arg2, %0) {{.*}} : (i32, !llvm.ptr, !llvm.ptr, !fir.ref<tuple<i32, !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>)
+! CHECK-NEXT: fir.call @_QQmain() fastmath<contract> : () -> ()
+! CHECK-NEXT: fir.call @_FortranAProgramEndStatement() {{.*}} : () -> ()
+! CHECK-NEXT: return %c0_i32 : i32
+! CHECK-NEXT: }
! CHECK-NEXT: }
end program
diff --git a/flang/test/Driver/linker-flags.f90 b/flang/test/Driver/linker-flags.f90
index 4d3d528b5e99..02e217494f81 100644
--- a/flang/test/Driver/linker-flags.f90
+++ b/flang/test/Driver/linker-flags.f90
@@ -11,7 +11,6 @@
! RUN: %flang -### --target=x86_64-unknown-dragonfly %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,UNIX-F128%f128-lib
! RUN: %flang -### --target=x86_64-unknown-haiku %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,HAIKU,HAIKU-F128%f128-lib
! RUN: %flang -### --target=x86_64-windows-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,MINGW,MINGW-F128%f128-lib
-! RUN: %flang -### --target=aarch64-unknown-linux-gnu %S/Inputs/hello.f90 -lFortran_main 2>&1 | FileCheck %s --check-prefixes=DEPRECATED
! NOTE: Clang's driver library, clangDriver, usually adds 'oldnames' on Windows,
! but it is not needed when compiling Fortran code and they might bring in
@@ -29,7 +28,6 @@
! executable and may find the GNU linker from MinGW or Cygwin.
! UNIX-LABEL: "{{.*}}ld{{(\.exe)?}}"
! UNIX-SAME: "[[object_file]]"
-! UNIX-SAME: "--whole-archive" "-lFortran_main" "--no-whole-archive"
! UNIX-F128NONE-NOT: FortranFloat128Math
! SOLARIS-F128NONE-NOT: FortranFloat128Math
! UNIX-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed"
@@ -38,7 +36,6 @@
! DARWIN-LABEL: "{{.*}}ld{{(\.exe)?}}"
! DARWIN-SAME: "[[object_file]]"
-! DARWIN-SAME: -lFortran_main
! DARWIN-F128NONE-NOT: FortranFloat128Math
! DARWIN-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed"
! DARWIN-SAME: -lFortranRuntime
@@ -46,14 +43,12 @@
! HAIKU-LABEL: "{{.*}}ld{{(\.exe)?}}"
! HAIKU-SAME: "[[object_file]]"
-! HAIKU-SAME: "--whole-archive" "-lFortran_main" "--no-whole-archive"
! HAIKU-F128NONE-NOT: FortranFloat128Math
! HAIKU-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed"
! HAIKU-SAME: "-lFortranRuntime" "-lFortranDecimal"
! MINGW-LABEL: "{{.*}}ld{{(\.exe)?}}"
! MINGW-SAME: "[[object_file]]"
-! MINGW-SAME: -lFortran_main
! MINGW-F128NONE-NOT: FortranFloat128Math
! MINGW-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed"
! MINGW-SAME: -lFortranRuntime
@@ -66,6 +61,3 @@
! MSVC-LABEL: link
! MSVC-SAME: /subsystem:console
! MSVC-SAME: "[[object_file]]"
-
-! Check that we warn when using -lFortran_main
-! DEPRECATED: warning: argument '-lFortran_main' is deprecated, see the Flang driver documentation for correct usage [-Wdeprecated]
diff --git a/flang/test/Driver/mlir-debug-pass-pipeline.f90 b/flang/test/Driver/mlir-debug-pass-pipeline.f90
index 28d70bc15264..2c81441e7ec9 100644
--- a/flang/test/Driver/mlir-debug-pass-pipeline.f90
+++ b/flang/test/Driver/mlir-debug-pass-pipeline.f90
@@ -39,7 +39,7 @@ end program
! ALL-NEXT: (S) 0 num-cse'd - Number of operations CSE'd
! ALL-NEXT: (S) 0 num-dce'd - Number of operations DCE'd
-! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
! ALL-NEXT: 'fir.global' Pipeline
! ALL-NEXT: CharacterConversion
! ALL-NEXT: 'func.func' Pipeline
@@ -47,6 +47,8 @@ end program
! ALL-NEXT: CharacterConversion
! ALL-NEXT: 'omp.declare_reduction' Pipeline
! ALL-NEXT: CharacterConversion
+! ALL-NEXT: 'omp.private' Pipeline
+! ALL-NEXT: CharacterConversion
! ALL-NEXT: Canonicalizer
! ALL-NEXT: SimplifyRegionLite
@@ -63,7 +65,7 @@ end program
! ALL-NEXT: (S) 0 num-cse'd - Number of operations CSE'd
! ALL-NEXT: (S) 0 num-dce'd - Number of operations DCE'd
-! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
! ALL-NEXT: 'fir.global' Pipeline
! ALL-NEXT: CFGConversion
! ALL-NEXT: 'func.func' Pipeline
@@ -71,6 +73,8 @@ end program
! ALL-NEXT: CFGConversion
! ALL-NEXT: 'omp.declare_reduction' Pipeline
! ALL-NEXT: CFGConversion
+! ALL-NEXT: 'omp.private' Pipeline
+! ALL-NEXT: CFGConversion
! ALL-NEXT: SCFToControlFlow
! ALL-NEXT: Canonicalizer
! ALL-NEXT: SimplifyRegionLite
@@ -79,13 +83,15 @@ end program
! ALL-NEXT: (S) 0 num-dce'd - Number of operations DCE'd
! ALL-NEXT: BoxedProcedurePass
-! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
! ALL-NEXT: 'fir.global' Pipeline
! ALL-NEXT: AbstractResultOpt
! ALL-NEXT: 'func.func' Pipeline
! ALL-NEXT: AbstractResultOpt
! ALL-NEXT: 'omp.declare_reduction' Pipeline
! ALL-NEXT: AbstractResultOpt
+! ALL-NEXT: 'omp.private' Pipeline
+! ALL-NEXT: AbstractResultOpt
! ALL-NEXT: CodeGenRewrite
! ALL-NEXT: (S) 0 num-dce'd - Number of operations eliminated
diff --git a/flang/test/Driver/mlir-pass-pipeline.f90 b/flang/test/Driver/mlir-pass-pipeline.f90
index 41f3c203e435..320467a2ac2a 100644
--- a/flang/test/Driver/mlir-pass-pipeline.f90
+++ b/flang/test/Driver/mlir-pass-pipeline.f90
@@ -28,7 +28,7 @@ end program
! ALL-NEXT: (S) 0 num-cse'd - Number of operations CSE'd
! ALL-NEXT: (S) 0 num-dce'd - Number of operations DCE'd
-! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
! ALL-NEXT: 'fir.global' Pipeline
! ALL-NEXT: CharacterConversion
! ALL-NEXT: 'func.func' Pipeline
@@ -36,6 +36,8 @@ end program
! ALL-NEXT: CharacterConversion
! ALL-NEXT: 'omp.declare_reduction' Pipeline
! ALL-NEXT: CharacterConversion
+! ALL-NEXT: 'omp.private' Pipeline
+! ALL-NEXT: CharacterConversion
! ALL-NEXT: Canonicalizer
! ALL-NEXT: SimplifyRegionLite
@@ -57,7 +59,7 @@ end program
! O2-NEXT: 'func.func' Pipeline
! O2-NEXT: PolymorphicOpConversion
! O2-NEXT: AddAliasTags
-! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
! ALL-NEXT: 'fir.global' Pipeline
! ALL-NEXT: CFGConversion
! ALL-NEXT: 'func.func' Pipeline
@@ -65,6 +67,8 @@ end program
! ALL-NEXT: CFGConversion
! ALL-NEXT: 'omp.declare_reduction' Pipeline
! ALL-NEXT: CFGConversion
+! ALL-NEXT: 'omp.private' Pipeline
+! ALL-NEXT: CFGConversion
! ALL-NEXT: SCFToControlFlow
! ALL-NEXT: Canonicalizer
@@ -74,13 +78,15 @@ end program
! ALL-NEXT: (S) 0 num-dce'd - Number of operations DCE'd
! ALL-NEXT: BoxedProcedurePass
-! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
! ALL-NEXT: 'fir.global' Pipeline
! ALL-NEXT: AbstractResultOpt
! ALL-NEXT: 'func.func' Pipeline
! ALL-NEXT: AbstractResultOpt
! ALL-NEXT: 'omp.declare_reduction' Pipeline
! ALL-NEXT: AbstractResultOpt
+! ALL-NEXT: 'omp.private' Pipeline
+! ALL-NEXT: AbstractResultOpt
! ALL-NEXT: CodeGenRewrite
! ALL-NEXT: (S) 0 num-dce'd - Number of operations eliminated
diff --git a/flang/test/Driver/msvc-dependent-lib-flags.f90 b/flang/test/Driver/msvc-dependent-lib-flags.f90
index 6cfc969e92b2..765917f07d8e 100644
--- a/flang/test/Driver/msvc-dependent-lib-flags.f90
+++ b/flang/test/Driver/msvc-dependent-lib-flags.f90
@@ -7,7 +7,6 @@
! MSVC-SAME: --dependent-lib=clang_rt.builtins.lib
! MSVC-SAME: -D_MT
! MSVC-SAME: --dependent-lib=libcmt
-! MSVC-SAME: --dependent-lib=Fortran_main.static.lib
! MSVC-SAME: --dependent-lib=FortranRuntime.static.lib
! MSVC-SAME: --dependent-lib=FortranDecimal.static.lib
@@ -16,7 +15,6 @@
! MSVC-DEBUG-SAME: -D_MT
! MSVC-DEBUG-SAME: -D_DEBUG
! MSVC-DEBUG-SAME: --dependent-lib=libcmtd
-! MSVC-DEBUG-SAME: --dependent-lib=Fortran_main.static_dbg.lib
! MSVC-DEBUG-SAME: --dependent-lib=FortranRuntime.static_dbg.lib
! MSVC-DEBUG-SAME: --dependent-lib=FortranDecimal.static_dbg.lib
@@ -25,7 +23,6 @@
! MSVC-DLL-SAME: -D_MT
! MSVC-DLL-SAME: -D_DLL
! MSVC-DLL-SAME: --dependent-lib=msvcrt
-! MSVC-DLL-SAME: --dependent-lib=Fortran_main.dynamic.lib
! MSVC-DLL-SAME: --dependent-lib=FortranRuntime.dynamic.lib
! MSVC-DLL-SAME: --dependent-lib=FortranDecimal.dynamic.lib
@@ -35,6 +32,5 @@
! MSVC-DLL-DEBUG-SAME: -D_DEBUG
! MSVC-DLL-DEBUG-SAME: -D_DLL
! MSVC-DLL-DEBUG-SAME: --dependent-lib=msvcrtd
-! MSVC-DLL-DEBUG-SAME: --dependent-lib=Fortran_main.dynamic_dbg.lib
! MSVC-DLL-DEBUG-SAME: --dependent-lib=FortranRuntime.dynamic_dbg.lib
! MSVC-DLL-DEBUG-SAME: --dependent-lib=FortranDecimal.dynamic_dbg.lib
diff --git a/flang/test/Driver/no-duplicate-main.f90 b/flang/test/Driver/no-duplicate-main.f90
index 88f4430828e0..b0bb6c2a2fef 100644
--- a/flang/test/Driver/no-duplicate-main.f90
+++ b/flang/test/Driver/no-duplicate-main.f90
@@ -4,8 +4,6 @@
! RUN: %flang -o %t -c %s
! RUN: not %flang -o %t.exe %t %t.c-object 2>&1
-! RUN: %flang -fno-fortran-main -o %t.exe %t %t.c-object 2>&1
-
! TODO: potentially add further checks to ensure that proper
! linker error messages are detected and checked via
! FileCheck.
diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir
index 7508963a3d51..d54b0895cc33 100644
--- a/flang/test/Fir/basic-program.fir
+++ b/flang/test/Fir/basic-program.fir
@@ -34,7 +34,7 @@ func.func @_QQmain() {
// PASSES-NEXT: (S) 0 num-cse'd - Number of operations CSE'd
// PASSES-NEXT: (S) 0 num-dce'd - Number of operations DCE'd
-// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
// PASSES-NEXT: 'fir.global' Pipeline
// PASSES-NEXT: CharacterConversion
// PASSES-NEXT: 'func.func' Pipeline
@@ -42,6 +42,8 @@ func.func @_QQmain() {
// PASSES-NEXT: CharacterConversion
// PASSES-NEXT: 'omp.declare_reduction' Pipeline
// PASSES-NEXT: CharacterConversion
+// PASSES-NEXT: 'omp.private' Pipeline
+// PASSES-NEXT: CharacterConversion
// PASSES-NEXT: Canonicalizer
// PASSES-NEXT: SimplifyRegionLite
@@ -65,13 +67,15 @@ func.func @_QQmain() {
// PASSES-NEXT: AddAliasTags
-// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
// PASSES-NEXT: 'fir.global' Pipeline
// PASSES-NEXT: CFGConversion
// PASSES-NEXT: 'func.func' Pipeline
// PASSES-NEXT: CFGConversion
// PASSES-NEXT: 'omp.declare_reduction' Pipeline
// PASSES-NEXT: CFGConversion
+// PASSES-NEXT: 'omp.private' Pipeline
+// PASSES-NEXT: CFGConversion
// PASSES-NEXT: SCFToControlFlow
// PASSES-NEXT: Canonicalizer
@@ -81,13 +85,15 @@ func.func @_QQmain() {
// PASSES-NEXT: (S) 0 num-dce'd - Number of operations DCE'd
// PASSES-NEXT: BoxedProcedurePass
-// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
-// PASSES-NEXT: 'fir.global' Pipeline
+// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
+// PASSES-NEXT: 'fir.global' Pipeline
// PASSES-NEXT: AbstractResultOpt
// PASSES-NEXT: 'func.func' Pipeline
// PASSES-NEXT: AbstractResultOpt
// PASSES-NEXT: 'omp.declare_reduction' Pipeline
// PASSES-NEXT: AbstractResultOpt
+// PASSES-NEXT: 'omp.private' Pipeline
+// PASSES-NEXT: AbstractResultOpt
// PASSES-NEXT: CodeGenRewrite
// PASSES-NEXT: (S) 0 num-dce'd - Number of operations eliminated
diff --git a/flang/test/Lower/OpenMP/cfg-conversion-omp.private.f90 b/flang/test/Lower/OpenMP/cfg-conversion-omp.private.f90
new file mode 100644
index 000000000000..7f1087a7ebe3
--- /dev/null
+++ b/flang/test/Lower/OpenMP/cfg-conversion-omp.private.f90
@@ -0,0 +1,54 @@
+! Tests that CFG & LLVM conversion is applied to `omp.private` ops.
+
+! RUN: split-file %s %t && cd %t
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN: -o - test.f90 2>&1 | \
+! RUN: fir-opt --cfg-conversion -o test.cfg-conv.mlir
+! RUN: FileCheck --input-file=test.cfg-conv.mlir %s --check-prefix="CFGConv"
+
+! RUN: fir-opt --convert-hlfir-to-fir --cg-rewrite --fir-to-llvm-ir test.cfg-conv.mlir -o - | \
+! RUN: FileCheck %s --check-prefix="LLVMDialect"
+
+!--- test.f90
+subroutine delayed_privatization_allocatable
+ implicit none
+ integer, allocatable :: var1
+
+!$omp parallel private(var1)
+ var1 = 10
+!$omp end parallel
+end subroutine
+
+! CFGConv-LABEL: omp.private {type = private}
+! CFGConv-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]] alloc {
+
+! CFGConv-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+
+! CFGConv-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.box<!fir.heap<i32>> {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_allocatableEvar1"}
+
+! CFGConv-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : !fir.ref<!fir.box<!fir.heap<i32>>>
+! CFGConv-NEXT: %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
+! CFGConv-NEXT: %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> i64
+! CFGConv-NEXT: %[[C0:.*]] = arith.constant 0 : i64
+! CFGConv-NEXT: %[[ALLOC_COND:.*]] = arith.cmpi ne, %[[PRIV_ARG_ADDR]], %[[C0]] : i64
+! CFGConv-NEXT: cf.cond_br %[[ALLOC_COND]], ^[[ALLOC_MEM_BB:.*]], ^[[ZERO_MEM_BB:.*]]
+! CFGConv-NEXT: ^[[ALLOC_MEM_BB]]:
+! CFGConv-NEXT: fir.allocmem
+! CFGConv: cf.br ^[[DECL_BB:.*]]
+! CFGConv: ^[[ZERO_MEM_BB]]:
+! CFGConv-NEXT: fir.zero_bits
+! CFGConv: cf.br ^[[DECL_BB:.*]]
+! CFGConv-NEXT: ^[[DECL_BB]]:
+! CFGConv-NEXT: hlfir.declare
+! CFGConv-NEXT: omp.yield
+
+
+! LLVMDialect-LABEL: omp.private {type = private}
+! LLVMDialect-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!llvm.ptr]] alloc {
+
+! LLVMDialect-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+! LLVMDialect: llvm.alloca
+! LLVMDialect: llvm.call @malloc
+
+! LLVMDialect-NOT: hlfir.declare
diff --git a/flang/test/Lower/OpenMP/threadprivate-real-logical-complex-derivedtype.f90 b/flang/test/Lower/OpenMP/threadprivate-real-logical-complex-derivedtype.f90
index 55f806962a60..0a249ff101a0 100644
--- a/flang/test/Lower/OpenMP/threadprivate-real-logical-complex-derivedtype.f90
+++ b/flang/test/Lower/OpenMP/threadprivate-real-logical-complex-derivedtype.f90
@@ -21,6 +21,7 @@ module test
!CHECK-DAG: fir.global @_QMtestEz : !fir.logical<4> {
contains
+!CHECK-LABEL: func.func @_QMtestPsub
subroutine sub()
!CHECK-DAG: %[[T:.*]] = fir.address_of(@_QMtestEt) : !fir.ref<!fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}>>
!CHECK-DAG: %[[T_DECL:.*]]:2 = hlfir.declare %[[T]] {uniq_name = "_QMtestEt"} : (!fir.ref<!fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}>>) -> (!fir.ref<!fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}>>, !fir.ref<!fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}>>)
diff --git a/flang/tools/flang-driver/CMakeLists.txt b/flang/tools/flang-driver/CMakeLists.txt
index 3ce8b407450d..ce30ecff028d 100644
--- a/flang/tools/flang-driver/CMakeLists.txt
+++ b/flang/tools/flang-driver/CMakeLists.txt
@@ -21,7 +21,6 @@ add_flang_tool(flang-new
# unable to generate executables.
FortranRuntime
FortranDecimal
- Fortran_main
)
target_link_libraries(flang-new
diff --git a/libcxx/include/__mutex/unique_lock.h b/libcxx/include/__mutex/unique_lock.h
index c27ce4b24c1a..4a616ba51ee1 100644
--- a/libcxx/include/__mutex/unique_lock.h
+++ b/libcxx/include/__mutex/unique_lock.h
@@ -36,26 +36,28 @@ private:
bool __owns_;
public:
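+  // [[nodiscard]] on lock constructors is a libc++ extension; a discarded temporary releases the lock immediately.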
- _LIBCPP_HIDE_FROM_ABI unique_lock() _NOEXCEPT : __m_(nullptr), __owns_(false) {}
- _LIBCPP_HIDE_FROM_ABI explicit unique_lock(mutex_type& __m) : __m_(std::addressof(__m)), __owns_(true) {
+ _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI unique_lock() _NOEXCEPT : __m_(nullptr), __owns_(false) {}
+ _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI explicit unique_lock(mutex_type& __m)
+ : __m_(std::addressof(__m)), __owns_(true) {
__m_->lock();
}
- _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, defer_lock_t) _NOEXCEPT
+ _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, defer_lock_t) _NOEXCEPT
: __m_(std::addressof(__m)),
__owns_(false) {}
- _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, try_to_lock_t)
+ _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, try_to_lock_t)
: __m_(std::addressof(__m)), __owns_(__m.try_lock()) {}
- _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, adopt_lock_t) : __m_(std::addressof(__m)), __owns_(true) {}
+ _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, adopt_lock_t)
+ : __m_(std::addressof(__m)), __owns_(true) {}
template <class _Clock, class _Duration>
- _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, const chrono::time_point<_Clock, _Duration>& __t)
+ _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, const chrono::time_point<_Clock, _Duration>& __t)
: __m_(std::addressof(__m)), __owns_(__m.try_lock_until(__t)) {}
template <class _Rep, class _Period>
- _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, const chrono::duration<_Rep, _Period>& __d)
+ _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI unique_lock(mutex_type& __m, const chrono::duration<_Rep, _Period>& __d)
: __m_(std::addressof(__m)), __owns_(__m.try_lock_for(__d)) {}
_LIBCPP_HIDE_FROM_ABI ~unique_lock() {
@@ -66,7 +68,9 @@ public:
unique_lock(unique_lock const&) = delete;
unique_lock& operator=(unique_lock const&) = delete;
- _LIBCPP_HIDE_FROM_ABI unique_lock(unique_lock&& __u) _NOEXCEPT : __m_(__u.__m_), __owns_(__u.__owns_) {
+ _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI unique_lock(unique_lock&& __u) _NOEXCEPT
+ : __m_(__u.__m_),
+ __owns_(__u.__owns_) {
__u.__m_ = nullptr;
__u.__owns_ = false;
}
diff --git a/libcxx/include/__string/constexpr_c_functions.h b/libcxx/include/__string/constexpr_c_functions.h
index 72c6ce69b60b..4da8542e3807 100644
--- a/libcxx/include/__string/constexpr_c_functions.h
+++ b/libcxx/include/__string/constexpr_c_functions.h
@@ -224,7 +224,7 @@ __constexpr_memmove(_Tp* __dest, _Up* __src, __element_count __n) {
std::__assign_trivially_copyable(__dest[__i], __src[__i]);
}
} else if (__count > 0) {
- ::__builtin_memmove(__dest, __src, (__count - 1) * sizeof(_Tp) + __libcpp_datasizeof<_Tp>::value);
+ ::__builtin_memmove(__dest, __src, (__count - 1) * sizeof(_Tp) + __datasizeof_v<_Tp>);
}
return __dest;
}
diff --git a/libcxx/include/__type_traits/datasizeof.h b/libcxx/include/__type_traits/datasizeof.h
index 3a8b15160107..54fde242ebcd 100644
--- a/libcxx/include/__type_traits/datasizeof.h
+++ b/libcxx/include/__type_traits/datasizeof.h
@@ -26,39 +26,38 @@
_LIBCPP_BEGIN_NAMESPACE_STD
-template <class _Tp>
-struct __libcpp_datasizeof {
#if __has_extension(datasizeof)
- static const size_t value = __datasizeof(_Tp);
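+// Prefer the compiler's __datasizeof extension when it is available.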
+template <class _Tp>
+inline const size_t __datasizeof_v = __datasizeof(_Tp);
#else
// NOLINTNEXTLINE(readability-redundant-preprocessor) This is https://llvm.org/PR64825
# if __has_cpp_attribute(__no_unique_address__)
- template <class = char>
- struct _FirstPaddingByte {
- [[__no_unique_address__]] _Tp __v_;
- char __first_padding_byte_;
- };
+template <class _Tp>
+struct _FirstPaddingByte {
+ [[__no_unique_address__]] _Tp __v_;
+ char __first_padding_byte_;
+};
# else
- template <bool = __libcpp_is_final<_Tp>::value || !is_class<_Tp>::value>
- struct _FirstPaddingByte : _Tp {
- char __first_padding_byte_;
- };
+template <class _Tp, bool = __libcpp_is_final<_Tp>::value || !is_class<_Tp>::value>
+struct _FirstPaddingByte : _Tp {
+ char __first_padding_byte_;
+};
- template <>
- struct _FirstPaddingByte<true> {
- _Tp __v_;
- char __first_padding_byte_;
- };
+template <class _Tp>
+struct _FirstPaddingByte<_Tp, true> {
+ _Tp __v_;
+ char __first_padding_byte_;
+};
# endif // __has_cpp_attribute(__no_unique_address__)
- // _FirstPaddingByte<> is sometimes non-standard layout. Using `offsetof` is UB in that case, but GCC and Clang allow
- // the use as an extension.
- _LIBCPP_DIAGNOSTIC_PUSH
- _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-offsetof")
- static const size_t value = offsetof(_FirstPaddingByte<>, __first_padding_byte_);
- _LIBCPP_DIAGNOSTIC_POP
+// _FirstPaddingByte<> is sometimes non-standard layout. Using `offsetof` is UB in that case, but GCC and Clang allow
+// the use as an extension.
+_LIBCPP_DIAGNOSTIC_PUSH
+_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-offsetof")
+template <class _Tp>
+inline const size_t __datasizeof_v = offsetof(_FirstPaddingByte<_Tp>, __first_padding_byte_);
+_LIBCPP_DIAGNOSTIC_POP
#endif // __has_extension(datasizeof)
-};
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/mutex b/libcxx/include/mutex
index 12fae9a88b9d..0d2b5914bc4f 100644
--- a/libcxx/include/mutex
+++ b/libcxx/include/mutex
@@ -427,10 +427,10 @@ class _LIBCPP_TEMPLATE_VIS scoped_lock;
template <>
class _LIBCPP_TEMPLATE_VIS scoped_lock<> {
public:
- explicit scoped_lock() {}
+ [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit scoped_lock() {}
~scoped_lock() = default;
- _LIBCPP_HIDE_FROM_ABI explicit scoped_lock(adopt_lock_t) {}
+ [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit scoped_lock(adopt_lock_t) {}
scoped_lock(scoped_lock const&) = delete;
scoped_lock& operator=(scoped_lock const&) = delete;
@@ -445,13 +445,15 @@ private:
mutex_type& __m_;
public:
- explicit scoped_lock(mutex_type& __m) _LIBCPP_THREAD_SAFETY_ANNOTATION(acquire_capability(__m)) : __m_(__m) {
+ [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit scoped_lock(mutex_type& __m)
+ _LIBCPP_THREAD_SAFETY_ANNOTATION(acquire_capability(__m))
+ : __m_(__m) {
__m_.lock();
}
~scoped_lock() _LIBCPP_THREAD_SAFETY_ANNOTATION(release_capability()) { __m_.unlock(); }
- _LIBCPP_HIDE_FROM_ABI explicit scoped_lock(adopt_lock_t, mutex_type& __m)
+ [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit scoped_lock(adopt_lock_t, mutex_type& __m)
_LIBCPP_THREAD_SAFETY_ANNOTATION(requires_capability(__m))
: __m_(__m) {}
@@ -465,9 +467,11 @@ class _LIBCPP_TEMPLATE_VIS scoped_lock {
typedef tuple<_MArgs&...> _MutexTuple;
public:
- _LIBCPP_HIDE_FROM_ABI explicit scoped_lock(_MArgs&... __margs) : __t_(__margs...) { std::lock(__margs...); }
+ [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit scoped_lock(_MArgs&... __margs) : __t_(__margs...) {
+ std::lock(__margs...);
+ }
- _LIBCPP_HIDE_FROM_ABI scoped_lock(adopt_lock_t, _MArgs&... __margs) : __t_(__margs...) {}
+ [[nodiscard]] _LIBCPP_HIDE_FROM_ABI scoped_lock(adopt_lock_t, _MArgs&... __margs) : __t_(__margs...) {}
_LIBCPP_HIDE_FROM_ABI ~scoped_lock() {
typedef typename __make_tuple_indices<sizeof...(_MArgs)>::type _Indices;
diff --git a/libcxx/test/libcxx/diagnostics/mutex.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/mutex.nodiscard.verify.cpp
index a98eb5f14211..b9890ced55bb 100644
--- a/libcxx/test/libcxx/diagnostics/mutex.nodiscard.verify.cpp
+++ b/libcxx/test/libcxx/diagnostics/mutex.nodiscard.verify.cpp
@@ -12,14 +12,58 @@
// check that <mutex> functions are marked [[nodiscard]]
-// clang-format off
-
#include <mutex>
+#include <chrono>
+#include <utility>
#include "test_macros.h"
void test() {
- std::mutex mutex;
- std::lock_guard<std::mutex>{mutex}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
- std::lock_guard<std::mutex>{mutex, std::adopt_lock}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+ // std::scoped_lock
+ {
+#if TEST_STD_VER >= 17
+ using M = std::mutex;
+ M m0, m1, m2;
+ // clang-format off
+ std::scoped_lock<>{}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+ std::scoped_lock<M>{m0}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+ std::scoped_lock<M, M>{m0, m1}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+ std::scoped_lock<M, M, M>{m0, m1, m2}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+
+ std::scoped_lock<>{std::adopt_lock}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+ std::scoped_lock<M>{std::adopt_lock, m0}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+ std::scoped_lock<M, M>{std::adopt_lock, m0, m1}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+ std::scoped_lock<M, M, M>{std::adopt_lock, m0, m1, m2}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+ // clang-format on
+#endif
+ }
+
+ // std::unique_lock
+ {
+ using M = std::timed_mutex; // necessary for the time_point and duration constructors
+ M m;
+ std::chrono::time_point<std::chrono::steady_clock> time_point;
+ std::chrono::milliseconds duration;
+ std::unique_lock<M> other;
+
+ // clang-format off
+ std::unique_lock<M>{}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+ std::unique_lock<M>{m}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+ std::unique_lock<M>{m, std::defer_lock}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+ std::unique_lock<M>{m, std::try_to_lock}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+ std::unique_lock<M>{m, std::adopt_lock}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+ std::unique_lock<M>{m, time_point}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+ std::unique_lock<M>{m, duration}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+ std::unique_lock<M>(std::move(other)); // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+ // clang-format on
+ }
+
+ // std::lock_guard
+ {
+ std::mutex m;
+ // clang-format off
+ std::lock_guard<std::mutex>{m}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+ std::lock_guard<std::mutex>{m, std::adopt_lock}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
+ // clang-format on
+ }
}
diff --git a/libcxx/test/libcxx/thread/thread.lock/thread.lock.guard/nodiscard.verify.cpp b/libcxx/test/libcxx/thread/thread.lock/thread.lock.guard/nodiscard.verify.cpp
deleted file mode 100644
index 5fe68c83b3d9..000000000000
--- a/libcxx/test/libcxx/thread/thread.lock/thread.lock.guard/nodiscard.verify.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: no-threads
-
-// [[nodiscard]] isn't supported in C++03 (not even as an extension)
-// UNSUPPORTED: c++03
-
-// <mutex>
-
-// template <class Mutex> class lock_guard;
-
-// [[nodiscard]] explicit lock_guard(mutex_type& m);
-// [[nodiscard]] lock_guard(mutex_type& m, adopt_lock_t);
-
-// Test that we properly apply [[nodiscard]] to lock_guard's constructors,
-// which is a libc++ extension.
-
-#include <mutex>
-
-void f() {
- std::mutex m;
- std::lock_guard<std::mutex>{m}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
- std::lock_guard<std::mutex>{m, std::adopt_lock}; // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}}
-}
diff --git a/libcxx/test/libcxx/type_traits/datasizeof.compile.pass.cpp b/libcxx/test/libcxx/type_traits/datasizeof.compile.pass.cpp
index 881b0bd85190..03dd0f6eac53 100644
--- a/libcxx/test/libcxx/type_traits/datasizeof.compile.pass.cpp
+++ b/libcxx/test/libcxx/type_traits/datasizeof.compile.pass.cpp
@@ -9,10 +9,10 @@
#include <__type_traits/datasizeof.h>
#include <cstdint>
-static_assert(std::__libcpp_datasizeof<std::int8_t>::value == 1, "");
-static_assert(std::__libcpp_datasizeof<std::int16_t>::value == 2, "");
-static_assert(std::__libcpp_datasizeof<std::int32_t>::value == 4, "");
-static_assert(std::__libcpp_datasizeof<std::int64_t>::value == 8, "");
+static_assert(std::__datasizeof_v<std::int8_t> == 1, "");
+static_assert(std::__datasizeof_v<std::int16_t> == 2, "");
+static_assert(std::__datasizeof_v<std::int32_t> == 4, "");
+static_assert(std::__datasizeof_v<std::int64_t> == 8, "");
struct OneBytePadding {
OneBytePadding() {}
@@ -22,9 +22,9 @@ struct OneBytePadding {
};
#if defined(_WIN32) && !defined(__MINGW32__)
-static_assert(std::__libcpp_datasizeof<OneBytePadding>::value == 4, "");
+static_assert(std::__datasizeof_v<OneBytePadding> == 4, "");
#else
-static_assert(std::__libcpp_datasizeof<OneBytePadding>::value == 3, "");
+static_assert(std::__datasizeof_v<OneBytePadding> == 3, "");
#endif
struct InBetweenPadding {
@@ -35,4 +35,4 @@ struct InBetweenPadding {
std::int16_t c;
};
-static_assert(std::__libcpp_datasizeof<InBetweenPadding>::value == 8, "");
+static_assert(std::__datasizeof_v<InBetweenPadding> == 8, "");
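
The new spelling is a variable template instead of a class template with a `::value` member. As a rough sketch of what such a trait measures, assuming hypothetical names and an Itanium-ABI compiler (the real implementation prefers a `__datasizeof` builtin when available, since `offsetof` on a non-standard-layout type is only conditionally supported):

```cpp
#include <cstddef>
#include <cstdint>

// Place a char directly after T's data: its offset is T's "datasize",
// i.e. sizeof(T) minus any tail padding that a [[no_unique_address]]
// member or derived class could reuse.
template <class T>
struct _FirstPaddingByte {
  [[no_unique_address]] T _value;
  char _first_padding_byte;
};

template <class T>
inline constexpr std::size_t datasizeof_v =
    offsetof(_FirstPaddingByte<T>, _first_padding_byte);

struct Padded {
  Padded() {}        // non-trivial, so the tail padding is reusable
  std::int32_t i;
  char c;
};

static_assert(datasizeof_v<std::int64_t> == 8, "");
static_assert(sizeof(Padded) == 8 && datasizeof_v<Padded> == 5, "");
```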
diff --git a/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp
index cf1909b92873..580c0f4ae10c 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp
@@ -47,28 +47,28 @@ static_assert(sizeof(std::expected<B, B>) == sizeof(B));
// Check that `expected`'s datasize is large enough for the parameter type(s).
static_assert(sizeof(std::expected<BoolWithPadding, Empty>) ==
- std::__libcpp_datasizeof<std::expected<BoolWithPadding, Empty>>::value);
+ std::__datasizeof_v<std::expected<BoolWithPadding, Empty>>);
static_assert(sizeof(std::expected<Empty, BoolWithPadding>) ==
- std::__libcpp_datasizeof<std::expected<Empty, BoolWithPadding>>::value);
+ std::__datasizeof_v<std::expected<Empty, BoolWithPadding>>);
// In this case, there should be tail padding in the `expected` because `A`
// itself does _not_ have tail padding.
-static_assert(sizeof(std::expected<A, A>) > std::__libcpp_datasizeof<std::expected<A, A>>::value);
+static_assert(sizeof(std::expected<A, A>) > std::__datasizeof_v<std::expected<A, A>>);
// Test with some real types.
static_assert(sizeof(std::expected<std::optional<int>, int>) == 8);
-static_assert(std::__libcpp_datasizeof<std::expected<std::optional<int>, int>>::value == 8);
+static_assert(std::__datasizeof_v<std::expected<std::optional<int>, int>> == 8);
static_assert(sizeof(std::expected<int, std::optional<int>>) == 8);
-static_assert(std::__libcpp_datasizeof<std::expected<int, std::optional<int>>>::value == 8);
+static_assert(std::__datasizeof_v<std::expected<int, std::optional<int>>> == 8);
static_assert(sizeof(std::expected<int, int>) == 8);
-static_assert(std::__libcpp_datasizeof<std::expected<int, int>>::value == 5);
+static_assert(std::__datasizeof_v<std::expected<int, int>> == 5);
// clang-format off
-static_assert(std::__libcpp_datasizeof<int>::value == 4);
-static_assert(std::__libcpp_datasizeof<std::expected<int, int>>::value == 5);
-static_assert(std::__libcpp_datasizeof<std::expected<std::expected<int, int>, int>>::value == 8);
-static_assert(std::__libcpp_datasizeof<std::expected<std::expected<std::expected<int, int>, int>, int>>::value == 9);
-static_assert(std::__libcpp_datasizeof<std::expected<std::expected<std::expected<std::expected<int, int>, int>, int>, int>>::value == 12);
+static_assert(std::__datasizeof_v<int> == 4);
+static_assert(std::__datasizeof_v<std::expected<int, int>> == 5);
+static_assert(std::__datasizeof_v<std::expected<std::expected<int, int>, int>> == 8);
+static_assert(std::__datasizeof_v<std::expected<std::expected<std::expected<int, int>, int>, int>> == 9);
+static_assert(std::__datasizeof_v<std::expected<std::expected<std::expected<std::expected<int, int>, int>, int>, int>> == 12);
// clang-format on
diff --git a/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp
index fdee8b71e5d9..27da03c54ac4 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp
@@ -45,23 +45,23 @@ static_assert(sizeof(std::expected<void, B>) == sizeof(B));
// Check that `expected`'s datasize is large enough for the parameter type(s).
static_assert(sizeof(std::expected<void, BoolWithPadding>) ==
- std::__libcpp_datasizeof<std::expected<void, BoolWithPadding>>::value);
+ std::__datasizeof_v<std::expected<void, BoolWithPadding>>);
// In this case, there should be tail padding in the `expected` because `A`
// itself does _not_ have tail padding.
-static_assert(sizeof(std::expected<void, A>) > std::__libcpp_datasizeof<std::expected<void, A>>::value);
+static_assert(sizeof(std::expected<void, A>) > std::__datasizeof_v<std::expected<void, A>>);
// Test with some real types.
static_assert(sizeof(std::expected<void, std::optional<int>>) == 8);
-static_assert(std::__libcpp_datasizeof<std::expected<void, std::optional<int>>>::value == 8);
+static_assert(std::__datasizeof_v<std::expected<void, std::optional<int>>> == 8);
static_assert(sizeof(std::expected<void, int>) == 8);
-static_assert(std::__libcpp_datasizeof<std::expected<void, int>>::value == 5);
+static_assert(std::__datasizeof_v<std::expected<void, int>> == 5);
// clang-format off
-static_assert(std::__libcpp_datasizeof<int>::value == 4);
-static_assert(std::__libcpp_datasizeof<std::expected<void, int>>::value == 5);
-static_assert(std::__libcpp_datasizeof<std::expected<void, std::expected<void, int>>>::value == 8);
-static_assert(std::__libcpp_datasizeof<std::expected<void, std::expected<void, std::expected<void, int>>>>::value == 9);
-static_assert(std::__libcpp_datasizeof<std::expected<void, std::expected<void, std::expected<void, std::expected<void, int>>>>>::value == 12);
+static_assert(std::__datasizeof_v<int> == 4);
+static_assert(std::__datasizeof_v<std::expected<void, int>> == 5);
+static_assert(std::__datasizeof_v<std::expected<void, std::expected<void, int>>> == 8);
+static_assert(std::__datasizeof_v<std::expected<void, std::expected<void, std::expected<void, int>>>> == 9);
+static_assert(std::__datasizeof_v<std::expected<void, std::expected<void, std::expected<void, std::expected<void, int>>>>> == 12);
// clang-format on
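
Both `expected` test files above lean on the same layout rule, sketched here under the assumption of an Itanium-ABI compiler (MSVC does not reuse this padding, which is why datasize tests carry `_WIN32` special cases): a `[[no_unique_address]]` member whose type is not POD for layout purposes exposes its tail padding, so a following member, such as `expected`'s has-value flag, can be placed inside it without growing the object.

```cpp
#include <cstdint>

struct Inner {
  Inner() {}          // non-trivial, so Inner is not POD-for-layout and
                      // its three bytes of tail padding may be reused
  std::int32_t value;
  bool flag;
};

struct Outer {
  [[no_unique_address]] Inner inner;
  bool flag;          // may land at offset 5, inside inner's tail padding
};

static_assert(sizeof(Inner) == 8, "");
static_assert(sizeof(Outer) == 8, "adding a flag need not grow the object");
```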
diff --git a/libcxx/test/std/containers/sequences/array/size_and_alignment.compile.pass.cpp b/libcxx/test/std/containers/sequences/array/size_and_alignment.compile.pass.cpp
index 209e24964807..7ba56577d1bb 100644
--- a/libcxx/test/std/containers/sequences/array/size_and_alignment.compile.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/size_and_alignment.compile.pass.cpp
@@ -46,7 +46,7 @@ void test_type() {
static_assert(!std::is_empty<Array>::value, "");
// Make sure empty arrays don't have padding bytes
- LIBCPP_STATIC_ASSERT(std::__libcpp_datasizeof<Array>::value == sizeof(Array), "");
+ LIBCPP_STATIC_ASSERT(std::__datasizeof_v<Array> == sizeof(Array), "");
}
{
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.indirect.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.indirect.pass.cpp
index 0b540e09bab3..3d9856e0b1ba 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.indirect.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.indirect.pass.cpp
@@ -51,11 +51,12 @@ int main(int, char**) {
// Test with an overaligned type
{
new_called = delete_called = 0;
- OverAligned* x = DoNotOptimize(new OverAligned[3]);
+ OverAligned* dummy_data_block = new OverAligned[3];
+ OverAligned* x = DoNotOptimize(dummy_data_block);
ASSERT_WITH_OPERATOR_NEW_FALLBACKS(static_cast<void*>(x) == DummyData);
ASSERT_WITH_OPERATOR_NEW_FALLBACKS(new_called == 1);
- delete[] x;
+ delete[] dummy_data_block;
ASSERT_WITH_OPERATOR_NEW_FALLBACKS(delete_called == 1);
}
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.pass.cpp
index 2d021ecb30e7..73d1054df188 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.pass.cpp
@@ -48,11 +48,12 @@ int main(int, char**) {
// Test with an overaligned type
{
new_called = delete_called = 0;
- OverAligned* x = new OverAligned[3];
+ OverAligned* dummy_data_block = new OverAligned[3];
+ OverAligned* x = DoNotOptimize(dummy_data_block);
assert(static_cast<void*>(x) == DummyData);
assert(new_called == 1);
- delete[] x;
+ delete[] dummy_data_block;
assert(delete_called == 1);
}
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.replace.indirect.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.replace.indirect.pass.cpp
index 227b20f0b1e1..c9dc20a34b13 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.replace.indirect.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.replace.indirect.pass.cpp
@@ -55,11 +55,12 @@ int main(int, char**) {
// Test with an overaligned type
{
new_called = delete_called = 0;
- OverAligned* x = DoNotOptimize(new (std::nothrow) OverAligned[3]);
+ OverAligned* dummy_data_block = new (std::nothrow) OverAligned[3];
+ OverAligned* x = DoNotOptimize(dummy_data_block);
ASSERT_WITH_OPERATOR_NEW_FALLBACKS(static_cast<void*>(x) == DummyData);
ASSERT_WITH_OPERATOR_NEW_FALLBACKS(new_called == 1);
- delete[] x;
+ delete[] dummy_data_block;
ASSERT_WITH_OPERATOR_NEW_FALLBACKS(delete_called == 1);
}
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.replace.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.replace.pass.cpp
index e5ef5f166975..f9e339b22190 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.replace.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.replace.pass.cpp
@@ -48,11 +48,12 @@ int main(int, char**) {
// Test with an overaligned type
{
new_called = delete_called = 0;
- OverAligned* x = new OverAligned;
+ OverAligned* dummy_data_block = new OverAligned;
+ OverAligned* x = DoNotOptimize(dummy_data_block);
assert(static_cast<void*>(x) == DummyData);
assert(new_called == 1);
- delete x;
+ delete dummy_data_block;
assert(delete_called == 1);
}
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.indirect.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.indirect.pass.cpp
index 7eab0729f9ef..dedd3089f5ab 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.indirect.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.indirect.pass.cpp
@@ -54,11 +54,12 @@ int main(int, char**) {
// Test with an overaligned type
{
new_called = delete_called = 0;
- OverAligned* x = DoNotOptimize(new (std::nothrow) OverAligned);
+ OverAligned* dummy_data_block = new (std::nothrow) OverAligned;
+ OverAligned* x = DoNotOptimize(dummy_data_block);
ASSERT_WITH_OPERATOR_NEW_FALLBACKS(static_cast<void*>(x) == DummyData);
ASSERT_WITH_OPERATOR_NEW_FALLBACKS(new_called == 1);
- delete x;
+ delete dummy_data_block;
ASSERT_WITH_OPERATOR_NEW_FALLBACKS(delete_called == 1);
}
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.pass.cpp
index 9a5b53e03902..a25b67ea554b 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.pass.cpp
@@ -47,11 +47,12 @@ int main(int, char**) {
// Test with an overaligned type
{
new_nothrow_called = delete_called = 0;
- OverAligned* x = new (std::nothrow) OverAligned;
+ OverAligned* dummy_data_block = new (std::nothrow) OverAligned;
+ OverAligned* x = DoNotOptimize(dummy_data_block);
assert(static_cast<void*>(x) == DummyData);
ASSERT_WITH_OPERATOR_NEW_FALLBACKS(new_nothrow_called == 1);
- delete x;
+ delete dummy_data_block;
ASSERT_WITH_OPERATOR_NEW_FALLBACKS(delete_called == 1);
}
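
The six hunks above share one pattern: keep the pointer returned by `new` in its own variable, route a copy through the optimization barrier for the assertions, and pass the original to `delete`. The barrier is plausibly shaped like the following sketch (an assumption; the real `DoNotOptimize` lives in the test support headers):

```cpp
// Claim to read and rewrite `value` and to touch memory, so the compiler
// can neither elide the allocation nor reason about the returned pointer.
template <class T>
T DoNotOptimize(T value) {
  asm volatile("" : "+r"(value) : : "memory");
  return value;
}
```

Deleting `dummy_data_block` rather than `x` hands `delete[]`/`delete` the exact value `new` returned, while `x` still passes through the barrier so the `new_called` and `delete_called` counters observe real calls.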
diff --git a/lld/COFF/MinGW.cpp b/lld/COFF/MinGW.cpp
index e46f5277a8c3..29c01da9e28f 100644
--- a/lld/COFF/MinGW.cpp
+++ b/lld/COFF/MinGW.cpp
@@ -50,7 +50,6 @@ AutoExporter::AutoExporter(
"libclang_rt.profile-x86_64",
"libc++",
"libc++abi",
- "libFortran_main",
"libFortranRuntime",
"libFortranDecimal",
"libunwind",
diff --git a/lld/tools/lld/CMakeLists.txt b/lld/tools/lld/CMakeLists.txt
index 0f5daa78846b..8498a91597a9 100644
--- a/lld/tools/lld/CMakeLists.txt
+++ b/lld/tools/lld/CMakeLists.txt
@@ -42,3 +42,7 @@ endif()
foreach(link ${LLD_SYMLINKS_TO_CREATE})
add_lld_symlink(${link} lld)
endforeach()
+
+if(LLVM_TOOL_LLVM_DRIVER_BUILD)
+ set_property(GLOBAL APPEND PROPERTY LLVM_DRIVER_HIDDEN_TOOL_ALIASES_lld ld)
+endif()
diff --git a/lldb/docs/resources/lldbgdbremote.md b/lldb/docs/resources/lldbgdbremote.md
index 05e4f714f7f7..a9fa2a432b70 100644
--- a/lldb/docs/resources/lldbgdbremote.md
+++ b/lldb/docs/resources/lldbgdbremote.md
@@ -27,18 +27,8 @@ standard GDB remote protocol packets.
## QStartNoAckMode
-### Brief
-
Try to enable no ACK mode to skip sending ACKs and NACKs.
-### Priority To Implement
-
-High. Any GDB remote server that can implement this should if the
-connection is reliable. This improves packet throughput and increases
-the performance of the connection.
-
-### Description
-
Having to send an ACK/NACK after every packet slows things down a bit, so we
have a way to disable ACK packets to minimize the traffic for reliable
communication interfaces (like sockets). Below, GDB or LLDB will send this
@@ -52,17 +42,15 @@ read packet: $OK#9a
send packet: +
```
-## QSupported
-
-### Brief
-
-Query the GDB remote server for features it supports
-
### Priority To Implement
-Optional.
+High. Any GDB remote server that can implement this should do so if the
+connection is reliable. This improves packet throughput and increases
+the performance of the connection.
-### Description
+## QSupported
+
+Query the GDB remote server for features it supports
QSupported is a standard GDB Remote Serial Protocol packet, but
there are several additions to the response that lldb can parse.
@@ -96,21 +84,14 @@ In the example above, three lldb extensions are shown:
watchpoints, up to a pointer size, `sizeof(void*)`, a reasonable
baseline assumption.
+### Priority To Implement
-## "A" - launch args packet
+Optional.
-### Brief
+## "A" - launch args packet
Launch a program using the supplied arguments
-### Priority To Implement
-
-Low. Only needed if the remote target wants to launch a target after
-making a connection to a GDB server that isn't already connected to
-an inferior process.
-
-### Description
-
We have added support for the "set program arguments" packet where we can
start a connection to a remote server and then later supply the path to the
executable and the arguments to use when executing:
@@ -130,14 +111,16 @@ The above packet helps when you have remote debugging abilities where you
could launch a process on a remote host; this isn't needed for bare-board
debugging.
-## qLaunchSuccess
+### Priority To Implement
+
+Low. Only needed if the remote target wants to launch a target after
+making a connection to a GDB server that isn't already connected to
+an inferior process.
-### Brief
+## qLaunchSuccess
Check whether launching a process with the `A` packet succeeded.
-### Description
-
Returns the status of the last attempt to launch a process.
Either `OK` if no error occurred, or `E` followed by a string
describing the error.
@@ -148,8 +131,6 @@ High, launching processes is a key part of LLDB's platform mode.
## QEnvironment:NAME=VALUE
-### Brief
-
Set up the environment for a new child process that will soon be
launched using the "A" packet.
@@ -161,14 +142,6 @@ scan the environment strings before sending, prefer
the `QEnvironmentHexEncoded` packet over `QEnvironment`, if it is
available.
-### Priority To Implement
-
-Low. Only needed if the remote target wants to launch a target after
-making a connection to a GDB server that isn't already connected to
-an inferior process.
-
-### Description
-
Both GDB and LLDB support passing down environment variables. It is ok to
respond with a `$#00` (unimplemented):
```
@@ -177,9 +150,13 @@ read packet: $OK#00
```
This packet can be sent one or more times _prior_ to sending a "A" packet.
-## QEnvironmentHexEncoded:HEX-ENCODING(NAME=VALUE)
+### Priority To Implement
+
+Low. Only needed if the remote target wants to launch a target after
+making a connection to a GDB server that isn't already connected to
+an inferior process.
-### Brief
+## QEnvironmentHexEncoded:HEX-ENCODING(NAME=VALUE)
Set up the environment for a new child process that will soon be
launched using the "A" packet.
@@ -188,14 +165,6 @@ The only difference between this packet and `QEnvironment` is that the
environment key-value pair is ascii hex encoded for transmission.
This allows values with gdb-remote metacharacters like `#` to be sent.
-### Priority To Implement
-
-Low. Only needed if the remote target wants to launch a target after
-making a connection to a GDB server that isn't already connected to
-an inferior process.
-
-### Description
-
Both GDB and LLDB support passing down environment variables. It is ok to
respond with a `$#00` (unimplemented):
```
@@ -204,9 +173,13 @@ read packet: $OK#00
```
This packet can be sent one or more times _prior_ to sending a "A" packet.
-## QEnableErrorStrings
+### Priority To Implement
-### Brief
+Low. Only needed if the remote target wants to launch a target after
+making a connection to a GDB server that isn't already connected to
+an inferior process.
+
+## QEnableErrorStrings
This packet enables reporting of Error strings in remote packet
replies from the server to client. If the server supports this
@@ -236,19 +209,9 @@ read packet: $OK#00
## QSetSTDIN:\<ascii-hex-path\> / QSetSTDOUT:\<ascii-hex-path\> / QSetSTDERR:\<ascii-hex-path\>
-### Brief
-
Set up where STDIN, STDOUT, and STDERR go prior to sending an "A"
packet.
-### Priority To Implement
-
-Low. Only needed if the remote target wants to launch a target after
-making a connection to a GDB server that isn't already connected to
-an inferior process.
-
-### Description
-
When launching a program through the GDB remote protocol with the "A" packet,
you might also want to specify where stdin/out/err go:
```
@@ -258,19 +221,16 @@ QSetSTDERR:<ascii-hex-path>
```
These packets must be sent _prior_ to sending a "A" packet.
-## QSetWorkingDir:\<ascii-hex-path\>
-
-### Brief
-
-Set the working directory prior to sending an "A" packet.
-
### Priority To Implement
Low. Only needed if the remote target wants to launch a target after
making a connection to a GDB server that isn't already connected to
an inferior process.
-### Description
+
+## QSetWorkingDir:\<ascii-hex-path\>
+
+Set the working directory prior to sending an "A" packet.
Or specify the working directory:
```
@@ -278,9 +238,13 @@ QSetWorkingDir:<ascii-hex-path>
```
This packet must be sent _prior_ to sending a "A" packet.
-## qGetWorkingDir
+### Priority To Implement
-### Brief
+Low. Only needed if the remote target wants to launch a target after
+making a connection to a GDB server that isn't already connected to
+an inferior process.
+
+## qGetWorkingDir
Get the current working directory of the platform stub in
ASCII hex encoding.
@@ -294,19 +258,8 @@ send: 2f4170706c65496e7465726e616c2f6c6c64622f73657474696e67732f342f546573745
## QSetDisableASLR:\<bool\>
-### Brief
-
Enable or disable ASLR on the next "A" packet.
-### Priority To Implement
-
-Low. Only needed if the remote target wants to launch a target after
-making a connection to a GDB server that isn't already connected to
-an inferior process and if the target supports disabling ASLR
-(Address space layout randomization).
-
-### Description
-
Or control if ASLR is enabled/disabled:
```
send packet: QSetDisableASLR:1
@@ -317,9 +270,14 @@ read packet: OK
```
This packet must be sent _prior_ to sending a "A" packet.
-## QListThreadsInStopReply
+### Priority To Implement
-### Brief
+Low. Only needed if the remote target wants to launch a target after
+making a connection to a GDB server that isn't already connected to
+an inferior process and if the target supports disabling ASLR
+(Address space layout randomization).
+
+## QListThreadsInStopReply
Enable the `threads:` and `thread-pcs:` data in the question-mark packet
("T packet") responses when the stub reports that a program has
@@ -341,8 +299,6 @@ read packet: OK
## jLLDBTraceSupported
-### Brief
-
Get the processor tracing type supported by the gdb-server for the current
inferior. Responses might be different depending on the architecture and
capabilities of the underlying OS.
@@ -376,8 +332,6 @@ read packet: {"name":<name>, "description":<description>}/E<error code>;AAAAAAAA
## jLLDBTraceStart
-### Brief
-
Start tracing a process or its threads using a provided tracing technology.
The input and output are specified as JSON objects. In case of success, an OK
response is returned, or an error otherwise.
@@ -530,8 +484,6 @@ read packet: OK/E<error code>;AAAAAAAAA
## jLLDBTraceStop
-### Brief
-
Stop tracing a process or its threads using a provided tracing technology.
The input and output are specified as JSON objects. In case of success, an OK
response is returned, or an error otherwise.
@@ -583,8 +535,6 @@ read packet: OK/E<error code>;AAAAAAAAA
## jLLDBTraceGetState
-### Brief
-
Get the current state of the process and its threads being traced by
a given trace technology. The response is a JSON object with custom
information depending on the trace technology. In case of errors, an
@@ -690,8 +640,6 @@ read packet: {...object}/E<error code>;AAAAAAAAA
## jLLDBTraceGetBinaryData
-### Brief
-
Get binary data given a trace technology and a data identifier.
The input is specified as a JSON object and the response has the same format
as the "binary memory read" (aka "x") packet. In case of failures, an error
@@ -722,29 +670,8 @@ read packet: <binary data>/E<error code>;AAAAAAAAA
## qRegisterInfo\<hex-reg-id\>
-### Brief
-
Discover register information from the remote GDB server.
-### Priority To Implement
-
-High. Any target that can self describe its registers, should do so.
-This means if new registers are ever added to a remote target, they
-will get picked up automatically, and allows registers to change
-depending on the actual CPU type that is used.
-
-NB: `qRegisterInfo` is deprecated in favor of the standard gdb remote
-serial protocol register description method,
-`qXfer:features:read:target.xml`.
-If `qXfer:features:read:target.xml` is supported, `qRegisterInfo` does
-not need to be implemented. The target.xml format is used by most
-gdb RSP stubs whereas `qRegisterInfo` was an lldb-only design.
-`qRegisterInfo` requires one packet per register and can have undesirable
-performance costs at the start of a debug session, whereas target.xml
-may be able to describe all registers in a single packet.
-
-### Description
-
With LLDB, for register information, remote GDB servers can add
support for the "qRegisterInfoN" packet where "N" is a zero based
base 16 register number that must start at zero and increase by one
@@ -1010,19 +937,26 @@ The keys and values are detailed below:
modifying the CPSR register can cause the r8 - r14 and cpsr value to
change depending on if the mode has changed.
+### Priority To Implement
-## qPlatform_shell
-
-### Brief
-
-Run a command in a shell on the connected remote machine.
+High. Any target that can self-describe its registers should do so.
+This means if new registers are ever added to a remote target, they
+will get picked up automatically, and allows registers to change
+depending on the actual CPU type that is used.
-### Priority To Implement
+NB: `qRegisterInfo` is deprecated in favor of the standard gdb remote
+serial protocol register description method,
+`qXfer:features:read:target.xml`.
+If `qXfer:features:read:target.xml` is supported, `qRegisterInfo` does
+not need to be implemented. The target.xml format is used by most
+gdb RSP stubs whereas `qRegisterInfo` was an lldb-only design.
+`qRegisterInfo` requires one packet per register and can have undesirable
+performance costs at the start of a debug session, whereas target.xml
+may be able to describe all registers in a single packet.
-High. This command allows LLDB clients to run arbitrary shell
-commands on a remote host.
+## qPlatform_shell
-### Description
+Run a command in a shell on the connected remote machine.
The request consists of the command to be executed encoded in ASCII characters
converted into hex bytes.
@@ -1043,18 +977,14 @@ drwxrwxr-x 5 username groupname 4096 Aug 15 21:36 source.cpp
-rw-r--r-- 1 username groupname 3190 Aug 12 16:46 Makefile
```
-## qPlatform_mkdir
-
-### Brief
-
-Creates a new directory on the connected remote machine.
-
### Priority To Implement
-Low. This command allows LLDB clients to create new directories on
-a remote host.
+High. This command allows LLDB clients to run arbitrary shell
+commands on a remote host.
+
+## qPlatform_mkdir
-### Description
+Creates a new directory on the connected remote machine.
Request: `qPlatform_mkdir:<hex-file-mode>,<ascii-hex-path>`
@@ -1067,20 +997,15 @@ Reply:
(mkdir called successfully and returned with the given return code)
* `Exx` (An error occurred)
+### Priority To Implement
-## vFile:chmod / qPlatform_chmod
+Low. This command allows LLDB clients to create new directories on
+a remote host.
-### Brief
+## vFile:chmod / qPlatform_chmod
Change the permissions of a file on the connected remote machine.
-### Priority To Implement
-
-Low. This command allows LLDB clients to change the permissions of
-a file on the remote host.
-
-### Description
-
Request: `qPlatform_chmod:<hex-file-mode>,<ascii-hex-path>`
Reply:
@@ -1088,19 +1013,13 @@ Reply:
(chmod called successfully and returned with the given return code)
* `Exx` (An error occurred)
-## qHostInfo
-
-### Brief
-
-Get information about the host we are remotely connected to.
-
### Priority To Implement
-High. This packet is usually very easy to implement and can help
-LLDB select the correct plug-ins for the job based on the target
-triple information that is supplied.
+Low.
-### Description
+## qHostInfo
+
+Get information about the host we are remotely connected to.
LLDB supports a host info call that gets all sorts of details of the system
that is being debugged:
@@ -1147,20 +1066,16 @@ Key value pairs are one of:
AArch64 can have different page table setups for low and high
memory, and therefore a different number of bits used for addressing.
-## qGDBServerVersion
-
-### Brief
-
-Get version information about this implementation of the gdb-remote
-protocol.
-
### Priority To Implement
High. This packet is usually very easy to implement and can help
-LLDB to work around bugs in a server's implementation when they
-are found.
+LLDB select the correct plug-ins for the job based on the target
+triple information that is supplied.
+
+## qGDBServerVersion
-### Description
+Get version information about this implementation of the gdb-remote
+protocol.
The goal of this packet is to provide enough information about an
implementation of the gdb-remote-protocol server that lldb can
@@ -1192,9 +1107,13 @@ Suggested key names:
* `major_version`: major version number
* `minor_version`: minor version number
-## qProcessInfo
+### Priority To Implement
-### Brief
+High. This packet is usually very easy to implement and can help
+LLDB to work around bugs in a server's implementation when they
+are found.
+
+## qProcessInfo
Get information about the process we are currently debugging.
@@ -1211,8 +1130,6 @@ process to know what you're working with.
All numeric fields return base 16 numbers without any "0x" prefix.
-### Description
-
An i386 process:
```
send packet: $qProcessInfo#00
@@ -1249,24 +1166,9 @@ Key value pairs include:
## qShlibInfoAddr
-### Brief
-
Get an address where the dynamic linker stores information about
where shared libraries are loaded.
-### Priority To Implement
-
-High if you have a dynamic loader plug-in in LLDB for your target
-triple (see the "qHostInfo" packet) that can use this information.
-Many times address load randomization can make it hard to detect
-where the dynamic loader binary and data structures are located and
-some platforms know, or can find out where this information is.
-
-Low if you have a debug target where all object and symbol files
-contain static load addresses.
-
-### Description
-
LLDB and GDB both support the `qShlibInfoAddr` packet which is a hint to each
debugger as to where to find the dynamic loader information. For darwin
binaries that run in user land this is the address of the `all_image_infos`
@@ -1278,12 +1180,29 @@ send packet: $qShlibInfoAddr#00
read packet: $7fff5fc40040#00
```
-## qThreadStopInfo\<tid\>
+### Priority To Implement
-### Brief
+High if you have a dynamic loader plug-in in LLDB for your target
+triple (see the "qHostInfo" packet) that can use this information.
+Many times address load randomization can make it hard to detect
+where the dynamic loader binary and data structures are located and
+some platforms know, or can find out where this information is.
+
+Low if you have a debug target where all object and symbol files
+contain static load addresses.
+
+## qThreadStopInfo\<tid\>
Get information about why a thread, whose ID is `<tid>`, is stopped.
+LLDB tries to use the `qThreadStopInfo` packet which is formatted as
+`qThreadStopInfo%x` where `%x` is the hex thread ID. This requests information
+about why a thread is stopped. The response is the same as the stop reply
+packets and tells us what happened to the other threads. The standard GDB
+remote packets love to think that there is only _one_ reason that _one_ thread
+stops at a time. This allows us to see why all threads stopped and allows us
+to implement better multi-threaded debugging support.
+
### Priority To Implement
High if you need to support multi-threaded or multi-core debugging.
@@ -1294,34 +1213,10 @@ threads (live system debug) / cores (JTAG) in your program have
stopped and allows LLDB to display and control your program
correctly.
-### Description
-
-LLDB tries to use the `qThreadStopInfo` packet which is formatted as
-`qThreadStopInfo%x` where `%x` is the hex thread ID. This requests information
-about why a thread is stopped. The response is the same as the stop reply
-packets and tells us what happened to the other threads. The standard GDB
-remote packets love to think that there is only _one_ reason that _one_ thread
-stops at a time. This allows us to see why all threads stopped and allows us
-to implement better multi-threaded debugging support.
-
## QThreadSuffixSupported
-### Brief
-
Try to enable thread suffix support for the `g`, `G`, `p`, and `P` packets.
-### Priority To Implement
-
-High. Adding a thread suffix allows us to read and write registers
-more efficiently and stops us from having to select a thread with
-one packet and then read registers with a second packet. It also
-makes sure that no errors can occur where the debugger thinks it
-already has a thread selected (see the `Hg` packet from the standard
-GDB remote protocol documentation) yet the remote GDB server actually
-has another thread selected.
-
-### Description
-
When reading thread registers, you currently need to set the current
thread, then read the registers. This is kind of cumbersome, so we added the
ability to query if the remote GDB server supports adding a `thread:<tid>;`
@@ -1359,21 +1254,20 @@ read packet: ....
We also added support for allocating and deallocating memory. We use this to
allocate memory so we can run JITed code.
-## _M\<size\>,\<permissions\>
-
-### Brief
-
-Allocate memory on the remote target with the specified size and
-permissions.
-
### Priority To Implement
-High if you want LLDB to be able to JIT code and run that code. JIT
-code also needs data which is also allocated and tracked.
+High. Adding a thread suffix allows us to read and write registers
+more efficiently and stops us from having to select a thread with
+one packet and then read registers with a second packet. It also
+makes sure that no errors can occur where the debugger thinks it
+already has a thread selected (see the `Hg` packet from the standard
+GDB remote protocol documentation) yet the remote GDB server actually
+has another thread selected.
-Low if you don't support running JIT'ed code.
+## _M\<size\>,\<permissions\>
-### Description
+Allocate memory on the remote target with the specified size and
+permissions.
The allocate memory packet starts with `_M<size>,<permissions>`. It returns a
raw big endian address value, or an empty response for unimplemented, or `EXX` for an error
@@ -1395,13 +1289,6 @@ You request a size and give the permissions. This packet does NOT need to be
implemented if you don't want to support running JITed code. The return value
is just the address of the newly allocated memory as raw big endian hex bytes.
-## _m\<addr\>
-
-### Brief
-
-Deallocate memory that was previously allocated using an allocate
-memory pack.
-
### Priority To Implement
High if you want LLDB to be able to JIT code and run that code. JIT
@@ -1409,30 +1296,26 @@ code also needs data which is also allocated and tracked.
Low if you don't support running JIT'ed code.
-### Description
+## _m\<addr\>
+
+Deallocate memory that was previously allocated using an allocate
+memory packet.
The deallocate memory packet is `_m<addr>` where you pass in the address you
got back from a previous call to the allocate memory packet. It returns `OK`
if the memory was successfully deallocated, or `EXX` for an error, or an
empty response if not supported.
-## qMemoryRegionInfo:\<addr\>
-
-### Brief
+### Priority To Implement
-Get information about the address range that contains `<addr>`.
+High if you want LLDB to be able to JIT code and run that code. JIT
+code also needs data which is also allocated and tracked.
-### Priority To Implement
+Low if you don't support running JIT'ed code.
-Medium. This is nice to have, but it isn't necessary. It helps LLDB
-do stack unwinding when we branch into memory that isn't executable.
-If we can detect that the code we are stopped in isn't executable,
-then we can recover registers for stack frames above the current
-frame. Otherwise we must assume we are in some JIT'ed code (not JIT
-code that LLDB has made) and assume that no registers are available
-in higher stack frames.
+## qMemoryRegionInfo:\<addr\>
-### Description
+Get information about the address range that contains `<addr>`.
We added a way to get information for a memory region. The packet is:
```
@@ -1488,9 +1371,17 @@ For instance, with a macOS process which has nothing mapped in the first
The lack of `permissions:` indicates that none of read/write/execute are valid
for this region.
-## "x" - Binary memory read
+### Priority To Implement
+
+Medium. This is nice to have, but it isn't necessary. It helps LLDB
+do stack unwinding when we branch into memory that isn't executable.
+If we can detect that the code we are stopped in isn't executable,
+then we can recover registers for stack frames above the current
+frame. Otherwise we must assume we are in some JIT'ed code (not JIT
+code that LLDB has made) and assume that no registers are available
+in higher stack frames.
-### Brief
+## "x" - Binary memory read
Like the `m` (read) and `M` (write) packets, this is a partner to the
`X` (write binary data) packet, `x`.
@@ -1524,8 +1415,6 @@ transport layer is assumed.
## Detach and stay stopped
-### Description
-
We extended the "D" packet to specify that the monitor should keep the
target suspended on detach. The normal behavior is to resume execution
on detach. We will send:
@@ -1546,8 +1435,6 @@ D
## QSaveRegisterState / QSaveRegisterState;thread:XXXX;
-### Brief
-
The `QSaveRegisterState` packet tells the remote debugserver to save
all registers and return a non-zero unique integer ID that
represents these save registers. If thread suffixes are enabled the
@@ -1576,8 +1463,6 @@ for the `QRestoreRegisterState` is added.
## QRestoreRegisterState:\<save_id\> / QRestoreRegisterState:\<save_id\>;thread:XXXX;
-### Brief
-
The `QRestoreRegisterState` packet tells the remote debugserver to
restore all registers using the `save_id` which is an unsigned
integer that was returned from a previous call to
@@ -1601,8 +1486,6 @@ for the `QSaveRegisterState` is added.
## qFileLoadAddress:\<file_path\>
-### Brief
-
Get the load address of a memory mapped file.
The load address is defined as the address of the first memory
region that contains data mapped from the specified file.
@@ -1620,8 +1503,6 @@ some object file in the rendezvous data structure.
## qModuleInfo:\<module_path\>;\<arch triple\>
-### Brief
-
Get information for a module by given module path and architecture.
### Response
@@ -1636,8 +1517,6 @@ UUID directly from inferior's memory.
## jModulesInfo:[{"file":"...",triple:"..."}, ...]
-### Brief
-
Get information for a list of modules by given module path and
architecture.
@@ -1664,14 +1543,10 @@ the communication link has a non-negligible latency.
## Stop reply packet extensions
-### Brief
-
This section describes some of the additional information you can
specify in stop reply packets that help LLDB to know more detailed
information about your threads.
-### Description
-
Standard GDB remote stop reply packets are reply packets sent in
response to a packet that made the program run. They come in the
following forms:
@@ -1880,19 +1755,15 @@ your debug session more reliable and informative.
## qfProcessInfo / qsProcessInfo (Platform Extension)
-### Brief
-
Get the first process info (`qfProcessInfo`) or subsequent process
info (`qsProcessInfo`) for one or more processes on the remote
platform. The first call gets the first match and subsequent calls
to `qsProcessInfo` gets the subsequent matches. Return an error `EXX`,
where `XX` are two hex digits, when no more matches are available.
-### Priority To Implement
-Required. The `qfProcessInfo` packet can be followed by a `:` and
+The `qfProcessInfo` packet can be followed by a `:` and
some key value pairs. The key value pairs in the command are:
-
* `name` - `ascii-hex` -
An ASCII hex string that contains the name of the process that will be matched.
* `name_match` - `enum` -
@@ -1933,9 +1804,11 @@ send packet: $qsProcessInfo#00
read packet: $E04#00
```
-## qPathComplete (Platform Extension)
+### Priority To Implement
+
+Required.
-### Brief
+## qPathComplete (Platform Extension)
Get a list of matched disk files/directories by passing a boolean flag
and a partial path.
Paths denoting a directory should end with a directory separator (`/` or `\`).
## qKillSpawnedProcess (Platform Extension)
-### Brief
-
Kill a process running on the target system.
### Example
@@ -1972,13 +1843,9 @@ The request packet has the process ID in base 10.
## qLaunchGDBServer (Platform Extension)
-### Brief
-
Have the remote platform launch a GDB server.
-### Priority To Implement
-
-Required. The `qLaunchGDBServer` packet must be followed by a `:` and
+The `qLaunchGDBServer` packet must be followed by a `:` and
some key value pairs. The key value pairs in the command are:
* `port` - `integer` -
A string value containing the decimal port ID or zero if the port should be
@@ -1986,11 +1853,6 @@ some key value pairs. The key value pairs in the command are:
* `host` - `integer` -
The host that connections should be limited to when the GDB server is connected to.
-### Description
-
-The response consists of key/value pairs where the key is separated from the
-values with colons and each pair is terminated with a semi colon.
-
Sample packet/response:
```
send packet: $qLaunchGDBServer:port:0;host:lldb.apple.com;#00
@@ -2004,20 +1866,15 @@ process was separately launched.
The `port` key/value pair in the response lets clients know what port number
to attach to in case zero was specified as the "port" in the sent command.
+### Priority To Implement
-## qProcessInfoPID:PID (Platform Extension)
+Required.
-### Brief
+## qProcessInfoPID:PID (Platform Extension)
Have the remote platform get detailed information on a process by
ID. PID is specified as a decimal integer.
-### Priority To Implement
-
-Optional.
-
-### Description
-
The response consists of key/value pairs where the key is separated from the
values with colons and each pair is terminated with a semicolon.
@@ -2037,9 +1894,11 @@ send packet: $qProcessInfoPID:60050#00
read packet: $pid:60050;ppid:59948;uid:7746;gid:11;euid:7746;egid:11;name:6c6c6462;triple:x86_64-apple-macosx;#00
```
-## vAttachName
+### Priority To Implement
+
+Optional.
-### Brief
+## vAttachName
Same as `vAttach`, except instead of a `pid` you send a process name.
@@ -2051,8 +1910,6 @@ it if attaching to a process by name makes sense for your environment.
## vAttachWait
-### Brief
-
Same as `vAttachName`, except that the stub should wait for the next instance
of a process by that name to be launched and attach to that.
@@ -2063,8 +1920,6 @@ gracefully if the packet is not supported.
## qAttachOrWaitSupported
-### Brief
-
This is a binary "is it supported" query. Return OK if you support
`vAttachOrWait`.
@@ -2076,8 +1931,6 @@ will do the right thing.
## vAttachOrWait
-### Brief
-
Same as `vAttachWait`, except that the stub will attach to a process
by name if it exists, and if it does not, it will wait for a process
of that name to appear and attach to it.
@@ -2094,20 +1947,10 @@ support this packet.
## jThreadExtendedInfo
-### Brief
-
This packet, which takes its arguments as JSON and sends its reply as
JSON, allows the gdb remote stub to provide additional information
about a given thread.
-### Priority To Implement
-
-Low. This packet is only needed if the gdb remote stub wants to
-provide interesting additional information about a thread for the
-user.
-
-### Description
-
This packet takes its arguments in [JSON](http://www.json.org).
At a minimum, a thread must be specified, for example:
```
@@ -2155,9 +1998,13 @@ like:
jThreadExtendedInfo:{"thread":612910}]
```
-## QEnableCompression
+### Priority To Implement
+
+Low. This packet is only needed if the gdb remote stub wants to
+provide interesting additional information about a thread for the
+user.
-### Brief
+## QEnableCompression
This packet enables compression of the packets that the debug stub sends to lldb.
If the debug stub can support compression, it indicates this in the reply of the
@@ -2218,8 +2065,6 @@ Example compression algorithms that may be used include:
## jGetLoadedDynamicLibrariesInfos
-### Brief
-
This packet asks the remote debug stub to send the details about libraries
being added/removed from the process as a performance optimization.
@@ -2293,8 +2138,6 @@ STUB REPLIES: ${"images":
}
```
-### Description
-
This is similar to the `qXfer:libraries:read` packet, and it could
be argued that it should be merged into that packet. A separate
packet was created primarily because lldb needs to specify the
@@ -2313,18 +2156,8 @@ executable loaded.
## jThreadsInfo
-### Brief
-
Ask the server for thread stop information of all threads.
-### Priority To Implement
-
-Low. This is a performance optimization, which speeds up debugging by avoiding
-multiple round-trips for retrieving thread information. The information from this
-packet can be retrieved using a combination of `qThreadStopInfo` and `m` packets.
-
-### Description
-
The data in this packet is very similar to the stop reply packets, but is packaged in
JSON and uses JSON arrays where applicable. The JSON output looks like:
```
@@ -2379,9 +2212,13 @@ On macOS with debugserver, we expedite the frame pointer backchain for a thread
the previous FP and PC), and follow the backchain. Most backtraces on macOS and
iOS now don't require us to read any memory!
-## jGetSharedCacheInfo
+### Priority To Implement
-### Brief
+Low. This is a performance optimization, which speeds up debugging by avoiding
+multiple round-trips for retrieving thread information. The information from this
+packet can be retrieved using a combination of `qThreadStopInfo` and `m` packets.
+
+## jGetSharedCacheInfo
This packet asks the remote debug stub to send the details about the inferior's
shared cache. The shared cache is a collection of common libraries/frameworks that
@@ -2402,17 +2239,8 @@ them from the inferior process.
## qQueryGDBServer
-### Brief
-
Ask the platform for the list of gdbservers we have to connect to.
-### Priority To Implement
-
-Low. The packet is required to support connecting to gdbserver started
-by the platform instance automatically.
-
-### Description
-
If the remote platform automatically started one or more gdbserver instances (without
lldb asking it), then it has to return the list of port numbers or socket names for
each of them, which lldb can use to connect to those instances.
@@ -2430,29 +2258,28 @@ Example packet:
]
```
-## QSetDetachOnError
+### Priority To Implement
-### Brief
+Low. The packet is required to support connecting to gdbserver started
+by the platform instance automatically.
+
+## QSetDetachOnError
Sets what the server should do when the communication channel with LLDB
goes down. Either kill the inferior process (`0`) or remove breakpoints and
detach (`1`).
+The data in this packet is a single character, which should be `0` if the
+inferior process should be killed, or `1` if the server should remove all
+breakpoints and detach from the inferior.
+
### Priority To Implement
Low. Only required if the target wants to keep the inferior process alive
when the communication channel goes down.
-### Description
-
-The data in this packet is a single a character, which should be `0` if the
-inferior process should be killed, or `1` if the server should remove all
-breakpoints and detach from the inferior.
-
## jGetDyldProcessState
-### Brief
-
This packet fetches the process launch state, as reported by libdyld on
Darwin systems, most importantly to indicate when the system libraries
have initialized sufficiently to safely call utility functions.
@@ -2476,8 +2303,6 @@ mismatches or extensions.
### vFile:size
-#### Brief
-
Get the size of a file on the target system, filename in ASCII hex.
#### Example
@@ -2492,8 +2317,6 @@ response is `F` followed by the file size in base 16.
### vFile:mode
-#### Brief
-
Get the mode bits of a file on the target system, filename in ASCII hex.
#### Example
@@ -2509,8 +2332,6 @@ correspond to `0755` in octal.
### vFile:unlink
-#### Brief
-
Remove a file on the target system.
#### Example
@@ -2527,8 +2348,6 @@ value of errno if unlink failed.
### vFile:symlink
-#### Brief
-
Create a symbolic link (symlink, soft-link) on the target system.
#### Example
@@ -2544,8 +2363,6 @@ optionally followed by the value of errno if it failed, also base 16.
### vFile:open
-#### Brief
-
Open a file on the remote system and return the file descriptor of it.
#### Example
@@ -2568,8 +2385,6 @@ response is `F` followed by the opened file descriptor in base 16.
### vFile:close
-#### Brief
-
Close a previously opened file descriptor.
#### Example
@@ -2584,8 +2399,6 @@ errno is base 16.
### vFile:pread
-#### Brief
-
Read data from an opened file descriptor.
#### Example
@@ -2605,8 +2418,6 @@ semicolon, followed by the data in the binary-escaped-data encoding.
### vFile:pwrite
-#### Brief
-
Write data to a previously opened file descriptor.
#### Example
@@ -2625,8 +2436,6 @@ Response is `F`, followed by the number of bytes written (base 16).
### vFile:MD5
-#### Brief
-
Generate an MD5 hash of the file at the given path.
#### Example
@@ -2648,8 +2457,6 @@ or failed to hash.
### vFile:exists
-#### Brief
-
Check whether the file at the given path exists.
#### Example
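
Stepping back from the section reshuffling in this file: every `send packet:` / `read packet:` example above uses the same framing, `$<payload>#<checksum>`, where the checksum is the payload's byte sum modulo 256 as two hex digits, and a dummy `#00` is tolerated once QStartNoAckMode is in effect. A small self-contained sketch of that framing:

```cpp
#include <cstdint>
#include <cstdio>
#include <string>

// Frame a gdb-remote payload as $<payload>#<two-hex-digit checksum>,
// where the checksum is the byte sum of the payload modulo 256.
std::string FramePacket(const std::string &payload) {
  std::uint8_t sum = 0;
  for (unsigned char c : payload)
    sum = static_cast<std::uint8_t>(sum + c);
  char tail[4];
  std::snprintf(tail, sizeof(tail), "#%02x", sum);
  return "$" + payload + tail;
}

int main() {
  // Prints "$OK#9a", matching the "read packet: $OK#9a" examples above.
  std::printf("%s\n", FramePacket("OK").c_str());
}
```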
diff --git a/lldb/include/lldb/API/SBExpressionOptions.h b/lldb/include/lldb/API/SBExpressionOptions.h
index e0ddfda5ba37..19c416d0f3bc 100644
--- a/lldb/include/lldb/API/SBExpressionOptions.h
+++ b/lldb/include/lldb/API/SBExpressionOptions.h
@@ -10,6 +10,7 @@
#define LLDB_API_SBEXPRESSIONOPTIONS_H
#include "lldb/API/SBDefines.h"
+#include "lldb/API/SBLanguages.h"
#include <vector>
@@ -67,6 +68,10 @@ public:
void SetTrapExceptions(bool trap_exceptions = true);
void SetLanguage(lldb::LanguageType language);
+ /// Set the language using a pair of language code and version as
+ /// defined by the DWARF 6 specification.
+ /// WARNING: These codes may change until DWARF 6 is finalized.
+ void SetLanguage(SBSourceLanguageName name, uint32_t version);
#ifndef SWIG
void SetCancelCallback(lldb::ExpressionCancelCallback callback, void *baton);
diff --git a/lldb/include/lldb/Expression/Expression.h b/lldb/include/lldb/Expression/Expression.h
index 3e61d78828bb..356fe4b82ae4 100644
--- a/lldb/include/lldb/Expression/Expression.h
+++ b/lldb/include/lldb/Expression/Expression.h
@@ -47,11 +47,8 @@ public:
/// expression. Text() should contain the definition of this function.
virtual const char *FunctionName() = 0;
- /// Return the language that should be used when parsing. To use the
- /// default, return eLanguageTypeUnknown.
- virtual lldb::LanguageType Language() const {
- return lldb::eLanguageTypeUnknown;
- }
+ /// Return the language that should be used when parsing.
+ virtual SourceLanguage Language() const { return {}; }
/// Return the Materializer that the parser should use when registering
/// external values.
diff --git a/lldb/include/lldb/Expression/LLVMUserExpression.h b/lldb/include/lldb/Expression/LLVMUserExpression.h
index 7d32d17dbf54..40b463933c07 100644
--- a/lldb/include/lldb/Expression/LLVMUserExpression.h
+++ b/lldb/include/lldb/Expression/LLVMUserExpression.h
@@ -52,7 +52,7 @@ public:
};
LLVMUserExpression(ExecutionContextScope &exe_scope, llvm::StringRef expr,
- llvm::StringRef prefix, lldb::LanguageType language,
+ llvm::StringRef prefix, SourceLanguage language,
ResultType desired_type,
const EvaluateExpressionOptions &options);
~LLVMUserExpression() override;
diff --git a/lldb/include/lldb/Expression/UserExpression.h b/lldb/include/lldb/Expression/UserExpression.h
index b6cfeec7e899..b04d00b72e8f 100644
--- a/lldb/include/lldb/Expression/UserExpression.h
+++ b/lldb/include/lldb/Expression/UserExpression.h
@@ -56,7 +56,7 @@ public:
/// If not eResultTypeAny, the type to use for the expression
/// result.
UserExpression(ExecutionContextScope &exe_scope, llvm::StringRef expr,
- llvm::StringRef prefix, lldb::LanguageType language,
+ llvm::StringRef prefix, SourceLanguage language,
ResultType desired_type,
const EvaluateExpressionOptions &options);
@@ -202,7 +202,7 @@ public:
virtual bool IsParseCacheable() { return true; }
/// Return the language that should be used when parsing. To use the
/// default, return eLanguageTypeUnknown.
- lldb::LanguageType Language() const override { return m_language; }
+ SourceLanguage Language() const override { return m_language; }
/// Return the desired result type of the function, or eResultTypeAny if
/// indifferent.
@@ -315,19 +315,22 @@ protected:
lldb::ProcessSP &process_sp,
lldb::StackFrameSP &frame_sp);
- Address m_address; ///< The address the process is stopped in.
- std::string m_expr_text; ///< The text of the expression, as typed by the user
- std::string m_expr_prefix; ///< The text of the translation-level definitions,
- ///as provided by the user
- std::string m_fixed_text; ///< The text of the expression with fix-its applied
- ///- this won't be set if the fixed text doesn't
- ///parse.
- lldb::LanguageType m_language; ///< The language to use when parsing
- ///(eLanguageTypeUnknown means use defaults)
- ResultType m_desired_type; ///< The type to coerce the expression's result to.
- ///If eResultTypeAny, inferred from the expression.
- EvaluateExpressionOptions
- m_options; ///< Additional options provided by the user.
+ /// The address the process is stopped in.
+ Address m_address;
+ /// The text of the expression, as typed by the user.
+ std::string m_expr_text;
+ /// The text of the translation-level definitions, as provided by the user.
+ std::string m_expr_prefix;
+ /// The text of the expression with fix-its applied; this won't be set if the
+ /// fixed text doesn't parse.
+ std::string m_fixed_text;
+ /// The language to use when parsing (unknown means use defaults).
+ SourceLanguage m_language;
+ /// The type to coerce the expression's result to. If eResultTypeAny, inferred
+ /// from the expression.
+ ResultType m_desired_type;
+ /// Additional options provided by the user.
+ EvaluateExpressionOptions m_options;
};
} // namespace lldb_private
diff --git a/lldb/include/lldb/Symbol/TypeSystem.h b/lldb/include/lldb/Symbol/TypeSystem.h
index ad3b853227a9..0924e21a6b54 100644
--- a/lldb/include/lldb/Symbol/TypeSystem.h
+++ b/lldb/include/lldb/Symbol/TypeSystem.h
@@ -483,12 +483,10 @@ public:
return IsPointerOrReferenceType(type, nullptr);
}
- virtual UserExpression *
- GetUserExpression(llvm::StringRef expr, llvm::StringRef prefix,
- lldb::LanguageType language,
- Expression::ResultType desired_type,
- const EvaluateExpressionOptions &options,
- ValueObject *ctx_obj) {
+ virtual UserExpression *GetUserExpression(
+ llvm::StringRef expr, llvm::StringRef prefix, SourceLanguage language,
+ Expression::ResultType desired_type,
+ const EvaluateExpressionOptions &options, ValueObject *ctx_obj) {
return nullptr;
}
diff --git a/lldb/include/lldb/Target/StackFrame.h b/lldb/include/lldb/Target/StackFrame.h
index 6c18511c6e1a..52f0a1ee6621 100644
--- a/lldb/include/lldb/Target/StackFrame.h
+++ b/lldb/include/lldb/Target/StackFrame.h
@@ -1,3 +1,4 @@
+
//===-- StackFrame.h --------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
@@ -446,13 +447,12 @@ public:
/// Query this frame to determine what the default language should be when
/// parsing expressions given the execution context.
///
- /// \return
- /// The language of the frame if known, else lldb::eLanguageTypeUnknown.
- lldb::LanguageType GetLanguage();
+ /// \return The language of the frame if known.
+ SourceLanguage GetLanguage();
- // similar to GetLanguage(), but is allowed to take a potentially incorrect
- // guess if exact information is not available
- lldb::LanguageType GuessLanguage();
+ /// Similar to GetLanguage(), but is allowed to take a potentially incorrect
+ /// guess if exact information is not available.
+ SourceLanguage GuessLanguage();
/// Attempt to reconstruct the ValueObject for a given raw address touched by
/// the current instruction. The ExpressionPath should indicate how to get
diff --git a/lldb/include/lldb/Target/Target.h b/lldb/include/lldb/Target/Target.h
index cade60f3cc8c..7ad9f3358605 100644
--- a/lldb/include/lldb/Target/Target.h
+++ b/lldb/include/lldb/Target/Target.h
@@ -200,7 +200,7 @@ public:
bool GetBreakpointsConsultPlatformAvoidList();
- lldb::LanguageType GetLanguage() const;
+ SourceLanguage GetLanguage() const;
llvm::StringRef GetExpressionPrefixContents();
@@ -310,9 +310,18 @@ public:
m_execution_policy = policy;
}
- lldb::LanguageType GetLanguage() const { return m_language; }
+ SourceLanguage GetLanguage() const { return m_language; }
- void SetLanguage(lldb::LanguageType language) { m_language = language; }
+ void SetLanguage(lldb::LanguageType language_type) {
+ m_language = SourceLanguage(language_type);
+ }
+
+ /// Set the language using a pair of language code and version as
+ /// defined by the DWARF 6 specification.
+ /// WARNING: These codes may change until DWARF 6 is finalized.
+ void SetLanguage(uint16_t name, uint32_t version) {
+ m_language = SourceLanguage(name, version);
+ }
bool DoesCoerceToId() const { return m_coerce_to_id; }
@@ -445,7 +454,7 @@ public:
private:
ExecutionPolicy m_execution_policy = default_execution_policy;
- lldb::LanguageType m_language = lldb::eLanguageTypeUnknown;
+ SourceLanguage m_language;
std::string m_prefix;
bool m_coerce_to_id = false;
bool m_unwind_on_error = true;
@@ -1160,7 +1169,7 @@ public:
UserExpression *
GetUserExpressionForLanguage(llvm::StringRef expr, llvm::StringRef prefix,
- lldb::LanguageType language,
+ SourceLanguage language,
Expression::ResultType desired_type,
const EvaluateExpressionOptions &options,
ValueObject *ctx_obj, Status &error);
diff --git a/lldb/include/lldb/lldb-private-types.h b/lldb/include/lldb/lldb-private-types.h
index 7d301666df1a..055eea9f6456 100644
--- a/lldb/include/lldb/lldb-private-types.h
+++ b/lldb/include/lldb/lldb-private-types.h
@@ -96,6 +96,25 @@ struct RegisterSet {
const uint32_t *registers;
};
+/// A type-erased pair of llvm::dwarf::SourceLanguageName and version.
+struct SourceLanguage {
+ SourceLanguage() = default;
+ SourceLanguage(lldb::LanguageType language_type);
+ SourceLanguage(uint16_t name, uint32_t version)
+ : name(name), version(version) {}
+ SourceLanguage(std::optional<std::pair<uint16_t, uint32_t>> name_vers)
+ : name(name_vers ? name_vers->first : 0),
+ version(name_vers ? name_vers->second : 0) {}
+ operator bool() const { return name > 0; }
+ lldb::LanguageType AsLanguageType() const;
+ llvm::StringRef GetDescription() const;
+ bool IsC() const;
+ bool IsObjC() const;
+ bool IsCPlusPlus() const;
+ uint16_t name = 0;
+ uint32_t version = 0;
+};
+
struct OptionEnumValueElement {
int64_t value;
const char *string_value;
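
A minimal sketch of how this type-erased pair is meant to behave, assuming the constructors and predicates declared above; the DW_LNAME_* constants come from llvm/BinaryFormat/Dwarf.h and the version value is illustrative:

    #include "lldb/lldb-private-types.h"
    #include "llvm/BinaryFormat/Dwarf.h"
    #include <cassert>

    void DemoSourceLanguage() {
      // Default-constructed: name == 0, so operator bool() yields false.
      lldb_private::SourceLanguage unknown;
      assert(!unknown);

      // Construct from a DWARF 6 (name, version) pair.
      lldb_private::SourceLanguage cxx(llvm::dwarf::DW_LNAME_C_plus_plus,
                                       /*version=*/201703);
      assert(cxx && cxx.IsCPlusPlus());

      // Round-trip through the classic lldb::LanguageType enumeration.
      lldb_private::SourceLanguage back(cxx.AsLanguageType());
      assert(back.IsCPlusPlus());
    }
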
diff --git a/lldb/packages/Python/lldbsuite/test/configuration.py b/lldb/packages/Python/lldbsuite/test/configuration.py
index 685f491c85fe..dbd4a2d72a15 100644
--- a/lldb/packages/Python/lldbsuite/test/configuration.py
+++ b/lldb/packages/Python/lldbsuite/test/configuration.py
@@ -119,6 +119,7 @@ all_tests = set()
# LLDB library directory.
lldb_libs_dir = None
+lldb_obj_root = None
libcxx_include_dir = None
libcxx_include_target_dir = None
diff --git a/lldb/packages/Python/lldbsuite/test/decorators.py b/lldb/packages/Python/lldbsuite/test/decorators.py
index 8e13aa6a1388..7fb88cef1653 100644
--- a/lldb/packages/Python/lldbsuite/test/decorators.py
+++ b/lldb/packages/Python/lldbsuite/test/decorators.py
@@ -206,6 +206,7 @@ def _decorateTest(
remote=None,
dwarf_version=None,
setting=None,
+ asan=None,
):
def fn(actual_debug_info=None):
skip_for_os = _match_decorator_property(
@@ -256,6 +257,7 @@ def _decorateTest(
)
)
skip_for_setting = (setting is None) or (setting in configuration.settings)
+ skip_for_asan = (asan is None) or is_running_under_asan()
# For the test to be skipped, all specified (e.g. not None) parameters must be True.
# An unspecified parameter means "any", so those are marked skip by default. And we skip
@@ -273,6 +275,7 @@ def _decorateTest(
(remote, skip_for_remote, "platform locality (remote/local)"),
(dwarf_version, skip_for_dwarf_version, "dwarf version"),
(setting, skip_for_setting, "setting"),
+ (asan, skip_for_asan, "running under asan"),
]
reasons = []
final_skip_result = True
@@ -331,6 +334,7 @@ def expectedFailureAll(
remote=None,
dwarf_version=None,
setting=None,
+ asan=None,
):
return _decorateTest(
DecorateMode.Xfail,
@@ -348,6 +352,7 @@ def expectedFailureAll(
remote=remote,
dwarf_version=dwarf_version,
setting=setting,
+ asan=asan,
)
@@ -356,7 +361,7 @@ def expectedFailureAll(
# for example,
# @skipIf, skip for all platform/compiler/arch,
# @skipIf(compiler='gcc'), skip for gcc on all platform/architecture
-# @skipIf(bugnumber, ["linux"], "gcc", ['>=', '4.9'], ['i386']), skip for gcc>=4.9 on linux with i386
+# @skipIf(bugnumber, ["linux"], "gcc", ['>=', '4.9'], ['i386']), skip for gcc>=4.9 on linux with i386 (all conditions must be true)
def skipIf(
bugnumber=None,
oslist=None,
@@ -372,6 +377,7 @@ def skipIf(
remote=None,
dwarf_version=None,
setting=None,
+ asan=None,
):
return _decorateTest(
DecorateMode.Skip,
@@ -389,6 +395,7 @@ def skipIf(
remote=remote,
dwarf_version=dwarf_version,
setting=setting,
+ asan=asan,
)
diff --git a/lldb/packages/Python/lldbsuite/test/dotest.py b/lldb/packages/Python/lldbsuite/test/dotest.py
index 2ec4a840b916..ebabf348643e 100644
--- a/lldb/packages/Python/lldbsuite/test/dotest.py
+++ b/lldb/packages/Python/lldbsuite/test/dotest.py
@@ -423,6 +423,7 @@ def parseOptionsAndInitTestdirs():
configuration.lldb_module_cache_dir = os.path.join(
configuration.test_build_dir, "module-cache-lldb"
)
+
if args.clang_module_cache_dir:
configuration.clang_module_cache_dir = args.clang_module_cache_dir
else:
@@ -432,6 +433,8 @@ def parseOptionsAndInitTestdirs():
if args.lldb_libs_dir:
configuration.lldb_libs_dir = args.lldb_libs_dir
+ if args.lldb_obj_root:
+ configuration.lldb_obj_root = args.lldb_obj_root
if args.enabled_plugins:
configuration.enabled_plugins = args.enabled_plugins
diff --git a/lldb/packages/Python/lldbsuite/test/dotest_args.py b/lldb/packages/Python/lldbsuite/test/dotest_args.py
index e4de786ec054..8b00c7a4d56e 100644
--- a/lldb/packages/Python/lldbsuite/test/dotest_args.py
+++ b/lldb/packages/Python/lldbsuite/test/dotest_args.py
@@ -237,10 +237,16 @@ def create_parser():
help="The clang module cache directory used in the Make files by Clang while building tests. Defaults to <test build directory>/module-cache-clang.",
)
group.add_argument(
+ "--lldb-obj-root",
+ dest="lldb_obj_root",
+ metavar="path",
+ help="The path to the LLDB object files.",
+ )
+ group.add_argument(
"--lldb-libs-dir",
dest="lldb_libs_dir",
metavar="path",
- help="The path to LLDB library directory (containing liblldb)",
+ help="The path to LLDB library directory (containing liblldb).",
)
group.add_argument(
"--enable-plugin",
diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py
index 7a7afec73457..591e834c7cdf 100644
--- a/lldb/packages/Python/lldbsuite/test/lldbtest.py
+++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py
@@ -1473,11 +1473,12 @@ class Base(unittest.TestCase):
d = {
"CXX_SOURCES": sources,
"EXE": exe_name,
- "CFLAGS_EXTRAS": "%s %s -I%s"
+ "CFLAGS_EXTRAS": "%s %s -I%s -I%s"
% (
stdflag,
stdlibflag,
os.path.join(os.environ["LLDB_SRC"], "include"),
+ os.path.join(configuration.lldb_obj_root, "include"),
),
"LD_EXTRAS": "-L%s -lliblldb" % lib_dir,
}
@@ -1485,11 +1486,12 @@ class Base(unittest.TestCase):
d = {
"CXX_SOURCES": sources,
"EXE": exe_name,
- "CFLAGS_EXTRAS": "%s %s -I%s"
+ "CFLAGS_EXTRAS": "%s %s -I%s -I%s"
% (
stdflag,
stdlibflag,
os.path.join(os.environ["LLDB_SRC"], "include"),
+ os.path.join(configuration.lldb_obj_root, "include"),
),
"LD_EXTRAS": "-L%s -llldb -Wl,-rpath,%s" % (lib_dir, lib_dir),
}
@@ -1508,7 +1510,8 @@ class Base(unittest.TestCase):
d = {
"DYLIB_CXX_SOURCES": sources,
"DYLIB_NAME": lib_name,
- "CFLAGS_EXTRAS": "%s -stdlib=libc++" % stdflag,
+ "CFLAGS_EXTRAS": "%s -stdlib=libc++ -I%s"
+ % (stdflag, os.path.join(configuration.lldb_obj_root, "include")),
"FRAMEWORK_INCLUDES": "-F%s" % self.framework_dir,
"LD_EXTRAS": "%s -Wl,-rpath,%s -dynamiclib"
% (self.lib_lldb, self.framework_dir),
@@ -1517,16 +1520,24 @@ class Base(unittest.TestCase):
d = {
"DYLIB_CXX_SOURCES": sources,
"DYLIB_NAME": lib_name,
- "CFLAGS_EXTRAS": "%s -I%s "
- % (stdflag, os.path.join(os.environ["LLDB_SRC"], "include")),
+ "CFLAGS_EXTRAS": "%s -I%s -I%s"
+ % (
+ stdflag,
+ os.path.join(os.environ["LLDB_SRC"], "include"),
+ os.path.join(configuration.lldb_obj_root, "include"),
+ ),
"LD_EXTRAS": "-shared -l%s\liblldb.lib" % lib_dir,
}
else:
d = {
"DYLIB_CXX_SOURCES": sources,
"DYLIB_NAME": lib_name,
- "CFLAGS_EXTRAS": "%s -I%s -fPIC"
- % (stdflag, os.path.join(os.environ["LLDB_SRC"], "include")),
+ "CFLAGS_EXTRAS": "%s -I%s -I%s -fPIC"
+ % (
+ stdflag,
+ os.path.join(os.environ["LLDB_SRC"], "include"),
+ os.path.join(configuration.lldb_obj_root, "include"),
+ ),
"LD_EXTRAS": "-shared -L%s -llldb -Wl,-rpath,%s" % (lib_dir, lib_dir),
}
if self.TraceOn():
diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt
index 57cc44f76467..ad960403ae70 100644
--- a/lldb/source/API/CMakeLists.txt
+++ b/lldb/source/API/CMakeLists.txt
@@ -20,6 +20,10 @@ if(LLDB_ENABLE_LUA)
set(lldb_lua_wrapper ${lua_bindings_dir}/LLDBWrapLua.cpp)
endif()
+lldb_tablegen(../../include/lldb/API/SBLanguages.h -gen-lldb-sbapi-dwarf-enum
+ SOURCE ${LLVM_MAIN_INCLUDE_DIR}/llvm/BinaryFormat/Dwarf.def
+ TARGET lldb-sbapi-dwarf-enums)
+
add_lldb_library(liblldb SHARED ${option_framework}
SBAddress.cpp
SBAttachInfo.cpp
@@ -100,6 +104,9 @@ add_lldb_library(liblldb SHARED ${option_framework}
${lldb_python_wrapper}
${lldb_lua_wrapper}
+ DEPENDS
+ lldb-sbapi-dwarf-enums
+
LINK_LIBS
lldbBreakpoint
lldbCore
diff --git a/lldb/source/API/SBExpressionOptions.cpp b/lldb/source/API/SBExpressionOptions.cpp
index bd81a04596b1..ce686112ff71 100644
--- a/lldb/source/API/SBExpressionOptions.cpp
+++ b/lldb/source/API/SBExpressionOptions.cpp
@@ -156,6 +156,13 @@ void SBExpressionOptions::SetLanguage(lldb::LanguageType language) {
m_opaque_up->SetLanguage(language);
}
+void SBExpressionOptions::SetLanguage(SBSourceLanguageName name,
+ uint32_t version) {
+ LLDB_INSTRUMENT_VA(this, name, version);
+
+ m_opaque_up->SetLanguage(name, version);
+}
+
void SBExpressionOptions::SetCancelCallback(
lldb::ExpressionCancelCallback callback, void *baton) {
LLDB_INSTRUMENT_VA(this, callback, baton);
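
A hedged sketch of the new overload from client code; SBLanguages.h is generated from Dwarf.def (see the TableGen changes below), and the version value here is illustrative:

    #include "lldb/API/SBExpressionOptions.h"
    #include "lldb/API/SBLanguages.h"

    lldb::SBExpressionOptions MakeCxxOptions() {
      lldb::SBExpressionOptions options;
      // Select C++ by its DWARF 6 name code plus a YYYYMM version.
      options.SetLanguage(eLanguageNameC_plus_plus, /*version=*/201703);
      return options;
    }
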
diff --git a/lldb/source/API/SBFrame.cpp b/lldb/source/API/SBFrame.cpp
index a16bbc2ae756..47fc88625e30 100644
--- a/lldb/source/API/SBFrame.cpp
+++ b/lldb/source/API/SBFrame.cpp
@@ -1024,10 +1024,10 @@ SBValue SBFrame::EvaluateExpression(const char *expr) {
options.SetFetchDynamicValue(fetch_dynamic_value);
options.SetUnwindOnError(true);
options.SetIgnoreBreakpoints(true);
- if (target->GetLanguage() != eLanguageTypeUnknown)
- options.SetLanguage(target->GetLanguage());
- else
- options.SetLanguage(frame->GetLanguage());
+ SourceLanguage language = target->GetLanguage();
+ if (!language)
+ language = frame->GetLanguage();
+ options.SetLanguage((SBSourceLanguageName)language.name, language.version);
return EvaluateExpression(expr, options);
} else {
Status error;
@@ -1053,10 +1053,12 @@ SBFrame::EvaluateExpression(const char *expr,
StackFrame *frame = exe_ctx.GetFramePtr();
Target *target = exe_ctx.GetTargetPtr();
- if (target && target->GetLanguage() != eLanguageTypeUnknown)
- options.SetLanguage(target->GetLanguage());
- else if (frame)
- options.SetLanguage(frame->GetLanguage());
+ SourceLanguage language;
+ if (target)
+ language = target->GetLanguage();
+ if (!language && frame)
+ language = frame->GetLanguage();
+ options.SetLanguage((SBSourceLanguageName)language.name, language.version);
return EvaluateExpression(expr, options);
}
@@ -1074,10 +1076,12 @@ SBValue SBFrame::EvaluateExpression(const char *expr,
options.SetIgnoreBreakpoints(true);
StackFrame *frame = exe_ctx.GetFramePtr();
Target *target = exe_ctx.GetTargetPtr();
- if (target && target->GetLanguage() != eLanguageTypeUnknown)
- options.SetLanguage(target->GetLanguage());
- else if (frame)
- options.SetLanguage(frame->GetLanguage());
+ SourceLanguage language;
+ if (target)
+ language = target->GetLanguage();
+ if (!language && frame)
+ language = frame->GetLanguage();
+ options.SetLanguage((SBSourceLanguageName)language.name, language.version);
return EvaluateExpression(expr, options);
}
@@ -1218,7 +1222,7 @@ lldb::LanguageType SBFrame::GuessLanguage() const {
if (stop_locker.TryLock(&process->GetRunLock())) {
frame = exe_ctx.GetFramePtr();
if (frame) {
- return frame->GuessLanguage();
+ return frame->GuessLanguage().AsLanguageType();
}
}
}
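
All three EvaluateExpression overloads above share the same fallback; a condensed sketch of the pattern (the helper name is hypothetical):

    // Hypothetical helper: prefer the target's explicit language setting,
    // fall back to the frame's language, and hand the result (possibly
    // still unknown) to the expression options.
    static void SetDefaultLanguage(lldb::SBExpressionOptions &options,
                                   lldb_private::Target *target,
                                   lldb_private::StackFrame *frame) {
      lldb_private::SourceLanguage language;
      if (target)
        language = target->GetLanguage();
      if (!language && frame)
        language = frame->GetLanguage();
      options.SetLanguage((SBSourceLanguageName)language.name,
                          language.version);
    }
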
diff --git a/lldb/source/Breakpoint/Watchpoint.cpp b/lldb/source/Breakpoint/Watchpoint.cpp
index a128ced57504..edb1a0e93460 100644
--- a/lldb/source/Breakpoint/Watchpoint.cpp
+++ b/lldb/source/Breakpoint/Watchpoint.cpp
@@ -460,9 +460,8 @@ void Watchpoint::SetCondition(const char *condition) {
// Pass nullptr for expr_prefix (no translation-unit level definitions).
Status error;
m_condition_up.reset(m_target.GetUserExpressionForLanguage(
- condition, llvm::StringRef(), lldb::eLanguageTypeUnknown,
- UserExpression::eResultTypeAny, EvaluateExpressionOptions(), nullptr,
- error));
+ condition, {}, {}, UserExpression::eResultTypeAny,
+ EvaluateExpressionOptions(), nullptr, error));
if (error.Fail()) {
// FIXME: Log something...
m_condition_up.reset();
diff --git a/lldb/source/Commands/CommandObjectDWIMPrint.cpp b/lldb/source/Commands/CommandObjectDWIMPrint.cpp
index e1255f37d9bc..57a372a762e1 100644
--- a/lldb/source/Commands/CommandObjectDWIMPrint.cpp
+++ b/lldb/source/Commands/CommandObjectDWIMPrint.cpp
@@ -93,10 +93,10 @@ void CommandObjectDWIMPrint::DoExecute(StringRef command,
StackFrame *frame = m_exe_ctx.GetFramePtr();
- // Either Swift was explicitly specified, or the frame is Swift.
+ // Either the language was explicitly specified, or we check the frame.
lldb::LanguageType language = m_expr_options.language;
if (language == lldb::eLanguageTypeUnknown && frame)
- language = frame->GuessLanguage();
+ language = frame->GuessLanguage().AsLanguageType();
// Add a hint if object description was requested, but no description
// function was implemented.
diff --git a/lldb/source/Commands/CommandObjectType.cpp b/lldb/source/Commands/CommandObjectType.cpp
index 97489bdc2d9c..46537dd1b98a 100644
--- a/lldb/source/Commands/CommandObjectType.cpp
+++ b/lldb/source/Commands/CommandObjectType.cpp
@@ -2509,7 +2509,7 @@ protected:
if (!frame)
return lang_type;
- lang_type = frame->GuessLanguage();
+ lang_type = frame->GuessLanguage().AsLanguageType();
if (lang_type != lldb::eLanguageTypeUnknown)
return lang_type;
diff --git a/lldb/source/Expression/LLVMUserExpression.cpp b/lldb/source/Expression/LLVMUserExpression.cpp
index 9c31cc84bf8f..1434011c80ad 100644
--- a/lldb/source/Expression/LLVMUserExpression.cpp
+++ b/lldb/source/Expression/LLVMUserExpression.cpp
@@ -42,7 +42,7 @@ char LLVMUserExpression::ID;
LLVMUserExpression::LLVMUserExpression(ExecutionContextScope &exe_scope,
llvm::StringRef expr,
llvm::StringRef prefix,
- lldb::LanguageType language,
+ SourceLanguage language,
ResultType desired_type,
const EvaluateExpressionOptions &options)
: UserExpression(exe_scope, expr, prefix, language, desired_type, options),
diff --git a/lldb/source/Expression/UserExpression.cpp b/lldb/source/Expression/UserExpression.cpp
index c181712a2f0b..5658426c8891 100644
--- a/lldb/source/Expression/UserExpression.cpp
+++ b/lldb/source/Expression/UserExpression.cpp
@@ -39,6 +39,7 @@
#include "lldb/Utility/Log.h"
#include "lldb/Utility/State.h"
#include "lldb/Utility/StreamString.h"
+#include "llvm/BinaryFormat/Dwarf.h"
using namespace lldb_private;
@@ -46,8 +47,7 @@ char UserExpression::ID;
UserExpression::UserExpression(ExecutionContextScope &exe_scope,
llvm::StringRef expr, llvm::StringRef prefix,
- lldb::LanguageType language,
- ResultType desired_type,
+ SourceLanguage language, ResultType desired_type,
const EvaluateExpressionOptions &options)
: Expression(exe_scope), m_expr_text(std::string(expr)),
m_expr_prefix(std::string(prefix)), m_language(language),
@@ -176,7 +176,7 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx,
}
lldb_private::ExecutionPolicy execution_policy = options.GetExecutionPolicy();
- lldb::LanguageType language = options.GetLanguage();
+ SourceLanguage language = options.GetLanguage();
const ResultType desired_type = options.DoesCoerceToId()
? UserExpression::eResultTypeId
: UserExpression::eResultTypeAny;
@@ -242,7 +242,7 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx,
// If the language was not specified in the expression command, set it to the
// language in the target's properties if specified, else default to the
// language for the frame.
- if (language == lldb::eLanguageTypeUnknown) {
+ if (!language) {
if (target->GetLanguage() != lldb::eLanguageTypeUnknown)
language = target->GetLanguage();
else if (StackFrame *frame = exe_ctx.GetFramePtr())
@@ -384,7 +384,8 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx,
} else {
if (expr_result) {
result_valobj_sp = expr_result->GetValueObject();
- result_valobj_sp->SetPreferredDisplayLanguage(language);
+ result_valobj_sp->SetPreferredDisplayLanguage(
+ language.AsLanguageType());
LLDB_LOG(log,
"== [UserExpression::Evaluate] Execution completed "
@@ -426,7 +427,8 @@ UserExpression::Execute(DiagnosticManager &diagnostic_manager,
Target *target = exe_ctx.GetTargetPtr();
if (options.GetSuppressPersistentResult() && result_var && target) {
if (auto *persistent_state =
- target->GetPersistentExpressionStateForLanguage(m_language))
+ target->GetPersistentExpressionStateForLanguage(
+ m_language.AsLanguageType()))
persistent_state->RemovePersistentVariable(result_var);
}
return expr_result;
diff --git a/lldb/source/Expression/UtilityFunction.cpp b/lldb/source/Expression/UtilityFunction.cpp
index d7a3c9d41d04..7b34c2c2ff76 100644
--- a/lldb/source/Expression/UtilityFunction.cpp
+++ b/lldb/source/Expression/UtilityFunction.cpp
@@ -80,8 +80,8 @@ FunctionCaller *UtilityFunction::MakeFunctionCaller(
name.append("-caller");
m_caller_up.reset(process_sp->GetTarget().GetFunctionCallerForLanguage(
- Language(), return_type, impl_code_address, arg_value_list, name.c_str(),
- error));
+ Language().AsLanguageType(), return_type, impl_code_address,
+ arg_value_list, name.c_str(), error));
if (error.Fail()) {
return nullptr;
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp
index f48bdc730d91..72c7cda13ecb 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp
@@ -392,8 +392,8 @@ ClangExpressionParser::ClangExpressionParser(
// Make sure clang uses the same VFS as LLDB.
m_compiler->createFileManager(FileSystem::Instance().GetVirtualFileSystem());
- lldb::LanguageType frame_lang =
- expr.Language(); // defaults to lldb::eLanguageTypeUnknown
+ // Defaults to lldb::eLanguageTypeUnknown.
+ lldb::LanguageType frame_lang = expr.Language().AsLanguageType();
std::string abi;
ArchSpec target_arch;
@@ -410,7 +410,7 @@ ClangExpressionParser::ClangExpressionParser(
// Make sure the user hasn't provided a preferred execution language with
// `expression --language X -- ...`
if (frame_sp && frame_lang == lldb::eLanguageTypeUnknown)
- frame_lang = frame_sp->GetLanguage();
+ frame_lang = frame_sp->GetLanguage().AsLanguageType();
if (process_sp && frame_lang != lldb::eLanguageTypeUnknown) {
LLDB_LOGF(log, "Frame has language of type %s",
@@ -479,7 +479,7 @@ ClangExpressionParser::ClangExpressionParser(
assert(m_compiler->hasTarget());
// 4. Set language options.
- lldb::LanguageType language = expr.Language();
+ lldb::LanguageType language = expr.Language().AsLanguageType();
LangOptions &lang_opts = m_compiler->getLangOpts();
switch (language) {
@@ -1344,10 +1344,10 @@ lldb_private::Status ClangExpressionParser::PrepareForExecution(
{
auto lang = m_expr.Language();
LLDB_LOGF(log, "%s - Current expression language is %s\n", __FUNCTION__,
- Language::GetNameForLanguageType(lang));
+ lang.GetDescription().data());
lldb::ProcessSP process_sp = exe_ctx.GetProcessSP();
if (process_sp && lang != lldb::eLanguageTypeUnknown) {
- auto runtime = process_sp->GetLanguageRuntime(lang);
+ auto runtime = process_sp->GetLanguageRuntime(lang.AsLanguageType());
if (runtime)
runtime->GetIRPasses(custom_passes);
}
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp
index 5776b1e94e07..5ea7bc02a6e4 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp
@@ -56,6 +56,7 @@
#include "clang/AST/DeclObjC.h"
#include "llvm/ADT/ScopeExit.h"
+#include "llvm/BinaryFormat/Dwarf.h"
using namespace lldb_private;
@@ -63,22 +64,21 @@ char ClangUserExpression::ID;
ClangUserExpression::ClangUserExpression(
ExecutionContextScope &exe_scope, llvm::StringRef expr,
- llvm::StringRef prefix, lldb::LanguageType language,
- ResultType desired_type, const EvaluateExpressionOptions &options,
- ValueObject *ctx_obj)
+ llvm::StringRef prefix, SourceLanguage language, ResultType desired_type,
+ const EvaluateExpressionOptions &options, ValueObject *ctx_obj)
: LLVMUserExpression(exe_scope, expr, prefix, language, desired_type,
options),
m_type_system_helper(*m_target_wp.lock(), options.GetExecutionPolicy() ==
eExecutionPolicyTopLevel),
m_result_delegate(exe_scope.CalculateTarget()), m_ctx_obj(ctx_obj) {
- switch (m_language) {
- case lldb::eLanguageTypeC_plus_plus:
+ switch (m_language.name) {
+ case llvm::dwarf::DW_LNAME_C_plus_plus:
m_allow_cxx = true;
break;
- case lldb::eLanguageTypeObjC:
+ case llvm::dwarf::DW_LNAME_ObjC:
m_allow_objc = true;
break;
- case lldb::eLanguageTypeObjC_plus_plus:
+ case llvm::dwarf::DW_LNAME_ObjC_plus_plus:
default:
m_allow_cxx = true;
m_allow_objc = true;
@@ -624,7 +624,8 @@ bool ClangUserExpression::TryParse(
void ClangUserExpression::SetupCppModuleImports(ExecutionContext &exe_ctx) {
Log *log = GetLog(LLDBLog::Expressions);
- CppModuleConfiguration module_config = GetModuleConfig(m_language, exe_ctx);
+ CppModuleConfiguration module_config =
+ GetModuleConfig(m_language.AsLanguageType(), exe_ctx);
m_imported_cpp_modules = module_config.GetImportedModules();
m_include_directories = module_config.GetIncludeDirs();
@@ -734,7 +735,7 @@ bool ClangUserExpression::Parse(DiagnosticManager &diagnostic_manager,
if (register_execution_unit) {
if (auto *persistent_state =
exe_ctx.GetTargetPtr()->GetPersistentExpressionStateForLanguage(
- m_language))
+ m_language.AsLanguageType()))
persistent_state->RegisterExecutionUnit(m_execution_unit_sp);
}
}
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.h b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.h
index bc07cbcf9e64..09604feea5de 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.h
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.h
@@ -106,8 +106,8 @@ public:
/// definitions to be included when the expression is parsed.
///
/// \param[in] language
- /// If not eLanguageTypeUnknown, a language to use when parsing
- /// the expression. Currently restricted to those languages
+ /// If not unknown, a language to use when parsing the
+ /// expression. Currently restricted to those languages
/// supported by Clang.
///
/// \param[in] desired_type
@@ -122,7 +122,7 @@ public:
/// must be evaluated. For details see the comment to
/// `UserExpression::Evaluate`.
ClangUserExpression(ExecutionContextScope &exe_scope, llvm::StringRef expr,
- llvm::StringRef prefix, lldb::LanguageType language,
+ llvm::StringRef prefix, SourceLanguage language,
ResultType desired_type,
const EvaluateExpressionOptions &options,
ValueObject *ctx_obj);
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index 8fc0f9103f55..ec005ac46b34 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -9596,7 +9596,7 @@ void ScratchTypeSystemClang::Dump(llvm::raw_ostream &output) {
}
UserExpression *ScratchTypeSystemClang::GetUserExpression(
- llvm::StringRef expr, llvm::StringRef prefix, lldb::LanguageType language,
+ llvm::StringRef expr, llvm::StringRef prefix, SourceLanguage language,
Expression::ResultType desired_type,
const EvaluateExpressionOptions &options, ValueObject *ctx_obj) {
TargetSP target_sp = m_target_wp.lock();
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
index 62f14df7638d..8a7d45254bcf 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
@@ -1280,12 +1280,12 @@ public:
/// \see lldb_private::TypeSystem::Dump
void Dump(llvm::raw_ostream &output) override;
- UserExpression *
- GetUserExpression(llvm::StringRef expr, llvm::StringRef prefix,
- lldb::LanguageType language,
- Expression::ResultType desired_type,
- const EvaluateExpressionOptions &options,
- ValueObject *ctx_obj) override;
+ UserExpression *GetUserExpression(llvm::StringRef expr,
+ llvm::StringRef prefix,
+ SourceLanguage language,
+ Expression::ResultType desired_type,
+ const EvaluateExpressionOptions &options,
+ ValueObject *ctx_obj) override;
FunctionCaller *GetFunctionCaller(const CompilerType &return_type,
const Address &function_address,
diff --git a/lldb/source/Target/Language.cpp b/lldb/source/Target/Language.cpp
index 1542c8cb68ce..d0bffe441f63 100644
--- a/lldb/source/Target/Language.cpp
+++ b/lldb/source/Target/Language.cpp
@@ -19,6 +19,7 @@
#include "lldb/Target/Target.h"
#include "lldb/Utility/Stream.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/Support/Threading.h"
using namespace lldb;
@@ -532,3 +533,36 @@ Language::Language() = default;
// Destructor
Language::~Language() = default;
+
+SourceLanguage::SourceLanguage(lldb::LanguageType language_type) {
+ auto lname =
+ llvm::dwarf::toDW_LNAME((llvm::dwarf::SourceLanguage)language_type);
+ if (!lname)
+ return;
+ name = lname->first;
+ version = lname->second;
+}
+
+lldb::LanguageType SourceLanguage::AsLanguageType() const {
+ if (auto lang = llvm::dwarf::toDW_LANG((llvm::dwarf::SourceLanguageName)name,
+ version))
+ return (lldb::LanguageType)*lang;
+ return lldb::eLanguageTypeUnknown;
+}
+
+llvm::StringRef SourceLanguage::GetDescription() const {
+ LanguageType type = AsLanguageType();
+ if (type)
+ return Language::GetNameForLanguageType(type);
+ return llvm::dwarf::LanguageDescription(
+ (llvm::dwarf::SourceLanguageName)name);
+}
+
+bool SourceLanguage::IsC() const { return name == llvm::dwarf::DW_LNAME_C; }
+
+bool SourceLanguage::IsObjC() const {
+ return name == llvm::dwarf::DW_LNAME_ObjC;
+}
+
+bool SourceLanguage::IsCPlusPlus() const {
+ return name == llvm::dwarf::DW_LNAME_C_plus_plus;
+}
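
A short sketch of the conversions these definitions enable, assuming toDW_LNAME/toDW_LANG cover the language in question:

    #include "lldb/lldb-private-types.h"
    #include <cassert>

    void CheckRoundTrip() {
      using namespace lldb_private;
      // LanguageType -> (name, version) -> LanguageType is stable for
      // languages DWARF 6 can represent.
      SourceLanguage objc(lldb::eLanguageTypeObjC);
      assert(objc.IsObjC());
      assert(objc.AsLanguageType() == lldb::eLanguageTypeObjC);

      // Languages without a DW_LNAME mapping stay in the default state.
      SourceLanguage unknown(lldb::eLanguageTypeUnknown);
      assert(!unknown);
      assert(unknown.AsLanguageType() == lldb::eLanguageTypeUnknown);
    }
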
diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp
index 03a74f29e76e..246871d5abaa 100644
--- a/lldb/source/Target/StackFrame.cpp
+++ b/lldb/source/Target/StackFrame.cpp
@@ -1203,26 +1203,23 @@ bool StackFrame::IsArtificial() const {
return m_stack_frame_kind == StackFrame::Kind::Artificial;
}
-lldb::LanguageType StackFrame::GetLanguage() {
+SourceLanguage StackFrame::GetLanguage() {
CompileUnit *cu = GetSymbolContext(eSymbolContextCompUnit).comp_unit;
if (cu)
return cu->GetLanguage();
- return lldb::eLanguageTypeUnknown;
+ return {};
}
-lldb::LanguageType StackFrame::GuessLanguage() {
- LanguageType lang_type = GetLanguage();
+SourceLanguage StackFrame::GuessLanguage() {
+ SourceLanguage lang_type = GetLanguage();
if (lang_type == eLanguageTypeUnknown) {
- SymbolContext sc = GetSymbolContext(eSymbolContextFunction
- | eSymbolContextSymbol);
- if (sc.function) {
- lang_type = sc.function->GetMangled().GuessLanguage();
- }
+ SymbolContext sc =
+ GetSymbolContext(eSymbolContextFunction | eSymbolContextSymbol);
+ if (sc.function)
+ lang_type = LanguageType(sc.function->GetMangled().GuessLanguage());
else if (sc.symbol)
- {
- lang_type = sc.symbol->GetMangled().GuessLanguage();
- }
+ lang_type = SourceLanguage(sc.symbol->GetMangled().GuessLanguage());
}
return lang_type;
@@ -1302,7 +1299,7 @@ GetBaseExplainingDereference(const Instruction::Operand &operand,
}
return std::make_pair(nullptr, 0);
}
-}
+} // namespace
lldb::ValueObjectSP StackFrame::GuessValueForAddress(lldb::addr_t addr) {
TargetSP target_sp = CalculateTarget();
diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp
index aa4895bb5a6d..82f3040e539a 100644
--- a/lldb/source/Target/Target.cpp
+++ b/lldb/source/Target/Target.cpp
@@ -504,7 +504,7 @@ BreakpointSP Target::CreateBreakpoint(
if (skip_prologue == eLazyBoolCalculate)
skip_prologue = GetSkipPrologue() ? eLazyBoolYes : eLazyBoolNo;
if (language == lldb::eLanguageTypeUnknown)
- language = GetLanguage();
+ language = GetLanguage().AsLanguageType();
BreakpointResolverSP resolver_sp(new BreakpointResolverName(
nullptr, func_name, func_name_type_mask, language, Breakpoint::Exact,
@@ -530,7 +530,7 @@ Target::CreateBreakpoint(const FileSpecList *containingModules,
if (skip_prologue == eLazyBoolCalculate)
skip_prologue = GetSkipPrologue() ? eLazyBoolYes : eLazyBoolNo;
if (language == lldb::eLanguageTypeUnknown)
- language = GetLanguage();
+ language = GetLanguage().AsLanguageType();
BreakpointResolverSP resolver_sp(
new BreakpointResolverName(nullptr, func_names, func_name_type_mask,
@@ -559,7 +559,7 @@ Target::CreateBreakpoint(const FileSpecList *containingModules,
skip_prologue = eLazyBoolNo;
}
if (language == lldb::eLanguageTypeUnknown)
- language = GetLanguage();
+ language = GetLanguage().AsLanguageType();
BreakpointResolverSP resolver_sp(new BreakpointResolverName(
nullptr, func_names, num_names, func_name_type_mask, language, offset,
@@ -2504,15 +2504,16 @@ Target::GetPersistentExpressionStateForLanguage(lldb::LanguageType language) {
}
UserExpression *Target::GetUserExpressionForLanguage(
- llvm::StringRef expr, llvm::StringRef prefix, lldb::LanguageType language,
+ llvm::StringRef expr, llvm::StringRef prefix, SourceLanguage language,
Expression::ResultType desired_type,
const EvaluateExpressionOptions &options, ValueObject *ctx_obj,
Status &error) {
- auto type_system_or_err = GetScratchTypeSystemForLanguage(language);
+ auto type_system_or_err =
+ GetScratchTypeSystemForLanguage(language.AsLanguageType());
if (auto err = type_system_or_err.takeError()) {
error.SetErrorStringWithFormat(
"Could not find type system for language %s: %s",
- Language::GetNameForLanguageType(language),
+ Language::GetNameForLanguageType(language.AsLanguageType()),
llvm::toString(std::move(err)).c_str());
return nullptr;
}
@@ -2521,7 +2522,7 @@ UserExpression *Target::GetUserExpressionForLanguage(
if (!ts) {
error.SetErrorStringWithFormat(
"Type system for language %s is no longer live",
- Language::GetNameForLanguageType(language));
+ language.GetDescription().data());
return nullptr;
}
@@ -2530,7 +2531,7 @@ UserExpression *Target::GetUserExpressionForLanguage(
if (!user_expr)
error.SetErrorStringWithFormat(
"Could not create an expression for language %s",
- Language::GetNameForLanguageType(language));
+ language.GetDescription().data());
return user_expr;
}
@@ -4646,9 +4647,9 @@ void TargetProperties::SetStandardErrorPath(llvm::StringRef path) {
SetPropertyAtIndex(idx, path);
}
-LanguageType TargetProperties::GetLanguage() const {
+SourceLanguage TargetProperties::GetLanguage() const {
const uint32_t idx = ePropertyLanguage;
- return GetPropertyAtIndexAs<LanguageType>(idx, {});
+ return {GetPropertyAtIndexAs<LanguageType>(idx, {})};
}
llvm::StringRef TargetProperties::GetExpressionPrefixContents() {
diff --git a/lldb/test/API/driver/batch_mode/TestBatchMode.py b/lldb/test/API/driver/batch_mode/TestBatchMode.py
index 642dd47c6d45..bc6f2daebbab 100644
--- a/lldb/test/API/driver/batch_mode/TestBatchMode.py
+++ b/lldb/test/API/driver/batch_mode/TestBatchMode.py
@@ -13,6 +13,7 @@ from lldbsuite.test.lldbpexpect import PExpectTest
class DriverBatchModeTest(PExpectTest):
source = "main.c"
+ @skipIf(macos_version=["<", "14.0"], asan=True)
@skipIf(oslist=["linux"], archs=["arm", "aarch64"]) # Randomly fails on buildbot
@expectedFlakeyFreeBSD("llvm.org/pr25172 fails rarely on the buildbot")
def test_batch_mode_run_crash(self):
@@ -50,6 +51,7 @@ class DriverBatchModeTest(PExpectTest):
self.expect_prompt()
self.expect("frame variable touch_me_not", substrs=["(char *) touch_me_not"])
+ @skipIf(macos_version=["<", "14.0"], asan=True)
@skipIf(oslist=["linux"], archs=["arm", "aarch64"]) # Randomly fails on buildbot
@expectedFlakeyFreeBSD("llvm.org/pr25172 fails rarely on the buildbot")
def test_batch_mode_run_exit(self):
@@ -86,6 +88,7 @@ class DriverBatchModeTest(PExpectTest):
child.expect(pexpect.EOF)
+ @skipIf(macos_version=["<", "14.0"], asan=True)
@skipIf(oslist=["linux"], archs=["arm", "aarch64"]) # Randomly fails on buildbot
@expectedFlakeyFreeBSD("llvm.org/pr25172 fails rarely on the buildbot")
def test_batch_mode_launch_stop_at_entry(self):
@@ -125,6 +128,7 @@ class DriverBatchModeTest(PExpectTest):
self.victim.close()
self.victim = None
+ @skipIf(macos_version=["<", "14.0"], asan=True)
@skipIf(oslist=["linux"], archs=["arm", "aarch64"]) # Randomly fails on buildbot
@expectedFlakeyFreeBSD("llvm.org/pr25172 fails rarely on the buildbot")
@expectedFailureNetBSD
diff --git a/lldb/test/API/driver/job_control/TestJobControl.py b/lldb/test/API/driver/job_control/TestJobControl.py
index 1a1739f4cb39..648acb1d4730 100644
--- a/lldb/test/API/driver/job_control/TestJobControl.py
+++ b/lldb/test/API/driver/job_control/TestJobControl.py
@@ -8,6 +8,7 @@ from lldbsuite.test.lldbpexpect import PExpectTest
class JobControlTest(PExpectTest):
+ @skipIf(macos_version=["<", "14.0"], asan=True)
@skipIf(oslist=["linux"], archs=["arm", "aarch64"])
def test_job_control(self):
def post_spawn():
diff --git a/lldb/test/API/driver/quit_speed/TestQuitWithProcess.py b/lldb/test/API/driver/quit_speed/TestQuitWithProcess.py
index 42527c88b992..c75ac977ea20 100644
--- a/lldb/test/API/driver/quit_speed/TestQuitWithProcess.py
+++ b/lldb/test/API/driver/quit_speed/TestQuitWithProcess.py
@@ -31,4 +31,4 @@ class DriverQuitSpeedTest(PExpectTest):
print("Got launch message")
child.sendline("quit")
print("sent quit")
- child.expect(pexpect.EOF, timeout=15)
+ child.expect(pexpect.EOF, timeout=60)
diff --git a/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py b/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py
index 2dcbb728549f..dd9500c186b2 100644
--- a/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py
+++ b/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py
@@ -48,6 +48,8 @@ class TestConcurrentVFork(TestBase):
self.expect("continue", patterns=[r"exited with status = 1[0-4]"])
@skipUnlessPlatform(["linux"])
+ # https://github.com/llvm/llvm-project/issues/85084.
+ @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
def test_follow_parent_vfork_no_exec(self):
"""
Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-parent.
@@ -56,6 +58,8 @@ class TestConcurrentVFork(TestBase):
self.follow_parent_helper(use_fork=False, call_exec=False)
@skipUnlessPlatform(["linux"])
+ # https://github.com/llvm/llvm-project/issues/85084.
+ @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
def test_follow_parent_fork_no_exec(self):
"""
Make sure that debugging concurrent fork() from multiple threads won't crash lldb during follow-parent.
@@ -64,6 +68,8 @@ class TestConcurrentVFork(TestBase):
self.follow_parent_helper(use_fork=True, call_exec=False)
@skipUnlessPlatform(["linux"])
+ # https://github.com/llvm/llvm-project/issues/85084.
+ @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
def test_follow_parent_vfork_call_exec(self):
"""
Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-parent.
@@ -72,6 +78,8 @@ class TestConcurrentVFork(TestBase):
self.follow_parent_helper(use_fork=False, call_exec=True)
@skipUnlessPlatform(["linux"])
+ # https://github.com/llvm/llvm-project/issues/85084.
+ @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
def test_follow_parent_fork_call_exec(self):
"""
Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-parent.
@@ -80,6 +88,8 @@ class TestConcurrentVFork(TestBase):
self.follow_parent_helper(use_fork=True, call_exec=True)
@skipUnlessPlatform(["linux"])
+ # https://github.com/llvm/llvm-project/issues/85084.
+ @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
def test_follow_child_vfork_no_exec(self):
"""
Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-child.
@@ -88,6 +98,8 @@ class TestConcurrentVFork(TestBase):
self.follow_child_helper(use_fork=False, call_exec=False)
@skipUnlessPlatform(["linux"])
+ # https://github.com/llvm/llvm-project/issues/85084.
+ @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
def test_follow_child_fork_no_exec(self):
"""
Make sure that debugging concurrent fork() from multiple threads won't crash lldb during follow-child.
@@ -96,6 +108,8 @@ class TestConcurrentVFork(TestBase):
self.follow_child_helper(use_fork=True, call_exec=False)
@skipUnlessPlatform(["linux"])
+ # https://github.com/llvm/llvm-project/issues/85084.
+ @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
def test_follow_child_vfork_call_exec(self):
"""
Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-child.
@@ -104,6 +118,8 @@ class TestConcurrentVFork(TestBase):
self.follow_child_helper(use_fork=False, call_exec=True)
@skipUnlessPlatform(["linux"])
+ # https://github.com/llvm/llvm-project/issues/85084.
+ @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
def test_follow_child_fork_call_exec(self):
"""
Make sure that debugging concurrent fork() from multiple threads won't crash lldb during follow-child.
diff --git a/lldb/test/API/iohandler/sigint/TestProcessIOHandlerInterrupt.py b/lldb/test/API/iohandler/sigint/TestProcessIOHandlerInterrupt.py
index 8e19d56cd0c2..75ac0f6c0289 100644
--- a/lldb/test/API/iohandler/sigint/TestProcessIOHandlerInterrupt.py
+++ b/lldb/test/API/iohandler/sigint/TestProcessIOHandlerInterrupt.py
@@ -11,6 +11,7 @@ from lldbsuite.test.lldbpexpect import PExpectTest
class TestCase(PExpectTest):
+ @skipIf(macos_version=["<", "14.0"], asan=True)
@skipIf(compiler="clang", compiler_version=["<", "11.0"])
@skipIf(oslist=["linux"], archs=["arm", "aarch64"])
def test(self):
diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py
index 9d6775917e13..d934349fe3ca 100644
--- a/lldb/test/API/lit.cfg.py
+++ b/lldb/test/API/lit.cfg.py
@@ -239,6 +239,9 @@ if is_configured("llvm_tools_dir"):
if is_configured("server"):
dotest_cmd += ["--server", config.server]
+if is_configured("lldb_obj_root"):
+ dotest_cmd += ["--lldb-obj-root", config.lldb_obj_root]
+
if is_configured("lldb_libs_dir"):
dotest_cmd += ["--lldb-libs-dir", config.lldb_libs_dir]
diff --git a/lldb/test/API/lit.site.cfg.py.in b/lldb/test/API/lit.site.cfg.py.in
index 053331dc4881..8b2d09ae41cd 100644
--- a/lldb/test/API/lit.site.cfg.py.in
+++ b/lldb/test/API/lit.site.cfg.py.in
@@ -8,7 +8,7 @@ config.llvm_include_dir = lit_config.substitute("@LLVM_INCLUDE_DIR@")
config.llvm_shlib_dir = lit_config.substitute("@SHLIBDIR@")
config.llvm_build_mode = lit_config.substitute("@LLVM_BUILD_MODE@")
config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@"
-config.lldb_obj_root = "@LLDB_BINARY_DIR@"
+config.lldb_obj_root = lit_config.substitute("@LLDB_BINARY_DIR@")
config.lldb_src_root = "@LLDB_SOURCE_DIR@"
config.lldb_libs_dir = lit_config.substitute("@LLDB_LIBS_DIR@")
config.lldb_framework_dir = lit_config.substitute("@LLDB_FRAMEWORK_DIR@")
diff --git a/lldb/test/API/macosx/nslog/TestDarwinNSLogOutput.py b/lldb/test/API/macosx/nslog/TestDarwinNSLogOutput.py
index 15d9feb54389..f6bda16560b9 100644
--- a/lldb/test/API/macosx/nslog/TestDarwinNSLogOutput.py
+++ b/lldb/test/API/macosx/nslog/TestDarwinNSLogOutput.py
@@ -20,8 +20,6 @@ from lldbsuite.test import lldbtest_config
class DarwinNSLogOutputTestCase(TestBase):
NO_DEBUG_INFO_TESTCASE = True
- @skipUnlessDarwin
- @skipIfRemote # this test is currently written using lldb commands & assumes running on local system
def setUp(self):
# Call super's setUp().
TestBase.setUp(self)
@@ -119,6 +117,9 @@ class DarwinNSLogOutputTestCase(TestBase):
self.runCmd("process continue")
self.expect(expect_regexes)
+ @skipIf(oslist=["linux"], archs=["arm", "aarch64"])
+ @skipUnlessDarwin
+ @skipIfRemote # this test is currently written using lldb commands & assumes running on local system
def test_nslog_output_is_displayed(self):
"""Test that NSLog() output shows up in the command-line debugger."""
self.do_test(
@@ -131,6 +132,9 @@ class DarwinNSLogOutputTestCase(TestBase):
self.assertGreater(len(self.child.match.groups()), 0)
self.assertEqual("This is a message from NSLog", self.child.match.group(1))
+ @skipIf(oslist=["linux"], archs=["arm", "aarch64"])
+ @skipUnlessDarwin
+ @skipIfRemote # this test is currently written using lldb commands & assumes running on local system
def test_nslog_output_is_suppressed_with_env_var(self):
"""Test that NSLog() output does not show up with the ignore env var."""
# This test will only work properly on macOS 10.12+. Skip it on earlier versions.
diff --git a/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py b/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py
index 21aca5fc85d5..313a265319db 100644
--- a/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py
+++ b/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py
@@ -19,6 +19,7 @@ class TestSTTYBeforeAndAfter(TestBase):
cls.RemoveTempFile("child_send2.txt")
cls.RemoveTempFile("child_read2.txt")
+ @skipIf(macos_version=["<", "14.0"], asan=True)
@add_test_categories(["pexpect"])
@no_debug_info_test
def test_stty_dash_a_before_and_afetr_invoking_lldb_command(self):
diff --git a/lldb/utils/TableGen/CMakeLists.txt b/lldb/utils/TableGen/CMakeLists.txt
index 47a6400b4287..68547fe13e1a 100644
--- a/lldb/utils/TableGen/CMakeLists.txt
+++ b/lldb/utils/TableGen/CMakeLists.txt
@@ -10,6 +10,7 @@ if (NOT DEFINED LLDB_TABLEGEN_EXE)
add_tablegen(lldb-tblgen LLDB
LLDBOptionDefEmitter.cpp
LLDBPropertyDefEmitter.cpp
+ LLDBSBAPIDWARFEnum.cpp
LLDBTableGen.cpp
LLDBTableGenUtils.cpp
)
diff --git a/lldb/utils/TableGen/LLDBSBAPIDWARFEnum.cpp b/lldb/utils/TableGen/LLDBSBAPIDWARFEnum.cpp
new file mode 100644
index 000000000000..084284ed6aa8
--- /dev/null
+++ b/lldb/utils/TableGen/LLDBSBAPIDWARFEnum.cpp
@@ -0,0 +1,67 @@
+//===- LLDBSBAPIDWARFEnum.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Produce the list of source languages header file fragment for the SBAPI.
+//
+//===----------------------------------------------------------------------===//
+
+#include <fstream>
+#include <llvm/ADT/StringRef.h>
+#include <regex>
+
+namespace lldb_private {
+int EmitSBAPIDWARFEnum(int argc, char **argv) {
+ std::string InputFilename;
+ std::string OutputFilename;
+ std::string DepFilename;
+ // This command line option parser is as robust as the worst shell script.
+ for (int i = 0; i < argc; ++i) {
+ if (llvm::StringRef(argv[i]).ends_with("Dwarf.def"))
+ InputFilename = std::string(argv[i]);
+ if (llvm::StringRef(argv[i]) == "-o" && i + 1 < argc)
+ OutputFilename = std::string(argv[i + 1]);
+ if (llvm::StringRef(argv[i]) == "-d" && i + 1 < argc)
+ DepFilename = std::string(argv[i + 1]);
+ }
+ std::ifstream input(InputFilename);
+ std::ofstream output(OutputFilename);
+ output
+ << R"(//===-- SBLanguages.h -----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_API_SBLANGUAGE_H
+#define LLDB_API_SBLANGUAGE_H
+/// Used by \ref SBExpressionOptions.
+/// These enumerations use the same language enumerations as the DWARF
+/// specification for ease of use and consistency.
+enum SBSourceLanguageName : uint16_t {
+)";
+ std::string line;
+ std::regex macro_regex(R"(^ *HANDLE_DW_LNAME *\( *([^,]+), ([^,]+), )"
+ "\"(.*)\",.*\\).*",
+ std::regex::extended);
+ while (std::getline(input, line)) {
+ std::smatch match;
+ if (!std::regex_match(line, match, macro_regex))
+ continue;
+
+ output << " /// " << match[3] << ".\n";
+ output << " eLanguageName" << match[2] << " = " << match[1] << ",\n";
+ }
+ output << "};\n\n";
+ output << "#endif\n";
+ // Emit the dependencies file.
+ std::ofstream(DepFilename) << OutputFilename << ": " << InputFilename << '\n';
+ return 0;
+}
+} // namespace lldb_private
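
For reference, the emitter above turns each HANDLE_DW_LNAME line of Dwarf.def into one documented enumerator; the fragment below sketches the shape of the generated header (the names and values come from Dwarf.def, so the ones shown are illustrative):

    enum SBSourceLanguageName : uint16_t {
      /// ISO Ada.
      eLanguageNameAda = 0x0001,
      /// ISO C.
      eLanguageNameC = 0x0003,
      /// ISO C++.
      eLanguageNameC_plus_plus = 0x0004,
      // ... one enumerator per HANDLE_DW_LNAME entry ...
    };
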
diff --git a/lldb/utils/TableGen/LLDBTableGen.cpp b/lldb/utils/TableGen/LLDBTableGen.cpp
index abb6589f0ca6..75d167556047 100644
--- a/lldb/utils/TableGen/LLDBTableGen.cpp
+++ b/lldb/utils/TableGen/LLDBTableGen.cpp
@@ -27,6 +27,7 @@ enum ActionType {
GenOptionDefs,
GenPropertyDefs,
GenPropertyEnumDefs,
+ GenSBAPIDWARFEnum
};
static cl::opt<ActionType> Action(
@@ -40,6 +41,8 @@ static cl::opt<ActionType> Action(
clEnumValN(GenPropertyDefs, "gen-lldb-property-defs",
"Generate lldb property definitions"),
clEnumValN(GenPropertyEnumDefs, "gen-lldb-property-enum-defs",
+ "Generate lldb property enum definitions"),
+ clEnumValN(GenSBAPIDWARFEnum, "gen-lldb-sbapi-dwarf-enum",
"Generate lldb property enum definitions")));
static bool LLDBTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
@@ -59,6 +62,8 @@ static bool LLDBTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
case GenPropertyEnumDefs:
EmitPropertyEnumDefs(Records, OS);
break;
+ case GenSBAPIDWARFEnum:
+ llvm_unreachable("already handled");
}
return false;
}
@@ -67,9 +72,11 @@ int main(int argc, char **argv) {
sys::PrintStackTraceOnErrorSignal(argv[0]);
PrettyStackTraceProgram X(argc, argv);
cl::ParseCommandLineOptions(argc, argv);
-
llvm_shutdown_obj Y;
+ if (Action == GenSBAPIDWARFEnum)
+ return EmitSBAPIDWARFEnum(argc, argv);
+
return TableGenMain(argv[0], &LLDBTableGenMain);
}
diff --git a/lldb/utils/TableGen/LLDBTableGenBackends.h b/lldb/utils/TableGen/LLDBTableGenBackends.h
index 88ae0888c22d..b60c4705de3a 100644
--- a/lldb/utils/TableGen/LLDBTableGenBackends.h
+++ b/lldb/utils/TableGen/LLDBTableGenBackends.h
@@ -32,6 +32,7 @@ namespace lldb_private {
void EmitOptionDefs(RecordKeeper &RK, raw_ostream &OS);
void EmitPropertyDefs(RecordKeeper &RK, raw_ostream &OS);
void EmitPropertyEnumDefs(RecordKeeper &RK, raw_ostream &OS);
+int EmitSBAPIDWARFEnum(int argc, char **argv);
} // namespace lldb_private
diff --git a/llvm/cmake/modules/llvm-driver-template.cpp.in b/llvm/cmake/modules/llvm-driver-template.cpp.in
index 71aca6cd140c..1470ef1f0616 100644
--- a/llvm/cmake/modules/llvm-driver-template.cpp.in
+++ b/llvm/cmake/modules/llvm-driver-template.cpp.in
@@ -6,9 +6,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Support/LLVMDriver.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/LLVMDriver.h"
int @TOOL_NAME@_main(int argc, char **, const llvm::ToolContext &);
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index f169ab941c45..37662f79145d 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -18805,7 +18805,7 @@ runtime, then the result vector is a :ref:`poison value <poisonvalues>`. The
``idx`` parameter must be a vector index constant type (for most targets this
will be an integer pointer type).
-'``llvm.experimental.vector.reverse``' Intrinsic
+'``llvm.vector.reverse``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
@@ -18814,25 +18814,26 @@ This is an overloaded intrinsic.
::
- declare <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8> %a)
- declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ declare <2 x i8> @llvm.vector.reverse.v2i8(<2 x i8> %a)
+ declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
Overview:
"""""""""
-The '``llvm.experimental.vector.reverse.*``' intrinsics reverse a vector.
+The '``llvm.vector.reverse.*``' intrinsics reverse a vector.
The intrinsic takes a single vector and returns a vector of matching type but
with the original lane order reversed. These intrinsics work for both fixed
-and scalable vectors. While this intrinsic is marked as experimental the
-recommended way to express reverse operations for fixed-width vectors is still
-to use a shufflevector, as that may allow for more optimization opportunities.
+and scalable vectors. While this intrinsic supports all vector types,
+the recommended way to express this operation for fixed-width vectors is
+still to use a shufflevector, as that may allow for more optimization
+opportunities.
Arguments:
""""""""""
The argument to this intrinsic must be a vector.
-'``llvm.experimental.vector.deinterleave2``' Intrinsic
+'``llvm.vector.deinterleave2``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
@@ -18841,13 +18842,13 @@ This is an overloaded intrinsic.
::
- declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec1)
- declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec1)
+ declare {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double> %vec1)
+ declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec1)
Overview:
"""""""""
-The '``llvm.experimental.vector.deinterleave2``' intrinsic constructs two
+The '``llvm.vector.deinterleave2``' intrinsic constructs two
vectors by deinterleaving the even and odd lanes of the input vector.
This intrinsic works for both fixed and scalable vectors. While this intrinsic
@@ -18859,7 +18860,7 @@ For example:
.. code-block:: text
- {<2 x i64>, <2 x i64>} llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> <i64 0, i64 1, i64 2, i64 3>); ==> {<2 x i64> <i64 0, i64 2>, <2 x i64> <i64 1, i64 3>}
+ {<2 x i64>, <2 x i64>} llvm.vector.deinterleave2.v4i64(<4 x i64> <i64 0, i64 1, i64 2, i64 3>); ==> {<2 x i64> <i64 0, i64 2>, <2 x i64> <i64 1, i64 3>}
Arguments:
""""""""""
@@ -18867,7 +18868,7 @@ Arguments:
The argument is a vector whose type corresponds to the logical concatenation of
the two result types.
-'``llvm.experimental.vector.interleave2``' Intrinsic
+'``llvm.vector.interleave2``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
@@ -18876,13 +18877,13 @@ This is an overloaded intrinsic.
::
- declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %vec1, <2 x double> %vec2)
- declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2)
+ declare <4 x double> @llvm.vector.interleave2.v4f64(<2 x double> %vec1, <2 x double> %vec2)
+ declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2)
Overview:
"""""""""
-The '``llvm.experimental.vector.interleave2``' intrinsic constructs a vector
+The '``llvm.vector.interleave2``' intrinsic constructs a vector
by interleaving two input vectors.
This intrinsic works for both fixed and scalable vectors. While this intrinsic
@@ -18894,7 +18895,7 @@ For example:
.. code-block:: text
- <4 x i64> llvm.experimental.vector.interleave2.v4i64(<2 x i64> <i64 0, i64 2>, <2 x i64> <i64 1, i64 3>); ==> <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+ <4 x i64> llvm.vector.interleave2.v4i64(<2 x i64> <i64 0, i64 2>, <2 x i64> <i64 1, i64 3>); ==> <4 x i64> <i64 0, i64 1, i64 2, i64 3>
Arguments:
""""""""""
@@ -18940,7 +18941,7 @@ The '``llvm.experimental.cttz.elts``' intrinsic counts the trailing (least
significant) zero elements in a vector. If ``src == 0`` the result is the
number of elements in the input vector.
-'``llvm.experimental.vector.splice``' Intrinsic
+'``llvm.vector.splice``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
@@ -18949,13 +18950,13 @@ This is an overloaded intrinsic.
::
- declare <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %vec1, <2 x double> %vec2, i32 %imm)
- declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2, i32 %imm)
+ declare <2 x double> @llvm.vector.splice.v2f64(<2 x double> %vec1, <2 x double> %vec2, i32 %imm)
+ declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2, i32 %imm)
Overview:
"""""""""
-The '``llvm.experimental.vector.splice.*``' intrinsics construct a vector by
+The '``llvm.vector.splice.*``' intrinsics construct a vector by
concatenating elements from the first input vector with elements of the second
input vector, returning a vector of the same type as the input vectors. The
signed immediate, modulo the number of elements in the vector, is the index
@@ -18966,7 +18967,7 @@ immediate, it extracts ``-imm`` trailing elements from the first vector, and
the remaining elements from ``%vec2``.
These intrinsics work for both fixed and scalable vectors. While this intrinsic
-is marked as experimental, the recommended way to express this operation for
+supports all vector types, the recommended way to express this operation for
fixed-width vectors is still to use a shufflevector, as that may allow for more
optimization opportunities.
@@ -18974,8 +18975,8 @@ For example:
.. code-block:: text
- llvm.experimental.vector.splice(<A,B,C,D>, <E,F,G,H>, 1); ==> <B, C, D, E> index
- llvm.experimental.vector.splice(<A,B,C,D>, <E,F,G,H>, -3); ==> <B, C, D, E> trailing elements
+ llvm.vector.splice(<A,B,C,D>, <E,F,G,H>, 1); ==> <B, C, D, E> index
+ llvm.vector.splice(<A,B,C,D>, <E,F,G,H>, -3); ==> <B, C, D, E> trailing elements
Arguments:
@@ -22198,7 +22199,7 @@ Overview:
"""""""""
The '``llvm.experimental.vp.splice.*``' intrinsic is the vector length
-predicated version of the '``llvm.experimental.vector.splice.*``' intrinsic.
+predicated version of the '``llvm.vector.splice.*``' intrinsic.
Arguments:
""""""""""
@@ -22257,7 +22258,7 @@ Overview:
"""""""""
The '``llvm.experimental.vp.reverse.*``' intrinsic is the vector length
-predicated version of the '``llvm.experimental.vector.reverse.*``' intrinsic.
+predicated version of the '``llvm.vector.reverse.*``' intrinsic.
Arguments:
""""""""""
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 64a698325212..46d79d6c5822 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -50,7 +50,11 @@ Update on required toolchains to build LLVM
Changes to the LLVM IR
----------------------
-- Added Memory Model Relaxation Annotations (MMRAs).
+* Added Memory Model Relaxation Annotations (MMRAs).
+* Renamed ``llvm.experimental.vector.reverse`` intrinsic to ``llvm.vector.reverse``.
+* Renamed ``llvm.experimental.vector.splice`` intrinsic to ``llvm.vector.splice``.
+* Renamed ``llvm.experimental.vector.interleave2`` intrinsic to ``llvm.vector.interleave2``.
+* Renamed ``llvm.experimental.vector.deinterleave2`` intrinsic to ``llvm.vector.deinterleave2``.
Changes to LLVM infrastructure
------------------------------
diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index 571e44cdac26..afd18e7e56ba 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -860,7 +860,8 @@ enum class OverflowResult {
};
OverflowResult computeOverflowForUnsignedMul(const Value *LHS, const Value *RHS,
- const SimplifyQuery &SQ);
+ const SimplifyQuery &SQ,
+ bool IsNSW = false);
OverflowResult computeOverflowForSignedMul(const Value *LHS, const Value *RHS,
const SimplifyQuery &SQ);
OverflowResult
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index 56b5d4e399c6..f296acc2ca4b 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -1939,11 +1939,12 @@ uint16_t convertArchNameToEMachine(StringRef Arch);
/// Convert an ELF's e_machine value into an architecture name.
StringRef convertEMachineToArchName(uint16_t EMachine);
-/// Convert a OS into ELF's EI_OSABI value.
-uint8_t convertOSToOSAbi(StringRef OS);
+/// Convert a lowercase string identifier into an OSABI value.
+uint8_t convertNameToOSABI(StringRef Name);
-/// Convert an ELF's e_machine value into an architecture name.
-StringRef convertOSAbiToOS(uint8_t OSAbi);
+/// Convert an OSABI value into a string that identifies the OS- or ABI-
+/// specific ELF extension.
+StringRef convertOSABIToName(uint8_t OSABI);
} // end namespace ELF
} // end namespace llvm
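
A sketch of the renamed helpers in use ("gnu" stands in for whatever lowercase identifiers the lookup table accepts):

    #include "llvm/BinaryFormat/ELF.h"

    // Round-trip an OS/ABI identifier through the renamed helpers.
    uint8_t OSABI = llvm::ELF::convertNameToOSABI("gnu");
    llvm::StringRef Name = llvm::ELF::convertOSABIToName(OSABI);
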
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 4a3a03dc5ad4..92b51438b4cb 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1662,12 +1662,12 @@ public:
TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()),
std::nullopt, CostKind, Index, cast<VectorType>(Args[1]->getType()));
}
- case Intrinsic::experimental_vector_reverse: {
+ case Intrinsic::vector_reverse: {
return thisT()->getShuffleCost(
TTI::SK_Reverse, cast<VectorType>(Args[0]->getType()), std::nullopt,
CostKind, 0, cast<VectorType>(RetTy));
}
- case Intrinsic::experimental_vector_splice: {
+ case Intrinsic::vector_splice: {
unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
return thisT()->getShuffleCost(
TTI::SK_Splice, cast<VectorType>(Args[0]->getType()), std::nullopt,
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index 5f28908e998a..deae2c55d26e 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -247,8 +247,8 @@ private:
bool translateTrap(const CallInst &U, MachineIRBuilder &MIRBuilder,
unsigned Opcode);
- // Translate @llvm.experimental.vector.interleave2 and
- // @llvm.experimental.vector.deinterleave2 intrinsics for fixed-width vector
+ // Translate @llvm.vector.interleave2 and
+ // @llvm.vector.deinterleave2 intrinsics for fixed-width vector
// types into vector shuffles.
bool translateVectorInterleave2Intrinsic(const CallInst &CI,
MachineIRBuilder &MIRBuilder);
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 2dd978c7b584..661b2841c6ac 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3146,7 +3146,7 @@ public:
/// Lower a deinterleave intrinsic to a target specific load intrinsic.
/// Return true on success. Currently only supports
- /// llvm.experimental.vector.deinterleave2
+ /// llvm.vector.deinterleave2
///
/// \p DI is the deinterleave intrinsic.
/// \p LI is the accompanying load instruction
@@ -3157,7 +3157,7 @@ public:
/// Lower an interleave intrinsic to a target specific store intrinsic.
/// Return true on success. Currently only supports
- /// llvm.experimental.vector.interleave2
+ /// llvm.vector.interleave2
///
/// \p II is the interleave intrinsic.
/// \p SI is the accompanying store instruction
@@ -5238,6 +5238,9 @@ public:
/// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const;
+ /// Expand fminimum/fmaximum into multiple comparisons with selects.
+ SDValue expandFMINIMUM_FMAXIMUM(SDNode *N, SelectionDAG &DAG) const;
+
/// Expand FP_TO_[US]INT_SAT into FP_TO_[US]INT and selects or min/max.
/// \param N Node to expand
/// \returns The expansion result
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index a14e9dedef8c..a2678d69ce40 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2577,15 +2577,15 @@ def int_preserve_static_offset : DefaultAttrsIntrinsic<[llvm_ptr_ty],
//===------------ Intrinsics to perform common vector shuffles ------------===//
-def int_experimental_vector_reverse : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>],
- [IntrNoMem]>;
+def int_vector_reverse : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>],
+ [IntrNoMem]>;
-def int_experimental_vector_splice : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>,
- LLVMMatchType<0>,
- llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+def int_vector_splice : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>,
+ LLVMMatchType<0>,
+ llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<2>>]>;
//===---------- Intrinsics to query properties of scalable vectors --------===//
def int_vscale : DefaultAttrsIntrinsic<[llvm_anyint_ty], [], [IntrNoMem]>;
@@ -2600,15 +2600,15 @@ def int_vector_extract : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<1>>]>;
-def int_experimental_vector_interleave2 : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
- [LLVMHalfElementsVectorType<0>,
- LLVMHalfElementsVectorType<0>],
- [IntrNoMem]>;
+def int_vector_interleave2 : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMHalfElementsVectorType<0>,
+ LLVMHalfElementsVectorType<0>],
+ [IntrNoMem]>;
-def int_experimental_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType<0>,
- LLVMHalfElementsVectorType<0>],
- [llvm_anyvector_ty],
- [IntrNoMem]>;
+def int_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType<0>,
+ LLVMHalfElementsVectorType<0>],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
//===----------------- Pointer Authentication Intrinsics ------------------===//
//
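As a reference for the semantics of the renamed (de)interleave intrinsics, here is the factor-2 interleave written out on plain arrays (illustration only; the element type and lane count are assumptions):

#include <array>
#include <cstddef>

// R[2i] takes lane i of A and R[2i+1] takes lane i of B, matching
// @llvm.vector.interleave2; deinterleave2 is the inverse split.
template <std::size_t N>
std::array<int, 2 * N> interleave2(const std::array<int, N> &A,
                                   const std::array<int, N> &B) {
  std::array<int, 2 * N> R{};
  for (std::size_t I = 0; I < N; ++I) {
    R[2 * I] = A[I];
    R[2 * I + 1] = B[I];
  }
  return R;
}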
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 0b13b4aad9c3..739208e74dcb 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -2513,7 +2513,7 @@ inline typename m_Intrinsic_Ty<Opnd0, Opnd1>::Ty m_CopySign(const Opnd0 &Op0,
template <typename Opnd0>
inline typename m_Intrinsic_Ty<Opnd0>::Ty m_VecReverse(const Opnd0 &Op0) {
- return m_Intrinsic<Intrinsic::experimental_vector_reverse>(Op0);
+ return m_Intrinsic<Intrinsic::vector_reverse>(Op0);
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/Support/YAMLTraits.h b/llvm/include/llvm/Support/YAMLTraits.h
index 3b1f4bad57fc..33aeb039320d 100644
--- a/llvm/include/llvm/Support/YAMLTraits.h
+++ b/llvm/include/llvm/Support/YAMLTraits.h
@@ -671,7 +671,11 @@ inline bool isBool(StringRef S) {
// (except for TAB #x9, LF #xA, and CR #xD which are allowed), DEL #x7F, the C1
// control block #x80-#x9F (except for NEL #x85 which is allowed), the surrogate
// block #xD800-#xDFFF, #xFFFE, and #xFFFF.
-inline QuotingType needsQuotes(StringRef S) {
+//
+// Some strings are valid YAML values even when unquoted, but without quotes
+// they would be interpreted as a non-string type, for instance null, boolean,
+// or numeric values. If ForcePreserveAsString is set, such strings are quoted.
+inline QuotingType needsQuotes(StringRef S, bool ForcePreserveAsString = true) {
if (S.empty())
return QuotingType::Single;
@@ -679,12 +683,14 @@ inline QuotingType needsQuotes(StringRef S) {
if (isSpace(static_cast<unsigned char>(S.front())) ||
isSpace(static_cast<unsigned char>(S.back())))
MaxQuotingNeeded = QuotingType::Single;
- if (isNull(S))
- MaxQuotingNeeded = QuotingType::Single;
- if (isBool(S))
- MaxQuotingNeeded = QuotingType::Single;
- if (isNumeric(S))
- MaxQuotingNeeded = QuotingType::Single;
+ if (ForcePreserveAsString) {
+ if (isNull(S))
+ MaxQuotingNeeded = QuotingType::Single;
+ if (isBool(S))
+ MaxQuotingNeeded = QuotingType::Single;
+ if (isNumeric(S))
+ MaxQuotingNeeded = QuotingType::Single;
+ }
// 7.3.3 Plain Style
// Plain scalars must not begin with most indicators, as this would cause
@@ -1636,6 +1642,7 @@ public:
private:
void output(StringRef s);
+ void output(StringRef, QuotingType);
void outputUpToEndOfLine(StringRef s);
void newLineCheck(bool EmptySequence = false);
void outputNewLine();
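A short sketch of what the new flag changes for callers; the expected results shown in comments follow from the rules above:

#include "llvm/Support/YAMLTraits.h"

using llvm::yaml::needsQuotes;
using llvm::yaml::QuotingType;

void quotingExample() {
  // "true" parses as a boolean, so by default it must be quoted to
  // survive as a string value.
  bool AsValue = needsQuotes("true") != QuotingType::None;     // true
  // Map keys need not be preserved as strings, so they may stay bare.
  bool AsKey = needsQuotes("true", /*ForcePreserveAsString=*/false) !=
               QuotingType::None;                              // false
  (void)AsValue;
  (void)AsKey;
}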
diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
index ea1f4fc3b85d..855d1aeddfae 100644
--- a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
+++ b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
@@ -461,9 +461,10 @@ public:
OverflowResult computeOverflowForUnsignedMul(const Value *LHS,
const Value *RHS,
- const Instruction *CxtI) const {
- return llvm::computeOverflowForUnsignedMul(LHS, RHS,
- SQ.getWithInstruction(CxtI));
+ const Instruction *CxtI,
+ bool IsNSW = false) const {
+ return llvm::computeOverflowForUnsignedMul(
+ LHS, RHS, SQ.getWithInstruction(CxtI), IsNSW);
}
OverflowResult computeOverflowForSignedMul(const Value *LHS, const Value *RHS,
diff --git a/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h b/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h
index 9ebb95007774..429d6a2e0523 100644
--- a/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h
+++ b/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h
@@ -62,6 +62,13 @@ namespace llvm {
LibFunc TheLibFunc, AttributeList AttributeList,
FunctionType *Invalid, ArgsTy... Args) = delete;
+ // Handle -mregparm for the given function.
+ // Note that this function is a rough approximation that only works for simple
+ // function signatures; it does not apply other attributes that may be relevant
+ // to a signature, such as sign/zero-extension of arguments and return values.
+ void markRegisterParameterAttributes(Function *F);
+
/// Check whether the library function is available on target and also that
/// it in the current Module is a Function with the right type.
bool isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI,
diff --git a/llvm/include/llvm/Transforms/Utils/GlobalStatus.h b/llvm/include/llvm/Transforms/Utils/GlobalStatus.h
index 60c91fc30174..c001e587313c 100644
--- a/llvm/include/llvm/Transforms/Utils/GlobalStatus.h
+++ b/llvm/include/llvm/Transforms/Utils/GlobalStatus.h
@@ -24,9 +24,9 @@ class Value;
///
bool isSafeToDestroyConstant(const Constant *C);
-/// As we analyze each global, keep track of some information about it. If we
-/// find out that the address of the global is taken, none of this info will be
-/// accurate.
+/// As we analyze each global or thread-local variable, keep track of some
+/// information about it. If we find out that the address of the global is
+/// taken, none of this info will be accurate.
struct GlobalStatus {
/// True if the global's address is used in a comparison.
bool IsCompared = false;
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index c06984c0d494..4061dae83c10 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -6281,11 +6281,11 @@ static Value *simplifyUnaryIntrinsic(Function *F, Value *Op0,
m_Intrinsic<Intrinsic::pow>(m_SpecificFP(10.0), m_Value(X)))))
return X;
break;
- case Intrinsic::experimental_vector_reverse:
- // experimental.vector.reverse(experimental.vector.reverse(x)) -> x
+ case Intrinsic::vector_reverse:
+ // vector.reverse(vector.reverse(x)) -> x
if (match(Op0, m_VecReverse(m_Value(X))))
return X;
- // experimental.vector.reverse(splat(X)) -> splat(X)
+ // vector.reverse(splat(X)) -> splat(X)
if (isSplatValue(Op0))
return Op0;
break;
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index de38eddaa98f..1b461e7cfd01 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -6686,9 +6686,15 @@ llvm::computeConstantRangeIncludingKnownBits(const WithCache<const Value *> &V,
OverflowResult llvm::computeOverflowForUnsignedMul(const Value *LHS,
const Value *RHS,
- const SimplifyQuery &SQ) {
+ const SimplifyQuery &SQ,
+ bool IsNSW) {
KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, SQ);
KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, SQ);
+
+ // mul nsw of two non-negative numbers is also nuw.
+ if (IsNSW && LHSKnown.isNonNegative() && RHSKnown.isNonNegative())
+ return OverflowResult::NeverOverflows;
+
ConstantRange LHSRange = ConstantRange::fromKnownBits(LHSKnown, false);
ConstantRange RHSRange = ConstantRange::fromKnownBits(RHSKnown, false);
return mapOverflowResult(LHSRange.unsignedMulMayOverflow(RHSRange));
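The new IsNSW early-out is sound because a signed, non-negative, non-overflowing product is bounded by the signed maximum, which is strictly below the unsigned maximum. A brute-force illustration at i8 width (not LLVM code):

#include <cassert>

int main() {
  for (int A = 0; A <= 127; ++A)   // all non-negative i8 values
    for (int B = 0; B <= 127; ++B)
      if (A * B <= 127)            // nsw holds for the i8 multiply
        assert(A * B <= 255);      // then nuw holds as well
}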
diff --git a/llvm/lib/BinaryFormat/ELF.cpp b/llvm/lib/BinaryFormat/ELF.cpp
index 8c10ed1a980b..9878f5769087 100644
--- a/llvm/lib/BinaryFormat/ELF.cpp
+++ b/llvm/lib/BinaryFormat/ELF.cpp
@@ -568,12 +568,11 @@ StringRef ELF::convertEMachineToArchName(uint16_t EMachine) {
}
}
-uint8_t ELF::convertOSToOSAbi(StringRef OS) {
- std::string LowerOS = OS.lower();
- return StringSwitch<uint16_t>(LowerOS)
+uint8_t ELF::convertNameToOSABI(StringRef Name) {
+ return StringSwitch<uint16_t>(Name)
.StartsWith("hpux", ELFOSABI_HPUX)
.StartsWith("netbsd", ELFOSABI_NETBSD)
- .StartsWith("linux", ELFOSABI_LINUX)
+ .StartsWith("gnu", ELFOSABI_GNU)
.StartsWith("hurd", ELFOSABI_HURD)
.StartsWith("solaris", ELFOSABI_SOLARIS)
.StartsWith("aix", ELFOSABI_AIX)
@@ -597,14 +596,14 @@ uint8_t ELF::convertOSToOSAbi(StringRef OS) {
.Default(ELFOSABI_NONE);
}
-StringRef ELF::convertOSAbiToOS(uint8_t OSAbi) {
- switch (OSAbi) {
+StringRef ELF::convertOSABIToName(uint8_t OSABI) {
+ switch (OSABI) {
case ELFOSABI_HPUX:
return "hpux";
case ELFOSABI_NETBSD:
return "netbsd";
- case ELFOSABI_LINUX:
- return "linux";
+ case ELFOSABI_GNU:
+ return "gnu";
case ELFOSABI_HURD:
return "hurd";
case ELFOSABI_SOLARIS:
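A round-trip sketch of the renamed helpers, using the spellings introduced by this patch; note the lookup expects a lowercase name and the old "linux" entry now canonicalizes to "gnu":

#include "llvm/BinaryFormat/ELF.h"
#include <cassert>
#include <cstdint>

void osabiRoundTrip() {
  uint8_t OSABI = llvm::ELF::convertNameToOSABI("gnu");
  assert(OSABI == llvm::ELF::ELFOSABI_GNU);
  assert(llvm::ELF::convertOSABIToName(OSABI) == "gnu");
}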
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 8eaf78157550..339a1f1f2f00 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -8270,6 +8270,7 @@ static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI,
IRBuilder<> Builder(Branch);
if (UI->getParent() != Branch->getParent())
UI->moveBefore(Branch);
+ UI->dropPoisonGeneratingFlags();
Value *NewCmp = Builder.CreateCmp(ICmpInst::ICMP_EQ, UI,
ConstantInt::get(UI->getType(), 0));
LLVM_DEBUG(dbgs() << "Converting " << *Cmp << "\n");
@@ -8283,6 +8284,7 @@ static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI,
IRBuilder<> Builder(Branch);
if (UI->getParent() != Branch->getParent())
UI->moveBefore(Branch);
+ UI->dropPoisonGeneratingFlags();
Value *NewCmp = Builder.CreateCmp(Cmp->getPredicate(), UI,
ConstantInt::get(UI->getType(), 0));
LLVM_DEBUG(dbgs() << "Converting " << *Cmp << "\n");
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 031a271de5bd..8573b016d1e5 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -1639,8 +1639,7 @@ bool ComplexDeinterleavingGraph::checkNodes() {
ComplexDeinterleavingGraph::NodePtr
ComplexDeinterleavingGraph::identifyRoot(Instruction *RootI) {
if (auto *Intrinsic = dyn_cast<IntrinsicInst>(RootI)) {
- if (Intrinsic->getIntrinsicID() !=
- Intrinsic::experimental_vector_interleave2)
+ if (Intrinsic->getIntrinsicID() != Intrinsic::vector_interleave2)
return nullptr;
auto *Real = dyn_cast<Instruction>(Intrinsic->getOperand(0));
@@ -1675,7 +1674,7 @@ ComplexDeinterleavingGraph::identifyDeinterleave(Instruction *Real,
Value *FinalValue = nullptr;
if (match(Real, m_ExtractValue<0>(m_Instruction(I))) &&
match(Imag, m_ExtractValue<1>(m_Specific(I))) &&
- match(I, m_Intrinsic<Intrinsic::experimental_vector_deinterleave2>(
+ match(I, m_Intrinsic<Intrinsic::vector_deinterleave2>(
m_Value(FinalValue)))) {
NodePtr PlaceholderNode = prepareCompositeNode(
llvm::ComplexDeinterleavingOperation::Deinterleave, Real, Imag);
@@ -1960,13 +1959,11 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
// Splats that are not constant are interleaved where they are located
Instruction *InsertPoint = (I->comesBefore(R) ? R : I)->getNextNode();
IRBuilder<> IRB(InsertPoint);
- ReplacementNode =
- IRB.CreateIntrinsic(Intrinsic::experimental_vector_interleave2, NewTy,
- {Node->Real, Node->Imag});
+ ReplacementNode = IRB.CreateIntrinsic(Intrinsic::vector_interleave2,
+ NewTy, {Node->Real, Node->Imag});
} else {
- ReplacementNode =
- Builder.CreateIntrinsic(Intrinsic::experimental_vector_interleave2,
- NewTy, {Node->Real, Node->Imag});
+ ReplacementNode = Builder.CreateIntrinsic(
+ Intrinsic::vector_interleave2, NewTy, {Node->Real, Node->Imag});
}
break;
}
@@ -1991,9 +1988,8 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
auto *B = replaceNode(Builder, Node->Operands[1]);
auto *NewMaskTy = VectorType::getDoubleElementsVectorType(
cast<VectorType>(MaskReal->getType()));
- auto *NewMask =
- Builder.CreateIntrinsic(Intrinsic::experimental_vector_interleave2,
- NewMaskTy, {MaskReal, MaskImag});
+ auto *NewMask = Builder.CreateIntrinsic(Intrinsic::vector_interleave2,
+ NewMaskTy, {MaskReal, MaskImag});
ReplacementNode = Builder.CreateSelect(NewMask, A, B);
break;
}
@@ -2021,8 +2017,8 @@ void ComplexDeinterleavingGraph::processReductionOperation(
Value *InitImag = OldPHIImag->getIncomingValueForBlock(Incoming);
IRBuilder<> Builder(Incoming->getTerminator());
- auto *NewInit = Builder.CreateIntrinsic(
- Intrinsic::experimental_vector_interleave2, NewVTy, {InitReal, InitImag});
+ auto *NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy,
+ {InitReal, InitImag});
NewPHI->addIncoming(NewInit, Incoming);
NewPHI->addIncoming(OperationReplacement, BackEdge);
@@ -2034,9 +2030,9 @@ void ComplexDeinterleavingGraph::processReductionOperation(
Builder.SetInsertPoint(
&*FinalReductionReal->getParent()->getFirstInsertionPt());
- auto *Deinterleave = Builder.CreateIntrinsic(
- Intrinsic::experimental_vector_deinterleave2,
- OperationReplacement->getType(), OperationReplacement);
+ auto *Deinterleave = Builder.CreateIntrinsic(Intrinsic::vector_deinterleave2,
+ OperationReplacement->getType(),
+ OperationReplacement);
auto *NewReal = Builder.CreateExtractValue(Deinterleave, (uint64_t)0);
FinalReductionReal->replaceUsesOfWith(Real, NewReal);
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
index 51ab7b6262c6..529e50c8ebe0 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
@@ -694,6 +694,20 @@ unsigned GISelKnownBits::computeNumSignBits(Register R,
const MachineMemOperand *MMO = *MI.memoperands_begin();
return TyBits - MMO->getSizeInBits().getValue();
}
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_OR:
+ case TargetOpcode::G_XOR: {
+ Register Src1 = MI.getOperand(1).getReg();
+ unsigned Src1NumSignBits =
+ computeNumSignBits(Src1, DemandedElts, Depth + 1);
+ if (Src1NumSignBits != 1) {
+ Register Src2 = MI.getOperand(2).getReg();
+ unsigned Src2NumSignBits =
+ computeNumSignBits(Src2, DemandedElts, Depth + 1);
+ FirstAnswer = std::min(Src1NumSignBits, Src2NumSignBits);
+ }
+ break;
+ }
case TargetOpcode::G_TRUNC: {
Register Src = MI.getOperand(1).getReg();
LLT SrcTy = MRI.getType(Src);
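The min() in the new G_AND/G_OR/G_XOR case is justified because a bitwise operation cannot lengthen the run of redundant sign bits beyond the shorter of its inputs' runs. A scalar restatement for spot-checking (illustration only):

#include <cstdint>

// Number of leading bits equal to the sign bit, counting the sign bit.
unsigned numSignBits(int32_t V) {
  unsigned N = 1;
  while (N < 32 && ((V >> (31 - N)) & 1) == ((V >> 31) & 1))
    ++N;
  return N;
}
// Property used above: for any A and B,
//   numSignBits(A & B) >= std::min(numSignBits(A), numSignBits(B)),
// and likewise for | and ^, since the top min(...) bits of the result
// are all copies of one value.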
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index d7b0c9aa1667..e26c6ca3d616 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1804,7 +1804,7 @@ bool IRTranslator::translateTrap(const CallInst &CI,
bool IRTranslator::translateVectorInterleave2Intrinsic(
const CallInst &CI, MachineIRBuilder &MIRBuilder) {
- assert(CI.getIntrinsicID() == Intrinsic::experimental_vector_interleave2 &&
+ assert(CI.getIntrinsicID() == Intrinsic::vector_interleave2 &&
"This function can only be called on the interleave2 intrinsic!");
// Canonicalize interleave2 to G_SHUFFLE_VECTOR (similar to SelectionDAG).
Register Op0 = getOrCreateVReg(*CI.getOperand(0));
@@ -1820,7 +1820,7 @@ bool IRTranslator::translateVectorInterleave2Intrinsic(
bool IRTranslator::translateVectorDeinterleave2Intrinsic(
const CallInst &CI, MachineIRBuilder &MIRBuilder) {
- assert(CI.getIntrinsicID() == Intrinsic::experimental_vector_deinterleave2 &&
+ assert(CI.getIntrinsicID() == Intrinsic::vector_deinterleave2 &&
"This function can only be called on the deinterleave2 intrinsic!");
// Canonicalize deinterleave2 to shuffles that extract sub-vectors (similar to
// SelectionDAG).
@@ -2572,15 +2572,15 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
return true;
}
- case Intrinsic::experimental_vector_interleave2:
- case Intrinsic::experimental_vector_deinterleave2: {
+ case Intrinsic::vector_interleave2:
+ case Intrinsic::vector_deinterleave2: {
// Both intrinsics have at least one operand.
Value *Op0 = CI.getOperand(0);
LLT ResTy = getLLTForType(*Op0->getType(), MIRBuilder.getDataLayout());
if (!ResTy.isFixedVector())
return false;
- if (CI.getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
+ if (CI.getIntrinsicID() == Intrinsic::vector_interleave2)
return translateVectorInterleave2Intrinsic(CI, MIRBuilder);
return translateVectorDeinterleave2Intrinsic(CI, MIRBuilder);
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 8989eabbe6df..8c9065aec7fa 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -535,9 +535,9 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
// At present, we only have intrinsics to represent (de)interleaving
// with a factor of 2.
- if (II->getIntrinsicID() == Intrinsic::experimental_vector_deinterleave2)
+ if (II->getIntrinsicID() == Intrinsic::vector_deinterleave2)
Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
- if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
+ if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
}
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b6d5b309ceb9..4b81185c6e31 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15459,6 +15459,12 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, /*PoisonOnly*/ false))
return N0;
+ // We currently avoid folding freeze over SRA/SRL, due to the problems seen
+ // with (freeze (assert ext)) blocking simplifications of SRA/SRL. See for
+ // example https://reviews.llvm.org/D136529#4120959.
+ if (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)
+ return SDValue();
+
// Fold freeze(op(x, ...)) -> op(freeze(x), ...).
// Try to push freeze through instructions that propagate but don't produce
// poison as far as possible. If an operand of freeze follows three
@@ -15483,7 +15489,7 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
// the future, then this special handling can be removed.
if (N0.getOpcode() == ISD::BUILD_VECTOR) {
SDLoc DL(N0);
- MVT VT = N0.getSimpleValueType();
+ EVT VT = N0.getValueType();
if (llvm::ISD::isBuildVectorAllOnes(N0.getNode()))
return DAG.getAllOnesConstant(DL, VT);
if (llvm::ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index c381870ae5f4..46e54b5366d6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3556,6 +3556,12 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Results.push_back(Expanded);
break;
}
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM: {
+ if (SDValue Expanded = TLI.expandFMINIMUM_FMAXIMUM(Node, DAG))
+ Results.push_back(Expanded);
+ break;
+ }
case ISD::FSIN:
case ISD::FCOS: {
EVT VT = Node->getValueType(0);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 1de43a4f60e3..8f87ee8e0939 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1049,6 +1049,13 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
return;
}
break;
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM:
+ if (SDValue Expanded = TLI.expandFMINIMUM_FMAXIMUM(Node, DAG)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index dde10fd4b8c8..dfbfaa8c894f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5137,6 +5137,16 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
case ISD::FREEZE:
case ISD::CONCAT_VECTORS:
case ISD::INSERT_SUBVECTOR:
+ case ISD::SADDSAT:
+ case ISD::UADDSAT:
+ case ISD::SSUBSAT:
+ case ISD::USUBSAT:
+ case ISD::MULHU:
+ case ISD::MULHS:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX:
case ISD::AND:
case ISD::XOR:
case ISD::ROTL:
@@ -5157,6 +5167,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
case ISD::BUILD_PAIR:
return false;
+ case ISD::SELECT_CC:
case ISD::SETCC: {
// Integer setcc cannot create undef or poison.
if (Op.getOperand(0).getValueType().isInteger())
@@ -5166,7 +5177,8 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
// based on options and flags. The options and flags also cause special
// nonan condition codes to be used. Those condition codes may be preserved
// even if the nonan flag is dropped somewhere.
- ISD::CondCode CCCode = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ unsigned CCOp = Opcode == ISD::SETCC ? 2 : 4;
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(Op.getOperand(CCOp))->get();
if (((unsigned)CCCode & 0x10U))
return true;
@@ -5183,6 +5195,8 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
return false;
case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA:
// If the max shift amount isn't in range, then the shift can create poison.
return !getValidMaximumShiftAmountConstant(Op, DemandedElts);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 0db484a5e06b..5caf868c83a2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7930,19 +7930,19 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ResultVT, Vec, Index));
return;
}
- case Intrinsic::experimental_vector_reverse:
+ case Intrinsic::vector_reverse:
visitVectorReverse(I);
return;
- case Intrinsic::experimental_vector_splice:
+ case Intrinsic::vector_splice:
visitVectorSplice(I);
return;
case Intrinsic::callbr_landingpad:
visitCallBrLandingPad(I);
return;
- case Intrinsic::experimental_vector_interleave2:
+ case Intrinsic::vector_interleave2:
visitVectorInterleave(I);
return;
- case Intrinsic::experimental_vector_deinterleave2:
+ case Intrinsic::vector_deinterleave2:
visitVectorDeinterleave(I);
return;
case Intrinsic::experimental_convergence_anchor:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index c938b3996be3..cdc1227fd572 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8381,6 +8381,64 @@ SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
return SDValue();
}
+SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
+ SelectionDAG &DAG) const {
+ SDLoc DL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ unsigned Opc = N->getOpcode();
+ EVT VT = N->getValueType(0);
+ EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ bool IsMax = Opc == ISD::FMAXIMUM;
+
+ if (VT.isVector() &&
+ isOperationLegalOrCustomOrPromote(Opc, VT.getScalarType()))
+ return SDValue();
+
+ // First, implement the comparison without NaN propagation. If no native fmin
+ // or fmax is available, use a plain select with setcc instead.
+ SDValue MinMax;
+ unsigned CompOpcIeee = IsMax ? ISD::FMAXNUM_IEEE : ISD::FMINNUM_IEEE;
+ unsigned CompOpc = IsMax ? ISD::FMAXNUM : ISD::FMINNUM;
+ if (isOperationLegalOrCustom(CompOpcIeee, VT)) {
+ MinMax = DAG.getNode(CompOpcIeee, DL, VT, LHS, RHS);
+ } else if (isOperationLegalOrCustom(CompOpc, VT)) {
+ MinMax = DAG.getNode(CompOpc, DL, VT, LHS, RHS);
+ } else {
+ // A NaN (if any) will be propagated later, so orderedness doesn't matter.
+ SDValue Compare =
+ DAG.getSetCC(DL, CCVT, LHS, RHS, IsMax ? ISD::SETGT : ISD::SETLT);
+ MinMax = DAG.getSelect(DL, VT, Compare, LHS, RHS);
+ }
+
+ // Propagate a NaN from either operand.
+ if (!N->getFlags().hasNoNaNs() &&
+ (!DAG.isKnownNeverNaN(RHS) || !DAG.isKnownNeverNaN(LHS))) {
+ ConstantFP *FPNaN = ConstantFP::get(
+ *DAG.getContext(), APFloat::getNaN(DAG.EVTToAPFloatSemantics(VT)));
+ MinMax = DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, LHS, RHS, ISD::SETUO),
+ DAG.getConstantFP(*FPNaN, DL, VT), MinMax);
+ }
+
+ // fminimum/fmaximum require -0.0 to be treated as less than +0.0.
+ if (!N->getFlags().hasNoSignedZeros() && !DAG.isKnownNeverZeroFloat(RHS) &&
+ !DAG.isKnownNeverZeroFloat(LHS)) {
+ SDValue IsZero = DAG.getSetCC(DL, CCVT, MinMax,
+ DAG.getConstantFP(0.0, DL, VT), ISD::SETEQ);
+ SDValue TestZero =
+ DAG.getTargetConstant(IsMax ? fcPosZero : fcNegZero, DL, MVT::i32);
+ SDValue LCmp = DAG.getSelect(
+ DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, LHS, TestZero), LHS,
+ MinMax);
+ SDValue RCmp = DAG.getSelect(
+ DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, RHS, TestZero), RHS,
+ LCmp);
+ MinMax = DAG.getSelect(DL, VT, IsZero, RCmp, MinMax);
+ }
+
+ return MinMax;
+}
+
/// Returns a true value if this FPClassTest can be performed with an ordered
/// fcmp to 0, and a false value if it's an unordered fcmp to 0. Returns
/// std::nullopt if it cannot be performed as a compare with 0.
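For orientation, the semantics the expansion above implements, restated as a scalar reference (illustration only, not the lowering itself): a NaN on either input propagates, and -0.0 orders below +0.0.

#include <cmath>
#include <limits>

double fmaximumRef(double A, double B) {
  if (std::isnan(A) || std::isnan(B))   // propagate a NaN from either side
    return std::numeric_limits<double>::quiet_NaN();
  if (A == 0.0 && B == 0.0)             // zeros compare equal, but fmaximum
    return std::signbit(A) ? B : A;     // must treat -0.0 as less than +0.0
  return A > B ? A : B;
}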
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 6dc5b9aae769..5b02b0e94dda 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1092,17 +1092,24 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
break;
case 'e':
if (Name.consume_front("experimental.vector.")) {
- Intrinsic::ID ID = StringSwitch<Intrinsic::ID>(Name)
- .StartsWith("extract.", Intrinsic::vector_extract)
- .StartsWith("insert.", Intrinsic::vector_insert)
- .Default(Intrinsic::not_intrinsic);
+ Intrinsic::ID ID =
+ StringSwitch<Intrinsic::ID>(Name)
+ .StartsWith("extract.", Intrinsic::vector_extract)
+ .StartsWith("insert.", Intrinsic::vector_insert)
+ .StartsWith("splice.", Intrinsic::vector_splice)
+ .StartsWith("reverse.", Intrinsic::vector_reverse)
+ .StartsWith("interleave2.", Intrinsic::vector_interleave2)
+ .StartsWith("deinterleave2.", Intrinsic::vector_deinterleave2)
+ .Default(Intrinsic::not_intrinsic);
if (ID != Intrinsic::not_intrinsic) {
const auto *FT = F->getFunctionType();
SmallVector<Type *, 2> Tys;
- if (ID == Intrinsic::vector_extract)
+ if (ID == Intrinsic::vector_extract ||
+ ID == Intrinsic::vector_interleave2)
// Extracting overloads the return type.
Tys.push_back(FT->getReturnType());
- Tys.push_back(FT->getParamType(0));
+ if (ID != Intrinsic::vector_interleave2)
+ Tys.push_back(FT->getParamType(0));
if (ID == Intrinsic::vector_insert)
// Inserting overloads the inserted type.
Tys.push_back(FT->getParamType(1));
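Old bitcode keeps working because the reader funnels every function through the auto-upgrade hook. A minimal sketch of that flow (the surrounding loop is an assumption; the two called functions are existing LLVM API):

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/AutoUpgrade.h"
#include "llvm/IR/Module.h"

void upgradeModule(llvm::Module &M) {
  // early_inc_range: UpgradeCallsToIntrinsic may erase the old function.
  for (llvm::Function &F : llvm::make_early_inc_range(M.functions()))
    llvm::UpgradeCallsToIntrinsic(&F);
  // e.g. calls to @llvm.experimental.vector.reverse.* now target
  // @llvm.vector.reverse.*
}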
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index d6746d1d4382..9ec5a7deeec6 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -1171,8 +1171,7 @@ Value *IRBuilderBase::CreateVectorReverse(Value *V, const Twine &Name) {
auto *Ty = cast<VectorType>(V->getType());
if (isa<ScalableVectorType>(Ty)) {
Module *M = BB->getParent()->getParent();
- Function *F = Intrinsic::getDeclaration(
- M, Intrinsic::experimental_vector_reverse, Ty);
+ Function *F = Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, Ty);
return Insert(CallInst::Create(F, V), Name);
}
// Keep the original behaviour for fixed vector
@@ -1191,8 +1190,7 @@ Value *IRBuilderBase::CreateVectorSplice(Value *V1, Value *V2, int64_t Imm,
if (auto *VTy = dyn_cast<ScalableVectorType>(V1->getType())) {
Module *M = BB->getParent()->getParent();
- Function *F = Intrinsic::getDeclaration(
- M, Intrinsic::experimental_vector_splice, VTy);
+ Function *F = Intrinsic::getDeclaration(M, Intrinsic::vector_splice, VTy);
Value *Ops[] = {V1, V2, getInt32(Imm)};
return Insert(CallInst::Create(F, Ops), Name);
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index d2babc748731..7ad1ad4cddb7 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -2889,7 +2889,7 @@ bool ShuffleVectorInst::isOneUseSingleSourceMask(int VF) const {
bool ShuffleVectorInst::isInterleave(unsigned Factor) {
FixedVectorType *OpTy = dyn_cast<FixedVectorType>(getOperand(0)->getType());
// shuffle_vector can only interleave fixed length vectors - for scalable
- // vectors, see the @llvm.experimental.vector.interleave2 intrinsic
+ // vectors, see the @llvm.vector.interleave2 intrinsic
if (!OpTy)
return false;
unsigned OpNumElts = OpTy->getNumElements();
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index e42cc7e260ef..430e2ce89f6a 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6019,7 +6019,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
break;
}
- case Intrinsic::experimental_vector_splice: {
+ case Intrinsic::vector_splice: {
VectorType *VecTy = cast<VectorType>(Call.getType());
int64_t Idx = cast<ConstantInt>(Call.getArgOperand(2))->getSExtValue();
int64_t KnownMinNumElements = VecTy->getElementCount().getKnownMinValue();
diff --git a/llvm/lib/MCA/InstrBuilder.cpp b/llvm/lib/MCA/InstrBuilder.cpp
index 1a82e45763a2..2e3ebe3d9073 100644
--- a/llvm/lib/MCA/InstrBuilder.cpp
+++ b/llvm/lib/MCA/InstrBuilder.cpp
@@ -542,8 +542,7 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI,
const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID);
if (SCDesc.NumMicroOps == MCSchedClassDesc::InvalidNumMicroOps) {
return make_error<InstructionError<MCInst>>(
- "found an unsupported instruction in the input assembly sequence.",
- MCI);
+ "found an unsupported instruction in the input assembly sequence", MCI);
}
LLVM_DEBUG(dbgs() << "\n\t\tOpcode Name= " << MCII.getName(Opcode) << '\n');
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 90ba3b541553..594549034cc8 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -963,7 +963,8 @@ PassBuilder::buildInlinerPipeline(OptimizationLevel Level,
MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
RequireAnalysisPass<ShouldNotRunFunctionPassesAnalysis, Function>()));
- MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0));
+ if (!isLTOPreLink(Phase))
+ MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0));
// Make sure we don't affect potential future NoRerun CGSCC adaptors.
MIWP.addLateModulePass(createModuleToFunctionPassAdaptor(
@@ -1005,8 +1006,9 @@ PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level,
buildFunctionSimplificationPipeline(Level, Phase),
PTO.EagerlyInvalidateAnalyses));
- MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
- CoroSplitPass(Level != OptimizationLevel::O0)));
+ if (!isLTOPreLink(Phase))
+ MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
+ CoroSplitPass(Level != OptimizationLevel::O0)));
return MPM;
}
@@ -1183,7 +1185,8 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
// and argument promotion.
MPM.addPass(DeadArgumentEliminationPass());
- MPM.addPass(CoroCleanupPass());
+ if (!isLTOPreLink(Phase))
+ MPM.addPass(CoroCleanupPass());
// Optimize globals now that functions are fully simplified.
MPM.addPass(GlobalOptPass());
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index e1846fcbffee..b61c59aacc0f 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -657,8 +657,8 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
: IndexedInstrProf::ProfVersion::CurrentVersion;
// The WritePrevVersion handling will either need to be removed or updated
// if the version is advanced beyond 12.
- assert(IndexedInstrProf::ProfVersion::CurrentVersion ==
- IndexedInstrProf::ProfVersion::Version12);
+ static_assert(IndexedInstrProf::ProfVersion::CurrentVersion ==
+ IndexedInstrProf::ProfVersion::Version12);
if (static_cast<bool>(ProfileKind & InstrProfKind::IRInstrumentation))
Header.Version |= VARIANT_MASK_IR_PROF;
if (static_cast<bool>(ProfileKind & InstrProfKind::ContextSensitive))
diff --git a/llvm/lib/Support/YAMLTraits.cpp b/llvm/lib/Support/YAMLTraits.cpp
index 4aaf59be2ce5..7bb60894b335 100644
--- a/llvm/lib/Support/YAMLTraits.cpp
+++ b/llvm/lib/Support/YAMLTraits.cpp
@@ -718,40 +718,8 @@ void Output::scalarString(StringRef &S, QuotingType MustQuote) {
outputUpToEndOfLine("''");
return;
}
- if (MustQuote == QuotingType::None) {
- // Only quote if we must.
- outputUpToEndOfLine(S);
- return;
- }
-
- const char *const Quote = MustQuote == QuotingType::Single ? "'" : "\"";
- output(Quote); // Starting quote.
-
- // When using double-quoted strings (and only in that case), non-printable characters may be
- // present, and will be escaped using a variety of unicode-scalar and special short-form
- // escapes. This is handled in yaml::escape.
- if (MustQuote == QuotingType::Double) {
- output(yaml::escape(S, /* EscapePrintable= */ false));
- outputUpToEndOfLine(Quote);
- return;
- }
-
- unsigned i = 0;
- unsigned j = 0;
- unsigned End = S.size();
- const char *Base = S.data();
-
- // When using single-quoted strings, any single quote ' must be doubled to be escaped.
- while (j < End) {
- if (S[j] == '\'') { // Escape quotes.
- output(StringRef(&Base[i], j - i)); // "flush".
- output(StringLiteral("''")); // Print it as ''
- i = j + 1;
- }
- ++j;
- }
- output(StringRef(&Base[i], j - i));
- outputUpToEndOfLine(Quote); // Ending quote.
+ output(S, MustQuote);
+ outputUpToEndOfLine("");
}
void Output::blockScalarString(StringRef &S) {
@@ -801,6 +769,46 @@ void Output::output(StringRef s) {
Out << s;
}
+void Output::output(StringRef S, QuotingType MustQuote) {
+ if (MustQuote == QuotingType::None) {
+ // Only quote if we must.
+ output(S);
+ return;
+ }
+
+ StringLiteral Quote = MustQuote == QuotingType::Single ? StringLiteral("'")
+ : StringLiteral("\"");
+ output(Quote); // Starting quote.
+
+ // When using double-quoted strings (and only in that case), non-printable
+ // characters may be present, and will be escaped using a variety of
+ // unicode-scalar and special short-form escapes. This is handled in
+ // yaml::escape.
+ if (MustQuote == QuotingType::Double) {
+ output(yaml::escape(S, /* EscapePrintable= */ false));
+ output(Quote);
+ return;
+ }
+
+ unsigned i = 0;
+ unsigned j = 0;
+ unsigned End = S.size();
+ const char *Base = S.data();
+
+ // When using single-quoted strings, any single quote ' must be doubled to be
+ // escaped.
+ while (j < End) {
+ if (S[j] == '\'') { // Escape quotes.
+ output(StringRef(&Base[i], j - i)); // "flush".
+ output(StringLiteral("''")); // Print it as ''
+ i = j + 1;
+ }
+ ++j;
+ }
+ output(StringRef(&Base[i], j - i));
+ output(Quote); // Ending quote.
+}
+
void Output::outputUpToEndOfLine(StringRef s) {
output(s);
if (StateStack.empty() || (!inFlowSeqAnyElement(StateStack.back()) &&
@@ -853,7 +861,7 @@ void Output::newLineCheck(bool EmptySequence) {
}
void Output::paddedKey(StringRef key) {
- output(key);
+ output(key, needsQuotes(key, false));
output(":");
const char *spaces = " ";
if (key.size() < strlen(spaces))
@@ -872,7 +880,7 @@ void Output::flowKey(StringRef Key) {
Column = ColumnAtMapFlowStart;
output(" ");
}
- output(Key);
+ output(Key, needsQuotes(Key, false));
output(": ");
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 409ba13b8c6a..cb7930f0cdee 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16398,7 +16398,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
IntrinsicInst *DI, LoadInst *LI) const {
// Only deinterleave2 supported at present.
- if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
+ if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
return false;
// Only a factor of 2 supported at present.
@@ -16473,7 +16473,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
IntrinsicInst *II, StoreInst *SI) const {
// Only interleave2 supported at present.
- if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
+ if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
return false;
// Only a factor of 2 supported at present.
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index f654065c735d..243891249668 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1570,7 +1570,7 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MI.eraseFromParent();
return true;
}
- case Intrinsic::experimental_vector_reverse:
+ case Intrinsic::vector_reverse:
// TODO: Add support for vector_reverse
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 7993b6312111..3124fb23fb0b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -313,11 +313,17 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+ setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
+ setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
+ setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
+ setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
+ setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
+ setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
@@ -325,6 +331,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
+ setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
@@ -334,17 +341,21 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
+ setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
+ setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
+ setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
+ setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index c4ec7a7befd4..510f5bbf2555 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -385,8 +385,8 @@ public:
bool isIdxen() const { return isImmTy(ImmTyIdxen); }
bool isAddr64() const { return isImmTy(ImmTyAddr64); }
bool isOffset() const { return isImmTy(ImmTyOffset); }
- bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<8>(getImm()); }
- bool isOffset1() const { return isImmTy(ImmTyOffset1) && isUInt<8>(getImm()); }
+ bool isOffset0() const { return isImmTy(ImmTyOffset0); }
+ bool isOffset1() const { return isImmTy(ImmTyOffset1); }
bool isSMEMOffsetMod() const { return isImmTy(ImmTySMEMOffsetMod); }
bool isFlatOffset() const { return isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset); }
bool isGDS() const { return isImmTy(ImmTyGDS); }
@@ -411,9 +411,7 @@ public:
bool isOpSelHi() const { return isImmTy(ImmTyOpSelHi); }
bool isNegLo() const { return isImmTy(ImmTyNegLo); }
bool isNegHi() const { return isImmTy(ImmTyNegHi); }
- bool isByteSel() const {
- return isImmTy(ImmTyByteSel) && isUInt<2>(getImm());
- }
+ bool isByteSel() const { return isImmTy(ImmTyByteSel); }
bool isRegOrImm() const {
return isReg() || isImm();
@@ -8939,11 +8937,11 @@ bool AMDGPUOperand::isBLGP() const {
}
bool AMDGPUOperand::isCBSZ() const {
- return isImm() && getImmTy() == ImmTyCBSZ && isUInt<3>(getImm());
+ return isImm() && getImmTy() == ImmTyCBSZ;
}
bool AMDGPUOperand::isABID() const {
- return isImm() && getImmTy() == ImmTyABID && isUInt<4>(getImm());
+ return isImm() && getImmTy() == ImmTyABID;
}
bool AMDGPUOperand::isS16Imm() const {
@@ -9670,25 +9668,17 @@ bool AMDGPUOperand::isEndpgm() const { return isImmTy(ImmTyEndpgm); }
// LDSDIR
//===----------------------------------------------------------------------===//
-bool AMDGPUOperand::isWaitVDST() const {
- return isImmTy(ImmTyWaitVDST) && isUInt<4>(getImm());
-}
+bool AMDGPUOperand::isWaitVDST() const { return isImmTy(ImmTyWaitVDST); }
-bool AMDGPUOperand::isWaitVAVDst() const {
- return isImmTy(ImmTyWaitVAVDst) && isUInt<4>(getImm());
-}
+bool AMDGPUOperand::isWaitVAVDst() const { return isImmTy(ImmTyWaitVAVDst); }
-bool AMDGPUOperand::isWaitVMVSrc() const {
- return isImmTy(ImmTyWaitVMVSrc) && isUInt<1>(getImm());
-}
+bool AMDGPUOperand::isWaitVMVSrc() const { return isImmTy(ImmTyWaitVMVSrc); }
//===----------------------------------------------------------------------===//
// VINTERP
//===----------------------------------------------------------------------===//
-bool AMDGPUOperand::isWaitEXP() const {
- return isImmTy(ImmTyWaitEXP) && isUInt<3>(getImm());
-}
+bool AMDGPUOperand::isWaitEXP() const { return isImmTy(ImmTyWaitEXP); }
//===----------------------------------------------------------------------===//
// Split Barrier
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e20fe1b716b6..76b90042d65f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -461,8 +461,10 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
return true;
}
- if (isMIMG(LdSt)) {
- int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
+ if (isImage(LdSt)) {
+ auto RsrcOpName =
+ isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
+ int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
if (VAddr0Idx >= 0) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index bf6cfe90ebfb..7189e6e40506 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1000,8 +1000,10 @@ def SDWAVopcDst : BoolRC {
}
class NamedIntOperand<ValueType Type, string Prefix, bit Optional = 1,
- string name = NAME, string ConvertMethod = "nullptr">
+ string name = NAME>
: CustomOperand<Type, Optional, name> {
+ string Validator = "[](int64_t V) { return true; }";
+ string ConvertMethod = "[](int64_t &V) { return "#Validator#"(V); }";
let ParserMethod =
"[this](OperandVector &Operands) -> ParseStatus { "#
"return parseIntWithPrefix(\""#Prefix#"\", Operands, "#
@@ -1045,8 +1047,10 @@ class ArrayOperand0<string Id, string Name = NAME>
let ImmTy = "ImmTyOffset" in
def flat_offset : CustomOperand<i32, 1, "FlatOffset">;
def Offset : NamedIntOperand<i32, "offset">;
+let Validator = "isUInt<8>" in {
def Offset0 : NamedIntOperand<i8, "offset0">;
def Offset1 : NamedIntOperand<i8, "offset1">;
+}
def gds : NamedBitOperand<"gds", "GDS">;
@@ -1103,27 +1107,41 @@ let DefaultValue = "0xf" in {
def DppRowMask : NamedIntOperand<i32, "row_mask">;
def DppBankMask : NamedIntOperand<i32, "bank_mask">;
}
-def DppBoundCtrl : NamedIntOperand<i1, "bound_ctrl", 1, "DppBoundCtrl",
- "[this] (int64_t &BC) -> bool { return convertDppBoundCtrl(BC); }">;
+def DppBoundCtrl : NamedIntOperand<i1, "bound_ctrl"> {
+ let ConvertMethod = "[this] (int64_t &BC) -> bool { return convertDppBoundCtrl(BC); }";
+}
let DecoderMethod = "decodeDpp8FI" in
def Dpp8FI : NamedIntOperand<i32, "fi", 1, "DppFI">;
def Dpp16FI : NamedIntOperand<i32, "fi", 1, "DppFI">;
def blgp : CustomOperand<i32, 1, "BLGP">;
-def CBSZ : NamedIntOperand<i32, "cbsz">;
-def ABID : NamedIntOperand<i32, "abid">;
-
+def CBSZ : NamedIntOperand<i32, "cbsz"> {
+ let Validator = "isUInt<3>";
+}
+def ABID : NamedIntOperand<i32, "abid"> {
+ let Validator = "isUInt<4>";
+}
def hwreg : CustomOperand<i32, 0, "Hwreg">;
def exp_tgt : CustomOperand<i32, 0, "ExpTgt">;
-def WaitVDST : NamedIntOperand<i8, "wait_vdst">;
-def WaitEXP : NamedIntOperand<i8, "wait_exp">;
-def WaitVAVDst : NamedIntOperand<i8, "wait_va_vdst">;
-def WaitVMVSrc : NamedIntOperand<i8, "wait_vm_vsrc">;
+def WaitVDST : NamedIntOperand<i8, "wait_vdst"> {
+ let Validator = "isUInt<4>";
+}
+def WaitEXP : NamedIntOperand<i8, "wait_exp"> {
+ let Validator = "isUInt<3>";
+}
+def WaitVAVDst : NamedIntOperand<i8, "wait_va_vdst"> {
+ let Validator = "isUInt<4>";
+}
+def WaitVMVSrc : NamedIntOperand<i8, "wait_vm_vsrc"> {
+ let Validator = "isUInt<1>";
+}
-def ByteSel : NamedIntOperand<i8, "byte_sel">;
+def ByteSel : NamedIntOperand<i8, "byte_sel"> {
+ let Validator = "isUInt<2>";
+}
class KImmFPOperand<ValueType vt> : ImmOperand<vt> {
let OperandNamespace = "AMDGPU";
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index bf4a501cc315..072c5aedc220 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -110,7 +110,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
}
if (!AMDGPU::isGraphics(CC) ||
- ((CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_CS) &&
+ ((CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_Gfx) &&
ST.hasArchitectedSGPRs())) {
if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x"))
WorkGroupIDX = true;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index d0e9f61c0bd1..f67a68acbf23 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1555,15 +1555,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (Subtarget->hasNEON()) {
// vmin and vmax aren't available in a scalar form, so we can use
- // a NEON instruction with an undef lane instead. This has a performance
- // penalty on some cores, so we don't do this unless we have been
- // asked to by the core tuning model.
- if (Subtarget->useNEONForSinglePrecisionFP()) {
- setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
- setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
- setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
- setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
- }
+ // a NEON instruction with an undef lane instead.
+ setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
+ setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index ebd8447eba85..8c9f5c4dc554 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -973,8 +973,7 @@ void BTFDebug::visitMapDefType(const DIType *Ty, uint32_t &TypeId) {
}
/// Read file contents from the actual file or from the source
-std::string BTFDebug::populateFileContent(const DISubprogram *SP) {
- auto File = SP->getFile();
+std::string BTFDebug::populateFileContent(const DIFile *File) {
std::string FileName;
if (!File->getFilename().starts_with("/") && File->getDirectory().size())
@@ -1005,9 +1004,9 @@ std::string BTFDebug::populateFileContent(const DISubprogram *SP) {
return FileName;
}
-void BTFDebug::constructLineInfo(const DISubprogram *SP, MCSymbol *Label,
+void BTFDebug::constructLineInfo(MCSymbol *Label, const DIFile *File,
uint32_t Line, uint32_t Column) {
- std::string FileName = populateFileContent(SP);
+ std::string FileName = populateFileContent(File);
BTFLineInfo LineInfo;
LineInfo.Label = Label;
@@ -1366,10 +1365,10 @@ void BTFDebug::beginInstruction(const MachineInstr *MI) {
if (!CurMI) // no debug info
return;
- // Skip this instruction if no DebugLoc or the DebugLoc
- // is the same as the previous instruction.
+ // Skip this instruction if it has no DebugLoc, if its DebugLoc
+ // is the same as the previous instruction's, or if its line is 0.
const DebugLoc &DL = MI->getDebugLoc();
- if (!DL || PrevInstLoc == DL) {
+ if (!DL || PrevInstLoc == DL || DL.getLine() == 0) {
// This instruction will be skipped, no LineInfo has
// been generated, construct one based on function signature.
if (LineInfoGenerated == false) {
@@ -1377,7 +1376,7 @@ void BTFDebug::beginInstruction(const MachineInstr *MI) {
if (!S)
return;
MCSymbol *FuncLabel = Asm->getFunctionBegin();
- constructLineInfo(S, FuncLabel, S->getLine(), 0);
+ constructLineInfo(FuncLabel, S->getFile(), S->getLine(), 0);
LineInfoGenerated = true;
}
@@ -1389,8 +1388,7 @@ void BTFDebug::beginInstruction(const MachineInstr *MI) {
OS.emitLabel(LineSym);
// Construct the lineinfo.
- auto SP = DL->getScope()->getSubprogram();
- constructLineInfo(SP, LineSym, DL.getLine(), DL.getCol());
+ constructLineInfo(LineSym, DL->getFile(), DL.getLine(), DL.getCol());
LineInfoGenerated = true;
PrevInstLoc = DL;
diff --git a/llvm/lib/Target/BPF/BTFDebug.h b/llvm/lib/Target/BPF/BTFDebug.h
index 7536006ed21c..11a0c59ba6c9 100644
--- a/llvm/lib/Target/BPF/BTFDebug.h
+++ b/llvm/lib/Target/BPF/BTFDebug.h
@@ -343,10 +343,10 @@ class BTFDebug : public DebugHandlerBase {
/// Get the file content for the subprogram. Certain lines of the file
/// later may be put into string table and referenced by line info.
- std::string populateFileContent(const DISubprogram *SP);
+ std::string populateFileContent(const DIFile *File);
/// Construct a line info.
- void constructLineInfo(const DISubprogram *SP, MCSymbol *Label, uint32_t Line,
+ void constructLineInfo(MCSymbol *Label, const DIFile *File, uint32_t Line,
uint32_t Column);
/// Generate types and variables for globals.
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 7ca20190731a..b9e8e1f33d3a 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -656,12 +656,44 @@ DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
ArrayRef<uint8_t> Bytes,
uint64_t Address,
raw_ostream &CS) const {
- // TODO: This will need modification when supporting instruction set
- // extensions with instructions > 32-bits (up to 176 bits wide).
+ // It's a 16-bit instruction if bits 1:0 are not 0b11.
+ if ((Bytes[0] & 0b11) != 0b11)
+ return getInstruction16(MI, Size, Bytes, Address, CS);
- // It's a 32 bit instruction if bit 0 and 1 are 1.
- if ((Bytes[0] & 0x3) == 0x3)
+ // It's a 32-bit instruction if bits 1:0 are 0b11 (checked above) and bits
+ // 4:2 are not 0b111.
+ if ((Bytes[0] & 0b1'1100) != 0b1'1100)
return getInstruction32(MI, Size, Bytes, Address, CS);
- return getInstruction16(MI, Size, Bytes, Address, CS);
+ // 48-bit instructions are encoded as 0bxx011111.
+ if ((Bytes[0] & 0b11'1111) == 0b01'1111) {
+ Size = Bytes.size() >= 6 ? 6 : 0;
+ return MCDisassembler::Fail;
+ }
+
+ // 64-bit instructions are encoded as 0bx0111111.
+ if ((Bytes[0] & 0b111'1111) == 0b011'1111) {
+ Size = Bytes.size() >= 8 ? 8 : 0;
+ return MCDisassembler::Fail;
+ }
+
+ // Remaining cases need to check a second byte.
+ if (Bytes.size() < 2) {
+ Size = 0;
+ return MCDisassembler::Fail;
+ }
+
+ // 80-bit through 176-bit instructions are encoded as 0bxnnnxxxx_x1111111.
+ // Where the number of bits is (80 + (nnn * 16)) for nnn != 0b111.
+ unsigned nnn = (Bytes[1] >> 4) & 0b111;
+ if (nnn != 0b111) {
+ Size = 10 + (nnn * 2);
+ if (Bytes.size() < Size)
+ Size = 0;
+ return MCDisassembler::Fail;
+ }
+
+ // Remaining encodings are reserved for > 176-bit instructions.
+ Size = 0;
+ return MCDisassembler::Fail;
}
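
The ladder above implements the RISC-V variable-length encoding scheme: bits 1:0, then 4:2, then 6:5 of the first byte, and finally bits 6:4 of the second byte select the instruction length. As a standalone sketch (function name invented, not part of the patch), the same rules reduce to a pure function over the first two bytes:

#include <cstddef>
#include <cstdint>

// Returns the RISC-V instruction length in bytes implied by the
// first two bytes, or 0 for reserved (>176-bit) encodings.
// Mirrors the bit tests in RISCVDisassembler::getInstruction above.
size_t riscvInsnLength(uint8_t B0, uint8_t B1) {
  if ((B0 & 0b11) != 0b11)             // bits 1:0 != 0b11 -> compressed
    return 2;
  if ((B0 & 0b1'1100) != 0b1'1100)     // bits 4:2 != 0b111 -> 32-bit
    return 4;
  if ((B0 & 0b11'1111) == 0b01'1111)   // 0bxx011111 -> 48-bit
    return 6;
  if ((B0 & 0b111'1111) == 0b011'1111) // 0bx0111111 -> 64-bit
    return 8;
  unsigned nnn = (B1 >> 4) & 0b111;    // 0bxnnnxxxx_x1111111
  if (nnn != 0b111)
    return 10 + 2 * nnn;               // 80 + (nnn * 16) bits
  return 0;                            // reserved: >176-bit
}

For example, riscvInsnLength(0x3f, 0x00) reports 8 bytes, matching the 64-bit case the disassembler now sizes before returning Fail.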
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 3ab9e7d69105..68f4ec5ef49f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -524,8 +524,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (Subtarget.is64Bit())
setOperationAction(ISD::FPOWI, MVT::i32, Custom);
- if (!Subtarget.hasStdExtZfa())
- setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
+ setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16,
+ Subtarget.hasStdExtZfa() ? Legal : Custom);
}
if (Subtarget.hasStdExtFOrZfinx()) {
@@ -548,10 +548,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
setOperationAction(ISD::FP16_TO_FP, MVT::f32, Custom);
- if (Subtarget.hasStdExtZfa())
+ if (Subtarget.hasStdExtZfa()) {
setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
- else
+ setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
+ } else {
setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Custom);
+ }
}
if (Subtarget.hasStdExtFOrZfinx() && Subtarget.is64Bit())
@@ -566,6 +568,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (Subtarget.hasStdExtZfa()) {
setOperationAction(FPRndMode, MVT::f64, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
+ setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f64, Legal);
} else {
if (Subtarget.is64Bit())
setOperationAction(FPRndMode, MVT::f64, Custom);
@@ -16165,28 +16168,36 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
return performSELECTCombine(N, DAG, Subtarget);
case RISCVISD::CZERO_EQZ:
case RISCVISD::CZERO_NEZ: {
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
- // czero_eq X, (xor Y, 1) -> czero_ne X, Y if Y is 0 or 1.
- // czero_ne X, (xor Y, 1) -> czero_eq X, Y if Y is 0 or 1.
- if (RHS.getOpcode() == ISD::XOR && isOneConstant(RHS.getOperand(1))) {
- SDValue Cond = RHS.getOperand(0);
- APInt Mask = APInt::getBitsSetFrom(Cond.getValueSizeInBits(), 1);
- if (DAG.MaskedValueIsZero(Cond, Mask)) {
- unsigned NewOpc = N->getOpcode() == RISCVISD::CZERO_EQZ
- ? RISCVISD::CZERO_NEZ
- : RISCVISD::CZERO_EQZ;
- return DAG.getNode(NewOpc, SDLoc(N), N->getValueType(0), LHS, Cond);
- }
+ SDValue Val = N->getOperand(0);
+ SDValue Cond = N->getOperand(1);
+
+ unsigned Opc = N->getOpcode();
+
+ // czero_eqz x, x -> x
+ if (Opc == RISCVISD::CZERO_EQZ && Val == Cond)
+ return Val;
+
+ unsigned InvOpc =
+ Opc == RISCVISD::CZERO_EQZ ? RISCVISD::CZERO_NEZ : RISCVISD::CZERO_EQZ;
+
+ // czero_eqz X, (xor Y, 1) -> czero_nez X, Y if Y is 0 or 1.
+ // czero_nez X, (xor Y, 1) -> czero_eqz X, Y if Y is 0 or 1.
+ if (Cond.getOpcode() == ISD::XOR && isOneConstant(Cond.getOperand(1))) {
+ SDValue NewCond = Cond.getOperand(0);
+ APInt Mask = APInt::getBitsSetFrom(NewCond.getValueSizeInBits(), 1);
+ if (DAG.MaskedValueIsZero(NewCond, Mask))
+ return DAG.getNode(InvOpc, SDLoc(N), N->getValueType(0), Val, NewCond);
+ }
+ // czero_eqz x, (setcc y, 0, ne) -> czero_eqz x, y
+ // czero_nez x, (setcc y, 0, ne) -> czero_nez x, y
+ // czero_eqz x, (setcc y, 0, eq) -> czero_nez x, y
+ // czero_nez x, (setcc y, 0, eq) -> czero_eqz x, y
+ if (Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
+ ISD::CondCode CCVal = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ if (ISD::isIntEqualitySetCC(CCVal))
+ return DAG.getNode(CCVal == ISD::SETNE ? Opc : InvOpc, SDLoc(N),
+ N->getValueType(0), Val, Cond.getOperand(0));
}
- // czero_eqz x, (setcc x, 0, ne) -> x
- // czero_nez x, (setcc x, 0, eq) -> x
- if (RHS.getOpcode() == ISD::SETCC && isNullConstant(RHS.getOperand(1)) &&
- cast<CondCodeSDNode>(RHS.getOperand(2))->get() ==
- (N->getOpcode() == RISCVISD::CZERO_EQZ ? ISD::CondCode::SETNE
- : ISD::CondCode::SETEQ) &&
- LHS == RHS.getOperand(0))
- return LHS;
return SDValue();
}
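
These folds are pure algebra on the Zicond semantics: czero_eqz returns its first operand unless the condition is zero, and czero_nez the reverse. A minimal executable model (standalone, names invented) checks each identity used above:

#include <cassert>
#include <cstdint>

// Models of RISCVISD::CZERO_EQZ / CZERO_NEZ.
uint64_t czero_eqz(uint64_t x, uint64_t c) { return c != 0 ? x : 0; }
uint64_t czero_nez(uint64_t x, uint64_t c) { return c == 0 ? x : 0; }

int main() {
  for (uint64_t x : {0ull, 1ull, 42ull}) {
    // czero_eqz x, x -> x: zeroing x when x itself is zero is a no-op.
    assert(czero_eqz(x, x) == x);
    for (uint64_t y : {0ull, 1ull}) {
      // czero_eqz X, (xor Y, 1) -> czero_nez X, Y if Y is 0 or 1.
      assert(czero_eqz(x, y ^ 1) == czero_nez(x, y));
      // czero_eqz x, (setcc y, 0, ne) -> czero_eqz x, y.
      assert(czero_eqz(x, y != 0) == czero_eqz(x, y));
      // czero_eqz x, (setcc y, 0, eq) -> czero_nez x, y.
      assert(czero_eqz(x, y == 0) == czero_nez(x, y));
    }
  }
}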
case RISCVISD::SELECT_CC: {
@@ -21012,7 +21023,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
IRBuilder<> Builder(LI);
// Only deinterleave2 supported at present.
- if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
+ if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
return false;
unsigned Factor = 2;
@@ -21062,7 +21073,7 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
IRBuilder<> Builder(SI);
// Only interleave2 supported at present.
- if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
+ if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
return false;
unsigned Factor = 2;
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index b5fd508fa77d..b27e1dd258eb 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -568,8 +568,8 @@ public:
bool hasSameAVL(const VSETVLIInfo &Other) const {
if (hasAVLReg() && Other.hasAVLReg())
- return getAVLDefMI().isIdenticalTo(Other.getAVLDefMI()) &&
- getAVLReg() == Other.getAVLReg();
+ return AVLRegDef.DefMI == Other.AVLRegDef.DefMI &&
+ AVLRegDef.DefReg == Other.AVLRegDef.DefReg;
if (hasAVLImm() && Other.hasAVLImm())
return getAVLImm() == Other.getAVLImm();
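
The stricter check asks whether both states were defined by literally the same instruction (pointer equality on DefMI), not merely by instructions that look alike via isIdenticalTo: two identical-looking vsetvlis at different program points may still produce different AVLs. A toy model of the distinction, with invented types:

#include <cassert>

struct Insn { int Opcode; int Operand; };

// Structural identity, roughly what isIdenticalTo checks.
bool isIdenticalTo(const Insn &A, const Insn &B) {
  return A.Opcode == B.Opcode && A.Operand == B.Operand;
}

int main() {
  Insn A{7, 32}, B{7, 32}; // two distinct definitions that look the same
  assert(isIdenticalTo(A, B)); // structurally identical...
  assert(&A != &B);            // ...yet not the same definition
}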
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index c0a75e215a40..8e4015783641 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -1168,33 +1168,34 @@ inline int getMemoryOperandNo(uint64_t TSFlags) {
/// \returns true if the register is a XMM.
inline bool isXMMReg(unsigned RegNo) {
- assert(X86::XMM15 - X86::XMM0 == 15 &&
- "XMM0-15 registers are not continuous");
- assert(X86::XMM31 - X86::XMM16 == 15 &&
- "XMM16-31 registers are not continuous");
+ static_assert(X86::XMM15 - X86::XMM0 == 15,
+ "XMM0-15 registers are not continuous");
+ static_assert(X86::XMM31 - X86::XMM16 == 15,
+ "XMM16-31 registers are not continuous");
return (RegNo >= X86::XMM0 && RegNo <= X86::XMM15) ||
(RegNo >= X86::XMM16 && RegNo <= X86::XMM31);
}
/// \returns true if the register is a YMM.
inline bool isYMMReg(unsigned RegNo) {
- assert(X86::YMM15 - X86::YMM0 == 15 &&
- "YMM0-15 registers are not continuous");
- assert(X86::YMM31 - X86::YMM16 == 15 &&
- "YMM16-31 registers are not continuous");
+ static_assert(X86::YMM15 - X86::YMM0 == 15,
+ "YMM0-15 registers are not continuous");
+ static_assert(X86::YMM31 - X86::YMM16 == 15,
+ "YMM16-31 registers are not continuous");
return (RegNo >= X86::YMM0 && RegNo <= X86::YMM15) ||
(RegNo >= X86::YMM16 && RegNo <= X86::YMM31);
}
/// \returns true if the register is a ZMM.
inline bool isZMMReg(unsigned RegNo) {
- assert(X86::ZMM31 - X86::ZMM0 == 31 && "ZMM registers are not continuous");
+ static_assert(X86::ZMM31 - X86::ZMM0 == 31,
+ "ZMM registers are not continuous");
return RegNo >= X86::ZMM0 && RegNo <= X86::ZMM31;
}
/// \returns true if \p RegNo is an apx extended register.
inline bool isApxExtendedReg(unsigned RegNo) {
- assert(X86::R31WH - X86::R16 == 95 && "EGPRs are not continuous");
+ static_assert(X86::R31WH - X86::R16 == 95, "EGPRs are not continuous");
return RegNo >= X86::R16 && RegNo <= X86::R31WH;
}
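
Since the register numbers are compile-time constants, these checks can run at build time; static_assert also survives NDEBUG builds, where a plain assert compiles away. A minimal standalone illustration (names invented):

#include <cassert>

constexpr int First = 100, Last = 115;

// Runtime check: evaluated only in asserts-enabled builds.
bool inRangeChecked(int R) {
  assert(Last - First == 15 && "registers are not continuous");
  return R >= First && R <= Last;
}

// Compile-time check: a violation fails the build in every configuration.
static_assert(Last - First == 15, "registers are not continuous");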
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 82d3859225fd..a811ce43422e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4441,10 +4441,8 @@ static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Expected a 128/256/512-bit vector type");
-
- APInt Ones = APInt::getAllOnes(32);
unsigned NumElts = VT.getSizeInBits() / 32;
- SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
+ SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
return DAG.getBitcast(VT, Vec);
}
@@ -20394,14 +20392,16 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
EVT SrcVT = In.getValueType();
EVT DstSVT = DstVT.getVectorElementType();
EVT SrcSVT = SrcVT.getVectorElementType();
+ unsigned NumDstEltBits = DstSVT.getSizeInBits();
+ unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
// Check we have a truncation suited for PACKSS/PACKUS.
if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
(DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
return SDValue();
- assert(SrcSVT.getSizeInBits() > DstSVT.getSizeInBits() && "Bad truncation");
- unsigned NumStages = Log2_32(SrcSVT.getSizeInBits() / DstSVT.getSizeInBits());
+ assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
+ unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
// Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
// Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
@@ -20422,8 +20422,7 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
if (Subtarget.hasAVX512() && NumStages > 1)
return SDValue();
- unsigned NumSrcEltBits = SrcVT.getScalarSizeInBits();
- unsigned NumPackedSignBits = std::min<unsigned>(DstSVT.getSizeInBits(), 16);
+ unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
// Truncate with PACKUS if we are truncating a vector with leading zero
@@ -20445,7 +20444,7 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
// a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
// see through BITCASTs later on and combines/simplifications can't then use
// it.
- if (DstSVT == MVT::i32 && NumSignBits != SrcSVT.getSizeInBits() &&
+ if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
!Subtarget.hasAVX512())
return SDValue();
@@ -24140,8 +24139,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(1, DL, VT));
else
Neg = CmpOp0;
- SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
- Neg); // -(and (x, 0x1))
+ SDValue Mask = DAG.getNegative(Neg, DL, VT); // -(and (x, 0x1))
SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
}
@@ -28150,9 +28148,8 @@ static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
// ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
SDValue Src = Op.getOperand(0);
- SDValue Sub =
- DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
- return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
+ SDValue Neg = DAG.getNegative(Src, DL, VT);
+ return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
}
if (VT.is256BitVector() && !Subtarget.hasInt256()) {
@@ -29373,10 +29370,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
// +ve/-ve Amt = shift left/right.
if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
VT == MVT::v8i16 || VT == MVT::v16i8)) {
- if (Opc == ISD::SRL || Opc == ISD::SRA) {
- SDValue Zero = DAG.getConstant(0, dl, VT);
- Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
- }
+ if (Opc == ISD::SRL || Opc == ISD::SRA)
+ Amt = DAG.getNegative(Amt, dl, VT);
if (Opc == ISD::SHL || Opc == ISD::SRL)
return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
if (Opc == ISD::SRA)
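
Each of the three hunks above replaces an open-coded 0 - X with SelectionDAG::getNegative. Conceptually the helper is just that subtraction behind one call; a sketch of the equivalence, assuming the usual SelectionDAG headers (this is not the actual implementation):

// What DAG.getNegative(Val, DL, VT) amounts to: centralizing the
// 0 - X idiom so callers stop materializing their own zero constants.
SDValue getNegativeSketch(SelectionDAG &DAG, SDValue Val, const SDLoc &DL,
                          EVT VT) {
  return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Val);
}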
diff --git a/llvm/lib/Target/X86/X86LowerTileCopy.cpp b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
index e7afc49240e5..fd05e16ac1ce 100644
--- a/llvm/lib/Target/X86/X86LowerTileCopy.cpp
+++ b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
@@ -20,6 +20,7 @@
#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -72,10 +73,28 @@ FunctionPass *llvm::createX86LowerTileCopyPass() {
bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
const X86InstrInfo *TII = ST.getInstrInfo();
+ const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+ BitVector GR64Regs =
+ TRI->getAllocatableSet(MF, TRI->getRegClass(X86::GR64RegClassID));
+ BitVector TILERegs =
+ TRI->getAllocatableSet(MF, TRI->getRegClass(X86::TILERegClassID));
bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
+ // There won't be a tile copy if no tile register is live into the block.
+ bool HasTileCopy = false;
+ for (const auto &LI : MBB.liveins()) {
+ if (TILERegs.test(LI.PhysReg)) {
+ HasTileCopy = true;
+ break;
+ }
+ }
+ if (!HasTileCopy)
+ continue;
+ LiveRegUnits UsedRegs(*TRI);
+ UsedRegs.addLiveOuts(MBB);
+ for (MachineInstr &MI : llvm::make_early_inc_range(reverse(MBB))) {
+ UsedRegs.stepBackward(MI);
if (!MI.isCopy())
continue;
MachineOperand &DstMO = MI.getOperand(0);
@@ -85,27 +104,41 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
if (!X86::TILERegClass.contains(DstReg, SrcReg))
continue;
- const TargetRegisterInfo *TRI = ST.getRegisterInfo();
// Allocate stack slot for tile register
unsigned Size = TRI->getSpillSize(X86::TILERegClass);
Align Alignment = TRI->getSpillAlign(X86::TILERegClass);
int TileSS = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
- // Allocate stack slot for stride register
- Size = TRI->getSpillSize(X86::GR64RegClass);
- Alignment = TRI->getSpillAlign(X86::GR64RegClass);
- int StrideSS = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
- // TODO: Pick a killed regiter to avoid save/reload. There is problem
- // to get live interval in this stage.
- Register GR64Cand = X86::RAX;
+ int StrideSS = 0;
+
+ // Pick a killed register to avoid a save/reload.
+ Register GR64Cand = X86::NoRegister;
+ for (auto RegT : GR64Regs.set_bits()) {
+ if (UsedRegs.available(RegT)) {
+ GR64Cand = RegT;
+ break;
+ }
+ }
const DebugLoc &DL = MI.getDebugLoc();
- // mov %rax (%sp)
- BuildMI(MBB, MI, DL, TII->get(X86::IMPLICIT_DEF), GR64Cand);
- addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV64mr)), StrideSS)
- .addReg(GR64Cand);
- // mov 64 %rax
- BuildMI(MBB, MI, DL, TII->get(X86::MOV64ri), GR64Cand).addImm(64);
+ if (GR64Cand) {
+ // mov 64 %reg
+ BuildMI(MBB, MI, DL, TII->get(X86::MOV64ri), GR64Cand).addImm(64);
+ } else {
+ // No available register? Save RAX and reload it after use.
+
+ // Allocate stack slot for stride register
+ Size = TRI->getSpillSize(X86::GR64RegClass);
+ Alignment = TRI->getSpillAlign(X86::GR64RegClass);
+ StrideSS = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
+
+ // mov %rax (%sp)
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV64mr)),
+ StrideSS)
+ .addReg(X86::RAX);
+ // mov 64 %rax
+ BuildMI(MBB, MI, DL, TII->get(X86::MOV64ri), X86::RAX).addImm(64);
+ }
// tilestored %tmm, (%sp, %idx)
#define GET_EGPR_IF_ENABLED(OPC) (ST.hasEGPR() ? OPC##_EVEX : OPC)
unsigned Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
@@ -120,10 +153,12 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
#undef GET_EGPR_IF_ENABLED
NewMI = addFrameReference(BuildMI(MBB, MI, DL, TII->get(Opc), DstReg),
TileSS);
- // restore %rax
- // mov (%sp) %rax
- addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm), GR64Cand),
- StrideSS);
+ if (!GR64Cand) {
+ // restore %rax
+ // mov (%sp) %rax
+ addFrameReference(
+ BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm), X86::RAX), StrideSS);
+ }
MI.eraseFromParent();
Changed = true;
}
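
The rewritten loop walks each block bottom-up so LiveRegUnits can answer "is this register free right before MI?" cheaply: seed the tracker with the block's live-outs, then step backward across each instruction before querying. A condensed sketch of that scan against the real LiveRegUnits API (loop body trimmed to the liveness logic):

LiveRegUnits UsedRegs(*TRI);
UsedRegs.addLiveOuts(MBB);                 // liveness at the block end
for (MachineInstr &MI : llvm::make_early_inc_range(reverse(MBB))) {
  // After stepBackward, UsedRegs describes liveness just before MI,
  // which is exactly where the new instructions are inserted.
  UsedRegs.stepBackward(MI);
  Register Cand = X86::NoRegister;
  for (auto R : GR64Regs.set_bits())
    if (UsedRegs.available(R)) { Cand = R; break; }
  // Cand is a dead GR64 usable without a save/reload, or NoRegister.
}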
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index be0cf1596d0d..555ede9e9540 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -649,10 +649,11 @@ unsigned X86RegisterInfo::getNumSupportedRegs(const MachineFunction &MF) const {
// APX registers (R16-R31)
//
// and try to return the minimum number of registers supported by the target.
- assert((X86::R15WH + 1 == X86 ::YMM0) && (X86::YMM15 + 1 == X86::K0) &&
- (X86::K6_K7 + 1 == X86::TMMCFG) && (X86::TMM7 + 1 == X86::R16) &&
- (X86::R31WH + 1 == X86::NUM_TARGET_REGS) &&
- "Register number may be incorrect");
+ static_assert((X86::R15WH + 1 == X86::YMM0) && (X86::YMM15 + 1 == X86::K0) &&
+ (X86::K6_K7 + 1 == X86::TMMCFG) &&
+ (X86::TMM7 + 1 == X86::R16) &&
+ (X86::R31WH + 1 == X86::NUM_TARGET_REGS),
+ "Register number may be incorrect");
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
if (ST.hasEGPR())
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index a458b5f9ec8f..4d55a084b730 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -244,7 +244,8 @@ public:
// TODO: Currently we're always allowing widening on CPUs without VLX,
// because for many cases we don't have a better option.
bool canExtendTo512DQ() const {
- return hasAVX512() && (!hasVLX() || getPreferVectorWidth() >= 512);
+ return hasAVX512() && hasEVEX512() &&
+ (!hasVLX() || getPreferVectorWidth() >= 512);
}
bool canExtendTo512BW() const {
return hasBWI() && canExtendTo512DQ();
diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp
index 494dc76a1852..20182fb06037 100644
--- a/llvm/lib/TargetParser/RISCVISAInfo.cpp
+++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp
@@ -639,7 +639,8 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension,
switch (Baseline) {
default:
return createStringError(errc::invalid_argument,
- "first letter should be 'e', 'i' or 'g'");
+ "first letter after \'" + Arch.slice(0, 4) +
+ "\' should be 'e', 'i' or 'g'");
case 'e':
case 'i':
break;
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index da714c9a7570..fbb83e787f63 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -306,6 +306,10 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV,
APInt Offset(DL.getIndexTypeSizeInBits(PtrOp->getType()), 0);
PtrOp = PtrOp->stripAndAccumulateConstantOffsets(
DL, Offset, /* AllowNonInbounds */ true);
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(PtrOp)) {
+ if (II->getIntrinsicID() == Intrinsic::threadlocal_address)
+ PtrOp = II->getArgOperand(0);
+ }
if (PtrOp == GV) {
if (auto *Value = ConstantFoldLoadFromConst(Init, Ty, Offset, DL)) {
LI->replaceAllUsesWith(Value);
@@ -318,6 +322,9 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV,
} else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U)) { // memset/cpy/mv
if (getUnderlyingObject(MI->getRawDest()) == GV)
EraseFromParent(MI);
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
+ if (II->getIntrinsicID() == Intrinsic::threadlocal_address)
+ append_range(WorkList, II->users());
}
}
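
Loads and stores of thread-local globals reach the global through the llvm.threadlocal.address intrinsic, so the pointer operand seen above is a call, not the GlobalVariable itself. The shape in IR, and the look-through pattern the hunks add, roughly:

// IR for a TLS global @g (illustrative):
//   %p = call ptr @llvm.threadlocal.address.p0(ptr @g)
//   %v = load i32, ptr %p
// Chasing a pointer back to @g therefore has to peel the wrapper:
if (auto *II = dyn_cast<IntrinsicInst>(PtrOp))
  if (II->getIntrinsicID() == Intrinsic::threadlocal_address)
    PtrOp = II->getArgOperand(0);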
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 0b3a6931e779..6cbd138842c8 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -252,20 +252,21 @@ static cl::opt<unsigned> PrecentMismatchForStalenessError(
static cl::opt<bool> CallsitePrioritizedInline(
"sample-profile-prioritized-inline", cl::Hidden,
-
cl::desc("Use call site prioritized inlining for sample profile loader."
"Currently only CSSPGO is supported."));
static cl::opt<bool> UsePreInlinerDecision(
"sample-profile-use-preinliner", cl::Hidden,
-
cl::desc("Use the preinliner decisions stored in profile context."));
static cl::opt<bool> AllowRecursiveInline(
"sample-profile-recursive-inline", cl::Hidden,
-
cl::desc("Allow sample loader inliner to inline recursive calls."));
+static cl::opt<bool> RemoveProbeAfterProfileAnnotation(
+ "sample-profile-remove-probe", cl::Hidden, cl::init(false),
+ cl::desc("Remove pseudo-probe after sample profile annotation."));
+
static cl::opt<std::string> ProfileInlineReplayFile(
"sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"),
cl::desc(
@@ -518,6 +519,7 @@ protected:
void generateMDProfMetadata(Function &F);
bool rejectHighStalenessProfile(Module &M, ProfileSummaryInfo *PSI,
const SampleProfileMap &Profiles);
+ void removePseudoProbeInsts(Module &M);
/// Map from function name to Function *. Used to find the function from
/// the function name. If the function name contains suffix, additional
@@ -2127,6 +2129,20 @@ bool SampleProfileLoader::rejectHighStalenessProfile(
return false;
}
+void SampleProfileLoader::removePseudoProbeInsts(Module &M) {
+ for (auto &F : M) {
+ std::vector<Instruction *> InstsToDel;
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ if (isa<PseudoProbeInst>(&I))
+ InstsToDel.push_back(&I);
+ }
+ }
+ for (auto *I : InstsToDel)
+ I->eraseFromParent();
+ }
+}
+
bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
ProfileSummaryInfo *_PSI,
LazyCallGraph &CG) {
@@ -2196,6 +2212,9 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
notInlinedCallInfo)
updateProfileCallee(pair.first, pair.second.entryCount);
+ if (RemoveProbeAfterProfileAnnotation && FunctionSamples::ProfileIsProbeBased)
+ removePseudoProbeInsts(M);
+
return retval;
}
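
removePseudoProbeInsts collects the probes first and erases them in a second pass, since erasing while the basic-block range-for is in flight would invalidate its iterator. A standalone model of the pattern (container and predicate invented):

#include <list>
#include <vector>

// Two-phase deletion: never erase from a container mid-iteration.
void eraseNegatives(std::list<int> &Insts) {
  std::vector<std::list<int>::iterator> ToDel;
  for (auto It = Insts.begin(); It != Insts.end(); ++It)
    if (*It < 0)          // stand-in for isa<PseudoProbeInst>
      ToDel.push_back(It);
  for (auto It : ToDel)   // safe: other list iterators stay valid
    Insts.erase(It);
}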
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index e5652458f150..1913ef92c16c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3168,7 +3168,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
}
break;
}
- case Intrinsic::experimental_vector_reverse: {
+ case Intrinsic::vector_reverse: {
Value *BO0, *BO1, *X, *Y;
Value *Vec = II->getArgOperand(0);
if (match(Vec, m_OneUse(m_BinOp(m_Value(BO0), m_Value(BO1))))) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 1064340cb536..f66883de8dd5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -6889,8 +6889,8 @@ static Instruction *foldVectorCmp(CmpInst &Cmp,
if (auto *I = dyn_cast<Instruction>(V))
I->copyIRFlags(&Cmp);
Module *M = Cmp.getModule();
- Function *F = Intrinsic::getDeclaration(
- M, Intrinsic::experimental_vector_reverse, V->getType());
+ Function *F =
+ Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, V->getType());
return CallInst::Create(F, V);
};
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index aafb4cf6ca6a..db7838bbe3c2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -354,8 +354,9 @@ private:
}
bool willNotOverflowUnsignedMul(const Value *LHS, const Value *RHS,
- const Instruction &CxtI) const {
- return computeOverflowForUnsignedMul(LHS, RHS, &CxtI) ==
+ const Instruction &CxtI,
+ bool IsNSW = false) const {
+ return computeOverflowForUnsignedMul(LHS, RHS, &CxtI, IsNSW) ==
OverflowResult::NeverOverflows;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 4ed4c36e21e0..ca1b1921404d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -530,7 +530,7 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
I.setHasNoSignedWrap(true);
}
- if (!HasNUW && willNotOverflowUnsignedMul(Op0, Op1, I)) {
+ if (!HasNUW && willNotOverflowUnsignedMul(Op0, Op1, I, I.hasNoSignedWrap())) {
Changed = true;
I.setHasNoUnsignedWrap(true);
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 117eb7a1dcc9..8818369e7945 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -2537,8 +2537,8 @@ Instruction *InstCombinerImpl::foldVectorSelect(SelectInst &Sel) {
if (auto *I = dyn_cast<Instruction>(V))
I->copyIRFlags(&Sel);
Module *M = Sel.getModule();
- Function *F = Intrinsic::getDeclaration(
- M, Intrinsic::experimental_vector_reverse, V->getType());
+ Function *F =
+ Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, V->getType());
return CallInst::Create(F, V);
};
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 58b2d8e9dec1..7356941be645 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2043,8 +2043,8 @@ Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) {
if (auto *BO = dyn_cast<BinaryOperator>(V))
BO->copyIRFlags(&Inst);
Module *M = Inst.getModule();
- Function *F = Intrinsic::getDeclaration(
- M, Intrinsic::experimental_vector_reverse, V->getType());
+ Function *F =
+ Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, V->getType());
return CallInst::Create(F, V);
};
@@ -2948,6 +2948,14 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
return nullptr;
if (GEP.getNumIndices() == 1) {
+ // We can only preserve inbounds if the original gep is inbounds, the add
+ // is nsw, and the add operands are non-negative.
+ auto CanPreserveInBounds = [&](bool AddIsNSW, Value *Idx1, Value *Idx2) {
+ SimplifyQuery Q = SQ.getWithInstruction(&GEP);
+ return GEP.isInBounds() && AddIsNSW && isKnownNonNegative(Idx1, Q) &&
+ isKnownNonNegative(Idx2, Q);
+ };
+
// Try to replace ADD + GEP with GEP + GEP.
Value *Idx1, *Idx2;
if (match(GEP.getOperand(1),
@@ -2957,10 +2965,15 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// as:
// %newptr = getelementptr i32, ptr %ptr, i64 %idx1
// %newgep = getelementptr i32, ptr %newptr, i64 %idx2
- auto *NewPtr = Builder.CreateGEP(GEP.getResultElementType(),
- GEP.getPointerOperand(), Idx1);
- return GetElementPtrInst::Create(GEP.getResultElementType(), NewPtr,
- Idx2);
+ bool IsInBounds = CanPreserveInBounds(
+ cast<OverflowingBinaryOperator>(GEP.getOperand(1))->hasNoSignedWrap(),
+ Idx1, Idx2);
+ auto *NewPtr =
+ Builder.CreateGEP(GEP.getResultElementType(), GEP.getPointerOperand(),
+ Idx1, "", IsInBounds);
+ return replaceInstUsesWith(
+ GEP, Builder.CreateGEP(GEP.getResultElementType(), NewPtr, Idx2, "",
+ IsInBounds));
}
ConstantInt *C;
if (match(GEP.getOperand(1), m_OneUse(m_SExtLike(m_OneUse(m_NSWAdd(
@@ -2971,12 +2984,17 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// as:
// %newptr = getelementptr i32, ptr %ptr, i32 %idx1
// %newgep = getelementptr i32, ptr %newptr, i32 idx2
+ bool IsInBounds = CanPreserveInBounds(
+ /*AddIsNSW=*/true, Idx1, C);
auto *NewPtr = Builder.CreateGEP(
GEP.getResultElementType(), GEP.getPointerOperand(),
- Builder.CreateSExt(Idx1, GEP.getOperand(1)->getType()));
- return GetElementPtrInst::Create(
- GEP.getResultElementType(), NewPtr,
- Builder.CreateSExt(C, GEP.getOperand(1)->getType()));
+ Builder.CreateSExt(Idx1, GEP.getOperand(1)->getType()), "",
+ IsInBounds);
+ return replaceInstUsesWith(
+ GEP,
+ Builder.CreateGEP(GEP.getResultElementType(), NewPtr,
+ Builder.CreateSExt(C, GEP.getOperand(1)->getType()),
+ "", IsInBounds));
}
}
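
The CanPreserveInBounds guard encodes why the split is only conditionally exact: each intermediate GEP must itself stay inside the object, which follows when the original GEP is inbounds, the add is nsw, and both indices are non-negative. Illustrative IR, following the comment style of the hunk (value names hypothetical):

// Before:
//   %sum = add nsw i64 %i, %j           ; %i, %j known non-negative
//   %gep = getelementptr inbounds i32, ptr %p, i64 %sum
// After, with inbounds preserved on both steps:
//   %tmp = getelementptr inbounds i32, ptr %p, i64 %i
//   %gep = getelementptr inbounds i32, ptr %tmp, i64 %j
// If the add could wrap or an index could be negative, %tmp might
// step outside the object, so the split GEPs would drop inbounds.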
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 4941f92b94f0..51fc28ef90ef 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -1405,13 +1405,13 @@ SplitBlockPredecessorsImpl(BasicBlock *BB, ArrayRef<BasicBlock *> Preds,
if (OldLatch) {
BasicBlock *NewLatch = L->getLoopLatch();
if (NewLatch != OldLatch) {
- MDNode *MD = OldLatch->getTerminator()->getMetadata("llvm.loop");
- NewLatch->getTerminator()->setMetadata("llvm.loop", MD);
+ MDNode *MD = OldLatch->getTerminator()->getMetadata(LLVMContext::MD_loop);
+ NewLatch->getTerminator()->setMetadata(LLVMContext::MD_loop, MD);
// It's still possible that OldLatch is the latch of another inner loop,
// in which case we do not remove the metadata.
Loop *IL = LI->getLoopFor(OldLatch);
if (IL && IL->getLoopLatch() != OldLatch)
- OldLatch->getTerminator()->setMetadata("llvm.loop", nullptr);
+ OldLatch->getTerminator()->setMetadata(LLVMContext::MD_loop, nullptr);
}
}
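
Swapping the string "llvm.loop" for LLVMContext::MD_loop uses the fixed kind IDs that LLVMContext reserves for well-known metadata, skipping the name-to-ID lookup the StringRef overload performs. Both forms below should return the same node (sketch, assuming an Instruction *I):

// String form: resolves "llvm.loop" to a kind ID on each call.
MDNode *ByName = I->getMetadata("llvm.loop");
// Fixed-ID form: direct lookup via the reserved enumerator.
MDNode *ById = I->getMetadata(LLVMContext::MD_loop);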
diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index ed0ed345435c..e97506b4bbd9 100644
--- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -1255,7 +1255,7 @@ static void setRetExtAttr(Function &F,
}
// Modeled after X86TargetLowering::markLibCallAttributes.
-static void markRegisterParameterAttributes(Function *F) {
+void llvm::markRegisterParameterAttributes(Function *F) {
if (!F->arg_size() || F->isVarArg())
return;
diff --git a/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/llvm/lib/Transforms/Utils/GlobalStatus.cpp
index c5aded3c45f4..b177e048faae 100644
--- a/llvm/lib/Transforms/Utils/GlobalStatus.cpp
+++ b/llvm/lib/Transforms/Utils/GlobalStatus.cpp
@@ -172,9 +172,14 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
return true;
GS.StoredType = GlobalStatus::Stored;
} else if (const auto *CB = dyn_cast<CallBase>(I)) {
- if (!CB->isCallee(&U))
- return true;
- GS.IsLoaded = true;
+ if (CB->getIntrinsicID() == Intrinsic::threadlocal_address) {
+ if (analyzeGlobalAux(I, GS, VisitedUsers))
+ return true;
+ } else {
+ if (!CB->isCallee(&U))
+ return true;
+ GS.IsLoaded = true;
+ }
} else {
return true; // Any other non-load instruction might take address!
}
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 73c5d6367822..e3e09d11ba8c 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1930,10 +1930,12 @@ llvm::hasPartialIVCondition(const Loop &L, unsigned MSSAThreshold,
if (!TI || !TI->isConditional())
return {};
- auto *CondI = dyn_cast<CmpInst>(TI->getCondition());
+ auto *CondI = dyn_cast<Instruction>(TI->getCondition());
// The case with the condition outside the loop should already be handled
// earlier.
- if (!CondI || !L.contains(CondI))
+ // Allow CmpInst and TruncInst, as they may be users of load instructions
+ // and so have potential for partial unswitching.
+ if (!CondI || !isa<CmpInst, TruncInst>(CondI) || !L.contains(CondI))
return {};
SmallVector<Instruction *> InstToDuplicate;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e2dd62619b01..c44d90f0998e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -460,9 +460,9 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
// must use intrinsics to interleave.
if (VecTy->isScalableTy()) {
VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
- return Builder.CreateIntrinsic(
- WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
- /*FMFSource=*/nullptr, Name);
+ return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
+ Vals,
+ /*FMFSource=*/nullptr, Name);
}
// Fixed length. Start by concatenating all vectors into a wide vector.
@@ -2517,9 +2517,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
auto *MaskTy =
VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true);
- return Builder.CreateIntrinsic(
- MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
- /*FMFSource=*/nullptr, "interleaved.mask");
+ return Builder.CreateIntrinsic(MaskTy, Intrinsic::vector_interleave2, Ops,
+ /*FMFSource=*/nullptr, "interleaved.mask");
}
if (!BlockInMask)
@@ -2571,7 +2570,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
// Scalable vectors cannot use arbitrary shufflevectors (only splats),
// so must use intrinsics to deinterleave.
Value *DI = Builder.CreateIntrinsic(
- Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
+ Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part],
/*FMFSource=*/nullptr, "strided.vec");
unsigned J = 0;
for (unsigned I = 0; I < InterleaveFactor; ++I) {
@@ -6877,11 +6876,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
// In cases of scalarized and predicated instructions, there will be VF
// predicated blocks in the vectorized loop. Each branch around these
// blocks requires also an extract of its vector compare i1 element.
+ // Note that the conditional branch from the loop latch will be replaced by
+ // a single branch controlling the loop, so there is no extra overhead from
+ // scalarization.
bool ScalarPredicatedBB = false;
BranchInst *BI = cast<BranchInst>(I);
if (VF.isVector() && BI->isConditional() &&
(PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
- PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
+ PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
+ BI->getParent() != TheLoop->getLoopLatch())
ScalarPredicatedBB = true;
if (ScalarPredicatedBB) {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f62270fe62eb..e3a1b0d39a4d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2487,12 +2487,12 @@ private:
/// which exploits values reused across lanes, and arranges the inserts
/// for ease of later optimization.
template <typename BVTy, typename ResTy, typename... Args>
- ResTy processBuildVector(const TreeEntry *E, Args &...Params);
+ ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
/// Create a new vector from a list of scalar values. Produces a sequence
/// which exploits values reused across lanes, and arranges the inserts
/// for ease of later optimization.
- Value *createBuildVector(const TreeEntry *E);
+ Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
/// Returns the instruction in the bundle, which can be used as a base point
/// for scheduling. Usually it is the last instruction in the bundle, except
@@ -2556,7 +2556,8 @@ private:
/// this subtree gets vectorized, we may need to extract the values from the
/// roots. This method calculates the cost of extracting the values.
/// \param ForPoisonSrc true if initial vector is poison, false otherwise.
- InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc) const;
+ InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
+ Type *ScalarTy) const;
/// Set the Builder insert point to one after the last instruction in
/// the bundle
@@ -2564,7 +2565,7 @@ private:
/// \returns a vector from a collection of scalars in \p VL. if \p Root is not
/// specified, the starting vector value is poison.
- Value *gather(ArrayRef<Value *> VL, Value *Root);
+ Value *gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy);
/// \returns whether the VectorizableTree is fully vectorizable and will
/// be beneficial even the tree height is tiny.
@@ -7876,6 +7877,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
bool IsFinalized = false;
SmallVector<int> CommonMask;
SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
+ Type *ScalarTy = nullptr;
const TargetTransformInfo &TTI;
InstructionCost Cost = 0;
SmallDenseSet<Value *> VectorizedVals;
@@ -7905,13 +7907,13 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
return TTI::TCC_Free;
- auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
InstructionCost GatherCost = 0;
SmallVector<Value *> Gathers(VL.begin(), VL.end());
// Improve gather cost for gather of loads, if we can group some of the
// loads into vector loads.
InstructionsState S = getSameOpcode(VL, *R.TLI);
- const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType());
+ const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy);
unsigned MinVF = R.getMinVF(2 * Sz);
if (VL.size() > 2 &&
((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
@@ -7925,7 +7927,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}))) &&
!all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
!isSplat(Gathers)) {
- InstructionCost BaseCost = R.getGatherCost(Gathers, !Root);
+ InstructionCost BaseCost = R.getGatherCost(Gathers, !Root, ScalarTy);
SetVector<Value *> VectorizedLoads;
SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
SmallVector<unsigned> ScatterVectorized;
@@ -8053,7 +8055,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
VecTy, Mask, CostKind);
}
} else {
- GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true);
+ GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true,
+ PointerOps.front()->getType());
}
}
if (NeedInsertSubvectorAnalysis) {
@@ -8087,18 +8090,19 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
transform(VL, ShuffleMask.begin(), [](Value *V) {
return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
});
- InstructionCost InsertCost = TTI.getVectorInstrCost(
- Instruction::InsertElement, VecTy, CostKind, 0,
- PoisonValue::get(VecTy), *It);
- return InsertCost +
- TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy,
- ShuffleMask, CostKind, /*Index=*/0,
- /*SubTp=*/nullptr, /*Args=*/*It);
+ InstructionCost InsertCost =
+ TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
+ PoisonValue::get(VecTy), *It);
+ return InsertCost + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast,
+ VecTy, ShuffleMask, CostKind,
+ /*Index=*/0, /*SubTp=*/nullptr,
+ /*Args=*/*It);
}
return GatherCost +
(all_of(Gathers, IsaPred<UndefValue>)
? TTI::TCC_Free
- : R.getGatherCost(Gathers, !Root && VL.equals(Gathers)));
+ : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
+ ScalarTy));
};
/// Compute the cost of creating a vector containing the extracted values from
@@ -8118,8 +8122,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
return Sz;
return std::max(Sz, VecTy->getNumElements());
});
- unsigned NumSrcRegs = TTI.getNumberOfParts(
- FixedVectorType::get(VL.front()->getType(), NumElts));
+ unsigned NumSrcRegs =
+ TTI.getNumberOfParts(FixedVectorType::get(ScalarTy, NumElts));
if (NumSrcRegs == 0)
NumSrcRegs = 1;
// FIXME: this must be moved to TTI for better estimation.
@@ -8165,17 +8169,16 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
std::optional<TTI::ShuffleKind> RegShuffleKind =
CheckPerRegistersShuffle(SubMask);
if (!RegShuffleKind) {
- Cost += ::getShuffleCost(
- TTI, *ShuffleKinds[Part],
- FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice);
+ Cost += ::getShuffleCost(TTI, *ShuffleKinds[Part],
+ FixedVectorType::get(ScalarTy, NumElts),
+ MaskSlice);
continue;
}
if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
!ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
- Cost += ::getShuffleCost(
- TTI, *RegShuffleKind,
- FixedVectorType::get(VL.front()->getType(), EltsPerVector),
- SubMask);
+ Cost += ::getShuffleCost(TTI, *RegShuffleKind,
+ FixedVectorType::get(ScalarTy, EltsPerVector),
+ SubMask);
}
}
return Cost;
@@ -8292,6 +8295,48 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
SmallVector<int> CommonMask(Mask.begin(), Mask.end());
Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
unsigned CommonVF = Mask.size();
+ InstructionCost ExtraCost = 0;
+ auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
+ unsigned VF) -> InstructionCost {
+ if (E.State == TreeEntry::NeedToGather && allConstant(E.Scalars))
+ return TTI::TCC_Free;
+ Type *EScalarTy = E.Scalars.front()->getType();
+ bool IsSigned = true;
+ if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
+ EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
+ IsSigned = It->second.second;
+ }
+ if (EScalarTy != ScalarTy) {
+ unsigned CastOpcode = Instruction::Trunc;
+ unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
+ unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
+ if (DstSz > SrcSz)
+ CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
+ return TTI.getCastInstrCost(CastOpcode,
+ FixedVectorType::get(ScalarTy, VF),
+ FixedVectorType::get(EScalarTy, VF),
+ TTI::CastContextHint::None, CostKind);
+ }
+ return TTI::TCC_Free;
+ };
+ auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
+ if (isa<Constant>(V))
+ return TTI::TCC_Free;
+ auto *VecTy = cast<VectorType>(V->getType());
+ Type *EScalarTy = VecTy->getElementType();
+ if (EScalarTy != ScalarTy) {
+ bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
+ unsigned CastOpcode = Instruction::Trunc;
+ unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
+ unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
+ if (DstSz > SrcSz)
+ CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
+ return TTI.getCastInstrCost(
+ CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
+ VecTy, TTI::CastContextHint::None, CostKind);
+ }
+ return TTI::TCC_Free;
+ };
if (!V1 && !V2 && !P2.isNull()) {
// Shuffle 2 entry nodes.
const TreeEntry *E = P1.get<const TreeEntry *>();
@@ -8318,11 +8363,14 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
}
CommonVF = E->Scalars.size();
+ ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
+ GetNodeMinBWAffectedCost(*E2, CommonVF);
+ } else {
+ ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
+ GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
}
- V1 = Constant::getNullValue(
- FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
- V2 = getAllOnesValue(
- *R.DL, FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
+ V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
+ V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
} else if (!V1 && P2.isNull()) {
// Shuffle single entry node.
const TreeEntry *E = P1.get<const TreeEntry *>();
@@ -8341,8 +8389,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
CommonVF = E->Scalars.size();
}
- V1 = Constant::getNullValue(
- FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
+ ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
+ V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
// Not identity/broadcast? Try to see if the original vector is better.
if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
CommonVF == CommonMask.size() &&
@@ -8359,6 +8407,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
} else if (V1 && P2.isNull()) {
// Shuffle single vector.
+ ExtraCost += GetValueMinBWAffectedCost(V1);
CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
assert(
all_of(Mask,
@@ -8385,11 +8434,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
CommonVF = VF;
}
- V1 = Constant::getNullValue(
- FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
- V2 = getAllOnesValue(
- *R.DL,
- FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
+ ExtraCost += GetValueMinBWAffectedCost(V1);
+ V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
+ ExtraCost += GetNodeMinBWAffectedCost(
+ *E2, std::min(CommonVF, E2->getVectorFactor()));
+ V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
} else if (!V1 && V2) {
// Shuffle vector and tree node.
unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
@@ -8413,11 +8462,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
CommonVF = VF;
}
- V1 = Constant::getNullValue(
- FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF));
- V2 = getAllOnesValue(
- *R.DL,
- FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF));
+ ExtraCost += GetNodeMinBWAffectedCost(
+ *E1, std::min(CommonVF, E1->getVectorFactor()));
+ V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
+ ExtraCost += GetValueMinBWAffectedCost(V2);
+ V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
} else {
assert(V1 && V2 && "Expected both vectors.");
unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
@@ -8428,30 +8477,33 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
return Idx < 2 * static_cast<int>(CommonVF);
}) &&
"All elements in mask must be less than 2 * CommonVF.");
+ ExtraCost +=
+ GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
if (V1->getType() != V2->getType()) {
- V1 = Constant::getNullValue(FixedVectorType::get(
- cast<FixedVectorType>(V1->getType())->getElementType(), CommonVF));
- V2 = getAllOnesValue(
- *R.DL, FixedVectorType::get(
- cast<FixedVectorType>(V1->getType())->getElementType(),
- CommonVF));
+ V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
+ V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
+ } else {
+ if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
+ V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
+ if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
+ V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
}
}
- InVectors.front() = Constant::getNullValue(FixedVectorType::get(
- cast<FixedVectorType>(V1->getType())->getElementType(),
- CommonMask.size()));
+ InVectors.front() = Constant::getNullValue(
+ FixedVectorType::get(ScalarTy, CommonMask.size()));
if (InVectors.size() == 2)
InVectors.pop_back();
- return BaseShuffleAnalysis::createShuffle<InstructionCost>(
- V1, V2, CommonMask, Builder);
+ return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
+ V1, V2, CommonMask, Builder);
}
public:
- ShuffleCostEstimator(TargetTransformInfo &TTI,
+ ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
SmallPtrSetImpl<Value *> &CheckedExtracts)
- : TTI(TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()),
- R(R), CheckedExtracts(CheckedExtracts) {}
+ : ScalarTy(ScalarTy), TTI(TTI),
+ VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
+ CheckedExtracts(CheckedExtracts) {}
Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
unsigned NumParts, bool &UseVecBaseAsInput) {
@@ -8547,7 +8599,7 @@ public:
if (NumParts != 1 && UniqueBases.size() != 1) {
UseVecBaseAsInput = true;
VecBase = Constant::getNullValue(
- FixedVectorType::get(VL.front()->getType(), CommonMask.size()));
+ FixedVectorType::get(ScalarTy, CommonMask.size()));
}
return VecBase;
}
@@ -8575,8 +8627,7 @@ public:
return;
}
assert(!CommonMask.empty() && "Expected non-empty common mask.");
- auto *MaskVecTy =
- FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
+ auto *MaskVecTy = FixedVectorType::get(ScalarTy, Mask.size());
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
if (NumParts == 0 || NumParts >= Mask.size())
NumParts = 1;
@@ -8593,8 +8644,7 @@ public:
return;
}
assert(!CommonMask.empty() && "Expected non-empty common mask.");
- auto *MaskVecTy =
- FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
+ auto *MaskVecTy = FixedVectorType::get(ScalarTy, Mask.size());
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
if (NumParts == 0 || NumParts >= Mask.size())
NumParts = 1;
@@ -8694,7 +8744,7 @@ public:
return ConstantVector::getSplat(
ElementCount::getFixed(
cast<FixedVectorType>(Root->getType())->getNumElements()),
- getAllOnesValue(*R.DL, VL.front()->getType()));
+ getAllOnesValue(*R.DL, ScalarTy));
}
InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
/// Finalize emission of the shuffles.
@@ -8840,7 +8890,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
if (isa<InsertElementInst>(VL[0]))
return InstructionCost::getInvalid();
return processBuildVector<ShuffleCostEstimator, InstructionCost>(
- E, *TTI, VectorizedVals, *this, CheckedExtracts);
+ E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
}
InstructionCost CommonCost = 0;
SmallVector<int> Mask;
@@ -10880,12 +10930,8 @@ BoUpSLP::isGatherShuffledEntry(
return Res;
}
-InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
- bool ForPoisonSrc) const {
- // Find the type of the operands in VL.
- Type *ScalarTy = VL[0]->getType();
- if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
- ScalarTy = SI->getValueOperand()->getType();
+InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
+ Type *ScalarTy) const {
auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
bool DuplicateNonConst = false;
// Find the cost of inserting/extracting values from the vector.
@@ -10896,6 +10942,11 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost Cost;
auto EstimateInsertCost = [&](unsigned I, Value *V) {
+ if (V->getType() != ScalarTy) {
+ Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
+ TTI::CastContextHint::None, CostKind);
+ V = nullptr;
+ }
if (!ForPoisonSrc)
Cost +=
TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
@@ -11123,7 +11174,7 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
-Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
+Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy) {
// List of instructions/lanes from current block and/or the blocks which are
// part of the current loop. These instructions will be inserted at the end to
// make it possible to optimize loops and hoist invariant instructions out of
@@ -11149,14 +11200,11 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
Type *Ty) {
Value *Scalar = V;
- if (cast<VectorType>(Vec->getType())->getElementType() != Ty) {
- assert(V->getType()->isIntegerTy() && Ty->isIntegerTy() &&
+ if (Scalar->getType() != Ty) {
+ assert(Scalar->getType()->isIntegerTy() && Ty->isIntegerTy() &&
"Expected integer types only.");
- Vec = Builder.CreateIntCast(
- Vec,
- VectorType::get(Ty,
- cast<VectorType>(Vec->getType())->getElementCount()),
- !isKnownNonNegative(Vec, SimplifyQuery(*DL)));
+ Scalar = Builder.CreateIntCast(
+ Scalar, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
}
Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
@@ -11184,10 +11232,7 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
}
return Vec;
};
- Value *Val0 =
- isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
- Type *ScalarTy = Val0->getType();
- FixedVectorType *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
Value *Vec = Root ? Root : PoisonValue::get(VecTy);
SmallVector<int> NonConsts;
// Insert constant values at first.
@@ -11266,6 +11311,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
/// resulting shuffle and the second operand sets to be the newly added
/// operand. The \p CommonMask is transformed in the proper way after that.
SmallVector<Value *, 2> InVectors;
+ Type *ScalarTy = nullptr;
IRBuilderBase &Builder;
BoUpSLP &R;
@@ -11376,9 +11422,20 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
CommonMask[Idx] = Idx;
}
+ /// Cast value \p V to the vector type with the same number of elements, but
+ /// the base type \p ScalarTy.
+ Value *castToScalarTyElem(Value *V) {
+ auto *VecTy = cast<VectorType>(V->getType());
+ if (VecTy->getElementType() == ScalarTy)
+ return V;
+ return Builder.CreateIntCast(
+ V, VectorType::get(ScalarTy, VecTy->getElementCount()),
+ !isKnownNonNegative(V, SimplifyQuery(*R.DL)));
+ }
+
public:
- ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
- : Builder(Builder), R(R) {}
+ ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
+ : ScalarTy(ScalarTy), Builder(Builder), R(R) {}
/// Adjusts extractelements after reusing them.
Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
@@ -11417,8 +11474,10 @@ public:
continue;
R.eraseInstruction(EI);
}
- if (NumParts == 1 || UniqueBases.size() == 1)
+ if (NumParts == 1 || UniqueBases.size() == 1) {
+ VecBase = castToScalarTyElem(VecBase);
return VecBase;
+ }
UseVecBaseAsInput = true;
auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
for (auto [I, Idx] : enumerate(Mask))
@@ -11455,6 +11514,7 @@ public:
"Expected vectors of the same size.");
PrevSize = Size;
#endif // NDEBUG
+ VecOp = castToScalarTyElem(VecOp);
Bases[SubMask[I] < Size ? 0 : 1] = VecOp;
}
if (!Bases.front())
@@ -11510,10 +11570,10 @@ public:
return std::nullopt;
// Postpone gather emission, will be emitted after the end of the
// process to keep correct order.
- auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(),
- E->getVectorFactor());
+ auto *ResVecTy = FixedVectorType::get(ScalarTy, E->getVectorFactor());
return Builder.CreateAlignedLoad(
- VecTy, PoisonValue::get(PointerType::getUnqual(VecTy->getContext())),
+ ResVecTy,
+ PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
MaybeAlign());
}
/// Adds 2 input vectors (in form of tree entries) and the mask for their
@@ -11529,6 +11589,8 @@ public:
/// Adds 2 input vectors and the mask for their shuffling.
void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
+ V1 = castToScalarTyElem(V1);
+ V2 = castToScalarTyElem(V2);
if (InVectors.empty()) {
InVectors.push_back(V1);
InVectors.push_back(V2);
@@ -11556,6 +11618,7 @@ public:
}
/// Adds another one input vector and the mask for the shuffling.
void add(Value *V1, ArrayRef<int> Mask, bool = false) {
+ V1 = castToScalarTyElem(V1);
if (InVectors.empty()) {
if (!isa<FixedVectorType>(V1->getType())) {
V1 = createShuffle(V1, nullptr, CommonMask);
@@ -11619,7 +11682,7 @@ public:
}
Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
Value *Root = nullptr) {
- return R.gather(VL, Root);
+ return R.gather(VL, Root, ScalarTy);
}
Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
/// Finalize emission of the shuffles.
@@ -11719,7 +11782,8 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
}
if (IsSameVE) {
auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
- ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
+ ShuffleInstructionBuilder ShuffleBuilder(
+ cast<VectorType>(V->getType())->getElementType(), Builder, *this);
ShuffleBuilder.add(V, Mask);
return ShuffleBuilder.finalize(std::nullopt);
};
@@ -11794,7 +11858,8 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
}
template <typename BVTy, typename ResTy, typename... Args>
-ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
+ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
+ Args &...Params) {
assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
unsigned VF = E->getVectorFactor();
@@ -11842,7 +11907,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
}
return true;
};
- BVTy ShuffleBuilder(Params...);
+ BVTy ShuffleBuilder(ScalarTy, Params...);
ResTy Res = ResTy();
SmallVector<int> Mask;
SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
@@ -11851,7 +11916,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
bool UseVecBaseAsInput = false;
SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
SmallVector<SmallVector<const TreeEntry *>> Entries;
- Type *ScalarTy = GatheredScalars.front()->getType();
+ Type *OrigScalarTy = GatheredScalars.front()->getType();
auto *VecTy = FixedVectorType::get(ScalarTy, GatheredScalars.size());
unsigned NumParts = TTI->getNumberOfParts(VecTy);
if (NumParts == 0 || NumParts >= GatheredScalars.size())
@@ -11886,7 +11951,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
GatheredScalars.size() != VF) {
Resized = true;
GatheredScalars.append(VF - GatheredScalars.size(),
- PoisonValue::get(ScalarTy));
+ PoisonValue::get(OrigScalarTy));
}
}
}
@@ -11946,12 +12011,12 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
});
}))
GatheredScalars.append(VF - GatheredScalars.size(),
- PoisonValue::get(ScalarTy));
+ PoisonValue::get(OrigScalarTy));
}
// Remove shuffled elements from list of gathers.
for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
if (Mask[I] != PoisonMaskElem)
- GatheredScalars[I] = PoisonValue::get(ScalarTy);
+ GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
}
}
}
@@ -11962,7 +12027,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
// such sequences.
bool IsSplat = IsRootPoison && isSplat(Scalars) &&
(Scalars.size() > 2 || Scalars.front() == Scalars.back());
- Scalars.append(VF - Scalars.size(), PoisonValue::get(ScalarTy));
+ Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
SmallVector<int> UndefPos;
DenseMap<Value *, unsigned> UniquePositions;
// Gather unique non-const values and all constant values.
@@ -11984,7 +12049,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
++NumNonConsts;
SinglePos = I;
Value *OrigV = V;
- Scalars[I] = PoisonValue::get(ScalarTy);
+ Scalars[I] = PoisonValue::get(OrigScalarTy);
if (IsSplat) {
Scalars.front() = OrigV;
ReuseMask[I] = 0;
@@ -12000,7 +12065,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
ReuseMask.assign(VF, PoisonMaskElem);
std::swap(Scalars.front(), Scalars[SinglePos]);
if (!UndefPos.empty() && UndefPos.front() == 0)
- Scalars.front() = UndefValue::get(ScalarTy);
+ Scalars.front() = UndefValue::get(OrigScalarTy);
}
ReuseMask[SinglePos] = SinglePos;
} else if (!UndefPos.empty() && IsSplat) {
@@ -12030,7 +12095,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
// Replace the undef with poison; in the mask it has already been
// replaced by a non-poisoned scalar.
if (I != Pos)
- Scalars[I] = PoisonValue::get(ScalarTy);
+ Scalars[I] = PoisonValue::get(OrigScalarTy);
}
} else {
// Replace undefs with poisons, emit a broadcast and then emit
@@ -12038,7 +12103,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
for (int I : UndefPos) {
ReuseMask[I] = PoisonMaskElem;
if (isa<UndefValue>(Scalars[I]))
- Scalars[I] = PoisonValue::get(ScalarTy);
+ Scalars[I] = PoisonValue::get(OrigScalarTy);
}
NeedFreeze = true;
}
@@ -12093,9 +12158,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
} else {
IsUsedInExpr = false;
- ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get(
- ScalarTy, GatheredScalars.size())),
- ExtractMask, /*ForExtracts=*/true);
+ ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
+ /*ForExtracts=*/true);
}
}
if (!GatherShuffles.empty()) {
@@ -12176,9 +12240,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
// contains only constant to build final vector and then shuffle.
for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
- NonConstants[I] = PoisonValue::get(ScalarTy);
+ NonConstants[I] = PoisonValue::get(OrigScalarTy);
else
- GatheredScalars[I] = PoisonValue::get(ScalarTy);
+ GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
}
// Generate constants for final shuffle and build a mask for them.
if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
@@ -12224,9 +12288,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
return Res;
}
-Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
- return processBuildVector<ShuffleInstructionBuilder, Value *>(E, Builder,
- *this);
+Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
+ return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
+ Builder, *this);
}
Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
@@ -12239,18 +12303,28 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
return E->VectorizedValue;
}
+ Value *V = E->Scalars.front();
+ Type *ScalarTy = V->getType();
+ if (auto *Store = dyn_cast<StoreInst>(V))
+ ScalarTy = Store->getValueOperand()->getType();
+ else if (auto *IE = dyn_cast<InsertElementInst>(V))
+ ScalarTy = IE->getOperand(1)->getType();
+ auto It = MinBWs.find(E);
+ if (It != MinBWs.end())
+ ScalarTy = IntegerType::get(F->getContext(), It->second.first);
+ auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
if (E->State == TreeEntry::NeedToGather) {
// Set insert point for non-reduction initial nodes.
if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
setInsertPointAfterBundle(E);
- Value *Vec = createBuildVector(E);
+ Value *Vec = createBuildVector(E, ScalarTy);
E->VectorizedValue = Vec;
return Vec;
}
bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
- ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
+ ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
if (E->getOpcode() == Instruction::Store) {
ArrayRef<int> Mask =
ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
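
The hunk above hoists the scalar-type computation ahead of the gather path, so gather nodes now see the same demoted type as vectorized nodes. Restated as a free function (a sketch: TreeEntryT and MinBWsT stand in for the internal tree-entry and DenseMap types):

    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Type.h"

    using namespace llvm;

    // Derive a node's scalar type from its first scalar, special-casing
    // stores (type of the stored value) and insertelements (type of the
    // inserted scalar), then let minimal-bitwidth analysis override it.
    template <typename TreeEntryT, typename MinBWsT>
    static Type *getNodeScalarTy(const TreeEntryT *E, const MinBWsT &MinBWs,
                                 LLVMContext &Ctx) {
      Value *V = E->Scalars.front();
      Type *ScalarTy = V->getType();
      if (auto *Store = dyn_cast<StoreInst>(V))
        ScalarTy = Store->getValueOperand()->getType();
      else if (auto *IE = dyn_cast<InsertElementInst>(V))
        ScalarTy = IE->getOperand(1)->getType();
      if (auto It = MinBWs.find(E); It != MinBWs.end())
        ScalarTy = IntegerType::get(Ctx, It->second.first);
      return ScalarTy;
    }
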
@@ -12271,14 +12345,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
Instruction *VL0 = E->getMainOp();
- Type *ScalarTy = VL0->getType();
- if (auto *Store = dyn_cast<StoreInst>(VL0))
- ScalarTy = Store->getValueOperand()->getType();
- else if (auto *IE = dyn_cast<InsertElementInst>(VL0))
- ScalarTy = IE->getOperand(1)->getType();
- auto It = MinBWs.find(E);
- if (It != MinBWs.end())
- ScalarTy = IntegerType::get(F->getContext(), It->second.first);
auto GetOperandSignedness = [&](unsigned Idx) {
const TreeEntry *OpE = getOperandEntry(E, Idx);
bool IsSigned = false;
@@ -12291,7 +12357,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
});
return IsSigned;
};
- auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
switch (ShuffleOrOp) {
case Instruction::PHI: {
assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
@@ -13166,7 +13231,8 @@ Value *BoUpSLP::vectorizeTree(
auto *TE = const_cast<TreeEntry *>(E);
if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
- TE->UserTreeIndices.front().EdgeIdx)))
+ TE->UserTreeIndices.front().EdgeIdx)) &&
+ VecTE->isSame(TE->Scalars))
// Found a gather node that is exactly the same as one of the
// vectorized nodes. This may happen after reordering.
continue;
@@ -13544,7 +13610,8 @@ Value *BoUpSLP::vectorizeTree(
else
CombinedMask2[I] = Mask[I] - VF;
}
- ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
+ ShuffleInstructionBuilder ShuffleBuilder(
+ cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
ShuffleBuilder.add(V1, CombinedMask1);
if (V2)
ShuffleBuilder.add(V2, CombinedMask2);
@@ -14579,13 +14646,27 @@ bool BoUpSLP::collectValuesToDemote(
return false;
bool Res = all_of(
E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
- // Gather demoted constant operands.
- if (Res && E.State == TreeEntry::NeedToGather &&
- all_of(E.Scalars, IsaPred<Constant>))
- ToDemote.push_back(E.Idx);
+ // Demote gathers.
+ if (Res && E.State == TreeEntry::NeedToGather) {
+ // Check the bases of possible extractelement instructions and the
+ // final vector length.
+ SmallPtrSet<Value *, 4> UniqueBases;
+ for (Value *V : E.Scalars) {
+ auto *EE = dyn_cast<ExtractElementInst>(V);
+ if (!EE)
+ continue;
+ UniqueBases.insert(EE->getVectorOperand());
+ }
+ const unsigned VF = E.Scalars.size();
+ Type *OrigScalarTy = E.Scalars.front()->getType();
+ if (UniqueBases.size() <= 2 ||
+ TTI->getNumberOfParts(FixedVectorType::get(OrigScalarTy, VF)) ==
+ TTI->getNumberOfParts(FixedVectorType::get(
+ IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
+ ToDemote.push_back(E.Idx);
+ }
return Res;
};
- // TODO: improve handling of gathered values and others.
if (E.State == TreeEntry::NeedToGather || !Visited.insert(&E).second ||
any_of(E.Scalars, [&](Value *V) {
return all_of(V->users(), [&](User *U) {
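
The rewritten demotion check above replaces the constants-only rule with a register-pressure test. A hedged restatement as a standalone predicate (the helper name is assumed; it uses the public TargetTransformInfo API):

    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    // Allow demoting a gather node either when its extractelement scalars
    // come from at most two distinct source vectors, or when narrowing to
    // BitWidth does not change how many registers ("parts") the vector
    // occupies on the target.
    static bool mayDemoteGather(ArrayRef<Value *> Scalars, unsigned BitWidth,
                                const TargetTransformInfo &TTI) {
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : Scalars)
        if (auto *EE = dyn_cast<ExtractElementInst>(V))
          UniqueBases.insert(EE->getVectorOperand());
      const unsigned VF = Scalars.size();
      Type *OrigScalarTy = Scalars.front()->getType();
      return UniqueBases.size() <= 2 ||
             TTI.getNumberOfParts(FixedVectorType::get(OrigScalarTy, VF)) ==
                 TTI.getNumberOfParts(FixedVectorType::get(
                     IntegerType::get(OrigScalarTy->getContext(), BitWidth),
                     VF));
    }
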
diff --git a/llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll b/llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll
index 836a028ad6aa..f491b086107a 100644
--- a/llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll
@@ -7,58 +7,58 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
define void @vector_reverse() #0{
; CHECK-LABEL: 'vector_reverse'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <32 x i8> @llvm.experimental.vector.reverse.v32i8(<32 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = call <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %4 = call <16 x i16> @llvm.experimental.vector.reverse.v16i16(<16 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %6 = call <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <4 x i64> @llvm.experimental.vector.reverse.v4i64(<4 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = call <16 x half> @llvm.experimental.vector.reverse.v16f16(<16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = call <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = call <8 x float> @llvm.experimental.vector.reverse.v8f32(<8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = call <4 x double> @llvm.experimental.vector.reverse.v4f64(<4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %15 = call <8 x bfloat> @llvm.experimental.vector.reverse.v8bf16(<8 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %16 = call <16 x bfloat> @llvm.experimental.vector.reverse.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <32 x i8> @llvm.vector.reverse.v32i8(<32 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = call <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %4 = call <16 x i16> @llvm.vector.reverse.v16i16(<16 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %6 = call <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <4 x i64> @llvm.vector.reverse.v4i64(<4 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <8 x half> @llvm.vector.reverse.v8f16(<8 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = call <16 x half> @llvm.vector.reverse.v16f16(<16 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = call <8 x float> @llvm.vector.reverse.v8f32(<8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <2 x double> @llvm.vector.reverse.v2f64(<2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = call <4 x double> @llvm.vector.reverse.v4f64(<4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %15 = call <8 x bfloat> @llvm.vector.reverse.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %16 = call <16 x bfloat> @llvm.vector.reverse.v16bf16(<16 x bfloat> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- call <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8> undef)
- call <32 x i8> @llvm.experimental.vector.reverse.v32i8(<32 x i8> undef)
- call <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16> undef)
- call <16 x i16> @llvm.experimental.vector.reverse.v16i16(<16 x i16> undef)
- call <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32> undef)
- call <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32> undef)
- call <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64> undef)
- call <4 x i64> @llvm.experimental.vector.reverse.v4i64(<4 x i64> undef)
- call <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half> undef)
- call <16 x half> @llvm.experimental.vector.reverse.v16f16(<16 x half> undef)
- call <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float> undef)
- call <8 x float> @llvm.experimental.vector.reverse.v8f32(<8 x float> undef)
- call <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double> undef)
- call <4 x double> @llvm.experimental.vector.reverse.v4f64(<4 x double> undef)
- call <8 x bfloat> @llvm.experimental.vector.reverse.v8bf16(<8 x bfloat> undef)
- call <16 x bfloat> @llvm.experimental.vector.reverse.v16bf16(<16 x bfloat> undef)
+ call <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8> undef)
+ call <32 x i8> @llvm.vector.reverse.v32i8(<32 x i8> undef)
+ call <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16> undef)
+ call <16 x i16> @llvm.vector.reverse.v16i16(<16 x i16> undef)
+ call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> undef)
+ call <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32> undef)
+ call <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64> undef)
+ call <4 x i64> @llvm.vector.reverse.v4i64(<4 x i64> undef)
+ call <8 x half> @llvm.vector.reverse.v8f16(<8 x half> undef)
+ call <16 x half> @llvm.vector.reverse.v16f16(<16 x half> undef)
+ call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> undef)
+ call <8 x float> @llvm.vector.reverse.v8f32(<8 x float> undef)
+ call <2 x double> @llvm.vector.reverse.v2f64(<2 x double> undef)
+ call <4 x double> @llvm.vector.reverse.v4f64(<4 x double> undef)
+ call <8 x bfloat> @llvm.vector.reverse.v8bf16(<8 x bfloat> undef)
+ call <16 x bfloat> @llvm.vector.reverse.v16bf16(<16 x bfloat> undef)
ret void
}
attributes #0 = { "target-features"="+sve,+bf16" }
-declare <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8>)
-declare <32 x i8> @llvm.experimental.vector.reverse.v32i8(<32 x i8>)
-declare <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16>)
-declare <16 x i16> @llvm.experimental.vector.reverse.v16i16(<16 x i16>)
-declare <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32>)
-declare <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32>)
-declare <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64>)
-declare <4 x i64> @llvm.experimental.vector.reverse.v4i64(<4 x i64>)
-declare <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half>)
-declare <16 x half> @llvm.experimental.vector.reverse.v16f16(<16 x half>)
-declare <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float>)
-declare <8 x float> @llvm.experimental.vector.reverse.v8f32(<8 x float>)
-declare <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double>)
-declare <4 x double> @llvm.experimental.vector.reverse.v4f64(<4 x double>)
-declare <8 x bfloat> @llvm.experimental.vector.reverse.v8bf16(<8 x bfloat>)
-declare <16 x bfloat> @llvm.experimental.vector.reverse.v16bf16(<16 x bfloat>)
+declare <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8>)
+declare <32 x i8> @llvm.vector.reverse.v32i8(<32 x i8>)
+declare <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16>)
+declare <16 x i16> @llvm.vector.reverse.v16i16(<16 x i16>)
+declare <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32>)
+declare <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32>)
+declare <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64>)
+declare <4 x i64> @llvm.vector.reverse.v4i64(<4 x i64>)
+declare <8 x half> @llvm.vector.reverse.v8f16(<8 x half>)
+declare <16 x half> @llvm.vector.reverse.v16f16(<16 x half>)
+declare <4 x float> @llvm.vector.reverse.v4f32(<4 x float>)
+declare <8 x float> @llvm.vector.reverse.v8f32(<8 x float>)
+declare <2 x double> @llvm.vector.reverse.v2f64(<2 x double>)
+declare <4 x double> @llvm.vector.reverse.v4f64(<4 x double>)
+declare <8 x bfloat> @llvm.vector.reverse.v8bf16(<8 x bfloat>)
+declare <16 x bfloat> @llvm.vector.reverse.v16bf16(<16 x bfloat>)
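
The churn in this test is purely mechanical: llvm.experimental.vector.reverse graduated to llvm.vector.reverse, so every call and declaration picks up the new name. Code that emits the intrinsic through IRBuilder is insulated from the textual rename; a minimal sketch, assuming the stock CreateVectorReverse helper:

    #include "llvm/IR/IRBuilder.h"

    using namespace llvm;

    // Emits a call to the reverse intrinsic under whatever name the headers
    // define (post-rename: @llvm.vector.reverse.<vecty>).
    static Value *reverseVector(IRBuilder<> &Builder, Value *Vec) {
      return Builder.CreateVectorReverse(Vec);
    }
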
diff --git a/llvm/test/Analysis/CostModel/AArch64/splice.ll b/llvm/test/Analysis/CostModel/AArch64/splice.ll
index f5afdff41b1d..1d76a4838cee 100644
--- a/llvm/test/Analysis/CostModel/AArch64/splice.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/splice.ll
@@ -5,96 +5,96 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
define void @vector_splice() #0 {
; CHECK-LABEL: 'vector_splice'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v16i8 = call <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8> zeroinitializer, <16 x i8> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v32i8 = call <32 x i8> @llvm.experimental.vector.splice.v32i8(<32 x i8> zeroinitializer, <32 x i8> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2i16 = call <2 x i16> @llvm.experimental.vector.splice.v2i16(<2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4i16 = call <4 x i16> @llvm.experimental.vector.splice.v4i16(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v8i16 = call <8 x i16> @llvm.experimental.vector.splice.v8i16(<8 x i16> zeroinitializer, <8 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v16i16 = call <16 x i16> @llvm.experimental.vector.splice.v16i16(<16 x i16> zeroinitializer, <16 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4i32 = call <4 x i32> @llvm.experimental.vector.splice.v4i32(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v8i32 = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2i64 = call <2 x i64> @llvm.experimental.vector.splice.v2i64(<2 x i64> zeroinitializer, <2 x i64> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v4i64 = call <4 x i64> @llvm.experimental.vector.splice.v4i64(<4 x i64> zeroinitializer, <4 x i64> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2f16 = call <2 x half> @llvm.experimental.vector.splice.v2f16(<2 x half> zeroinitializer, <2 x half> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4f16 = call <4 x half> @llvm.experimental.vector.splice.v4f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v8f16 = call <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half> zeroinitializer, <8 x half> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v16f16 = call <16 x half> @llvm.experimental.vector.splice.v16f16(<16 x half> zeroinitializer, <16 x half> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2f32 = call <2 x float> @llvm.experimental.vector.splice.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4f32 = call <4 x float> @llvm.experimental.vector.splice.v4f32(<4 x float> zeroinitializer, <4 x float> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v8f32 = call <8 x float> @llvm.experimental.vector.splice.v8f32(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2f64 = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> zeroinitializer, <2 x double> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v4f64 = call <4 x double> @llvm.experimental.vector.splice.v4f64(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2bf16 = call <2 x bfloat> @llvm.experimental.vector.splice.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4bf16 = call <4 x bfloat> @llvm.experimental.vector.splice.v4bf16(<4 x bfloat> zeroinitializer, <4 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v8bf16 = call <8 x bfloat> @llvm.experimental.vector.splice.v8bf16(<8 x bfloat> zeroinitializer, <8 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v16bf16 = call <16 x bfloat> @llvm.experimental.vector.splice.v16bf16(<16 x bfloat> zeroinitializer, <16 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v16i1 = call <16 x i1> @llvm.experimental.vector.splice.v16i1(<16 x i1> zeroinitializer, <16 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v8i1 = call <8 x i1> @llvm.experimental.vector.splice.v8i1(<8 x i1> zeroinitializer, <8 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4i1 = call <4 x i1> @llvm.experimental.vector.splice.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2i1 = call <2 x i1> @llvm.experimental.vector.splice.v2i1(<2 x i1> zeroinitializer, <2 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %splice.v2i128 = call <2 x i128> @llvm.experimental.vector.splice.v2i128(<2 x i128> zeroinitializer, <2 x i128> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v16i8 = call <16 x i8> @llvm.vector.splice.v16i8(<16 x i8> zeroinitializer, <16 x i8> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v32i8 = call <32 x i8> @llvm.vector.splice.v32i8(<32 x i8> zeroinitializer, <32 x i8> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2i16 = call <2 x i16> @llvm.vector.splice.v2i16(<2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4i16 = call <4 x i16> @llvm.vector.splice.v4i16(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v8i16 = call <8 x i16> @llvm.vector.splice.v8i16(<8 x i16> zeroinitializer, <8 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v16i16 = call <16 x i16> @llvm.vector.splice.v16i16(<16 x i16> zeroinitializer, <16 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4i32 = call <4 x i32> @llvm.vector.splice.v4i32(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v8i32 = call <8 x i32> @llvm.vector.splice.v8i32(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2i64 = call <2 x i64> @llvm.vector.splice.v2i64(<2 x i64> zeroinitializer, <2 x i64> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v4i64 = call <4 x i64> @llvm.vector.splice.v4i64(<4 x i64> zeroinitializer, <4 x i64> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2f16 = call <2 x half> @llvm.vector.splice.v2f16(<2 x half> zeroinitializer, <2 x half> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4f16 = call <4 x half> @llvm.vector.splice.v4f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v8f16 = call <8 x half> @llvm.vector.splice.v8f16(<8 x half> zeroinitializer, <8 x half> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v16f16 = call <16 x half> @llvm.vector.splice.v16f16(<16 x half> zeroinitializer, <16 x half> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2f32 = call <2 x float> @llvm.vector.splice.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4f32 = call <4 x float> @llvm.vector.splice.v4f32(<4 x float> zeroinitializer, <4 x float> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v8f32 = call <8 x float> @llvm.vector.splice.v8f32(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2f64 = call <2 x double> @llvm.vector.splice.v2f64(<2 x double> zeroinitializer, <2 x double> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v4f64 = call <4 x double> @llvm.vector.splice.v4f64(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2bf16 = call <2 x bfloat> @llvm.vector.splice.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4bf16 = call <4 x bfloat> @llvm.vector.splice.v4bf16(<4 x bfloat> zeroinitializer, <4 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v8bf16 = call <8 x bfloat> @llvm.vector.splice.v8bf16(<8 x bfloat> zeroinitializer, <8 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v16bf16 = call <16 x bfloat> @llvm.vector.splice.v16bf16(<16 x bfloat> zeroinitializer, <16 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v16i1 = call <16 x i1> @llvm.vector.splice.v16i1(<16 x i1> zeroinitializer, <16 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v8i1 = call <8 x i1> @llvm.vector.splice.v8i1(<8 x i1> zeroinitializer, <8 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4i1 = call <4 x i1> @llvm.vector.splice.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2i1 = call <2 x i1> @llvm.vector.splice.v2i1(<2 x i1> zeroinitializer, <2 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %splice.v2i128 = call <2 x i128> @llvm.vector.splice.v2i128(<2 x i128> zeroinitializer, <2 x i128> zeroinitializer, i32 1)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %splice.v16i8 = call <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8> zeroinitializer, <16 x i8> zeroinitializer, i32 1)
- %splice.v32i8 = call <32 x i8> @llvm.experimental.vector.splice.v32i8(<32 x i8> zeroinitializer, <32 x i8> zeroinitializer, i32 1)
- %splice.v2i16 = call <2 x i16> @llvm.experimental.vector.splice.v2i16(<2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i32 1)
- %splice.v4i16 = call <4 x i16> @llvm.experimental.vector.splice.v4i16(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, i32 1)
- %splice.v8i16 = call <8 x i16> @llvm.experimental.vector.splice.v8i16(<8 x i16> zeroinitializer, <8 x i16> zeroinitializer, i32 1)
- %splice.v16i16 = call <16 x i16> @llvm.experimental.vector.splice.v16i16(<16 x i16> zeroinitializer, <16 x i16> zeroinitializer, i32 1)
- %splice.v4i32 = call <4 x i32> @llvm.experimental.vector.splice.v4i32(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 1)
- %splice.v8i32 = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 1)
- %splice.v2i64 = call <2 x i64> @llvm.experimental.vector.splice.v2i64(<2 x i64> zeroinitializer, <2 x i64> zeroinitializer, i32 1)
- %splice.v4i64 = call <4 x i64> @llvm.experimental.vector.splice.v4i64(<4 x i64> zeroinitializer, <4 x i64> zeroinitializer, i32 1)
- %splice.v2f16 = call <2 x half> @llvm.experimental.vector.splice.v2f16(<2 x half> zeroinitializer, <2 x half> zeroinitializer, i32 1)
- %splice.v4f16 = call <4 x half> @llvm.experimental.vector.splice.v4f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, i32 1)
- %splice.v8f16 = call <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half> zeroinitializer, <8 x half> zeroinitializer, i32 1)
- %splice.v16f16 = call <16 x half> @llvm.experimental.vector.splice.v16f16(<16 x half> zeroinitializer, <16 x half> zeroinitializer, i32 1)
- %splice.v2f32 = call <2 x float> @llvm.experimental.vector.splice.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, i32 1)
- %splice.v4f32 = call <4 x float> @llvm.experimental.vector.splice.v4f32(<4 x float> zeroinitializer, <4 x float> zeroinitializer, i32 1)
- %splice.v8f32 = call <8 x float> @llvm.experimental.vector.splice.v8f32(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i32 1)
- %splice.v2f64 = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> zeroinitializer, <2 x double> zeroinitializer, i32 1)
- %splice.v4f64 = call <4 x double> @llvm.experimental.vector.splice.v4f64(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i32 1)
- %splice.v2bf16 = call <2 x bfloat> @llvm.experimental.vector.splice.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> zeroinitializer, i32 1)
- %splice.v4bf16 = call <4 x bfloat> @llvm.experimental.vector.splice.v4bf16(<4 x bfloat> zeroinitializer, <4 x bfloat> zeroinitializer, i32 1)
- %splice.v8bf16 = call <8 x bfloat> @llvm.experimental.vector.splice.v8bf16(<8 x bfloat> zeroinitializer, <8 x bfloat> zeroinitializer, i32 1)
- %splice.v16bf16 = call <16 x bfloat> @llvm.experimental.vector.splice.v16bf16(<16 x bfloat> zeroinitializer, <16 x bfloat> zeroinitializer, i32 1)
- %splice.v16i1 = call <16 x i1> @llvm.experimental.vector.splice.v16i1(<16 x i1> zeroinitializer, <16 x i1> zeroinitializer, i32 1)
- %splice.v8i1 = call <8 x i1> @llvm.experimental.vector.splice.v8i1(<8 x i1> zeroinitializer, <8 x i1> zeroinitializer, i32 1)
- %splice.v4i1 = call <4 x i1> @llvm.experimental.vector.splice.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer, i32 1)
- %splice.v2i1 = call <2 x i1> @llvm.experimental.vector.splice.v2i1(<2 x i1> zeroinitializer, <2 x i1> zeroinitializer, i32 1)
- %splice.v2i128 = call <2 x i128> @llvm.experimental.vector.splice.v2i128(<2 x i128> zeroinitializer, <2 x i128> zeroinitializer, i32 1)
+ %splice.v16i8 = call <16 x i8> @llvm.vector.splice.v16i8(<16 x i8> zeroinitializer, <16 x i8> zeroinitializer, i32 1)
+ %splice.v32i8 = call <32 x i8> @llvm.vector.splice.v32i8(<32 x i8> zeroinitializer, <32 x i8> zeroinitializer, i32 1)
+ %splice.v2i16 = call <2 x i16> @llvm.vector.splice.v2i16(<2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i32 1)
+ %splice.v4i16 = call <4 x i16> @llvm.vector.splice.v4i16(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, i32 1)
+ %splice.v8i16 = call <8 x i16> @llvm.vector.splice.v8i16(<8 x i16> zeroinitializer, <8 x i16> zeroinitializer, i32 1)
+ %splice.v16i16 = call <16 x i16> @llvm.vector.splice.v16i16(<16 x i16> zeroinitializer, <16 x i16> zeroinitializer, i32 1)
+ %splice.v4i32 = call <4 x i32> @llvm.vector.splice.v4i32(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 1)
+ %splice.v8i32 = call <8 x i32> @llvm.vector.splice.v8i32(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 1)
+ %splice.v2i64 = call <2 x i64> @llvm.vector.splice.v2i64(<2 x i64> zeroinitializer, <2 x i64> zeroinitializer, i32 1)
+ %splice.v4i64 = call <4 x i64> @llvm.vector.splice.v4i64(<4 x i64> zeroinitializer, <4 x i64> zeroinitializer, i32 1)
+ %splice.v2f16 = call <2 x half> @llvm.vector.splice.v2f16(<2 x half> zeroinitializer, <2 x half> zeroinitializer, i32 1)
+ %splice.v4f16 = call <4 x half> @llvm.vector.splice.v4f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, i32 1)
+ %splice.v8f16 = call <8 x half> @llvm.vector.splice.v8f16(<8 x half> zeroinitializer, <8 x half> zeroinitializer, i32 1)
+ %splice.v16f16 = call <16 x half> @llvm.vector.splice.v16f16(<16 x half> zeroinitializer, <16 x half> zeroinitializer, i32 1)
+ %splice.v2f32 = call <2 x float> @llvm.vector.splice.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, i32 1)
+ %splice.v4f32 = call <4 x float> @llvm.vector.splice.v4f32(<4 x float> zeroinitializer, <4 x float> zeroinitializer, i32 1)
+ %splice.v8f32 = call <8 x float> @llvm.vector.splice.v8f32(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i32 1)
+ %splice.v2f64 = call <2 x double> @llvm.vector.splice.v2f64(<2 x double> zeroinitializer, <2 x double> zeroinitializer, i32 1)
+ %splice.v4f64 = call <4 x double> @llvm.vector.splice.v4f64(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i32 1)
+ %splice.v2bf16 = call <2 x bfloat> @llvm.vector.splice.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> zeroinitializer, i32 1)
+ %splice.v4bf16 = call <4 x bfloat> @llvm.vector.splice.v4bf16(<4 x bfloat> zeroinitializer, <4 x bfloat> zeroinitializer, i32 1)
+ %splice.v8bf16 = call <8 x bfloat> @llvm.vector.splice.v8bf16(<8 x bfloat> zeroinitializer, <8 x bfloat> zeroinitializer, i32 1)
+ %splice.v16bf16 = call <16 x bfloat> @llvm.vector.splice.v16bf16(<16 x bfloat> zeroinitializer, <16 x bfloat> zeroinitializer, i32 1)
+ %splice.v16i1 = call <16 x i1> @llvm.vector.splice.v16i1(<16 x i1> zeroinitializer, <16 x i1> zeroinitializer, i32 1)
+ %splice.v8i1 = call <8 x i1> @llvm.vector.splice.v8i1(<8 x i1> zeroinitializer, <8 x i1> zeroinitializer, i32 1)
+ %splice.v4i1 = call <4 x i1> @llvm.vector.splice.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer, i32 1)
+ %splice.v2i1 = call <2 x i1> @llvm.vector.splice.v2i1(<2 x i1> zeroinitializer, <2 x i1> zeroinitializer, i32 1)
+ %splice.v2i128 = call <2 x i128> @llvm.vector.splice.v2i128(<2 x i128> zeroinitializer, <2 x i128> zeroinitializer, i32 1)
ret void
}
-declare <2 x i1> @llvm.experimental.vector.splice.v2i1(<2 x i1>, <2 x i1>, i32)
-declare <4 x i1> @llvm.experimental.vector.splice.v4i1(<4 x i1>, <4 x i1>, i32)
-declare <8 x i1> @llvm.experimental.vector.splice.v8i1(<8 x i1>, <8 x i1>, i32)
-declare <16 x i1> @llvm.experimental.vector.splice.v16i1(<16 x i1>, <16 x i1>, i32)
-declare <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8>, <2 x i8>, i32)
-declare <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8>, <16 x i8>, i32)
-declare <32 x i8> @llvm.experimental.vector.splice.v32i8(<32 x i8>, <32 x i8>, i32)
-declare <2 x i16> @llvm.experimental.vector.splice.v2i16(<2 x i16>, <2 x i16>, i32)
-declare <4 x i16> @llvm.experimental.vector.splice.v4i16(<4 x i16>, <4 x i16>, i32)
-declare <8 x i16> @llvm.experimental.vector.splice.v8i16(<8 x i16>, <8 x i16>, i32)
-declare <16 x i16> @llvm.experimental.vector.splice.v16i16(<16 x i16>, <16 x i16>, i32)
-declare <4 x i32> @llvm.experimental.vector.splice.v4i32(<4 x i32>, <4 x i32>, i32)
-declare <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32>, <8 x i32>, i32)
-declare <2 x i64> @llvm.experimental.vector.splice.v2i64(<2 x i64>, <2 x i64>, i32)
-declare <4 x i64> @llvm.experimental.vector.splice.v4i64(<4 x i64>, <4 x i64>, i32)
-declare <2 x half> @llvm.experimental.vector.splice.v2f16(<2 x half>, <2 x half>, i32)
-declare <4 x half> @llvm.experimental.vector.splice.v4f16(<4 x half>, <4 x half>, i32)
-declare <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half>, <8 x half>, i32)
-declare <16 x half> @llvm.experimental.vector.splice.v16f16(<16 x half>, <16 x half>, i32)
-declare <2 x bfloat> @llvm.experimental.vector.splice.v2bf16(<2 x bfloat>, <2 x bfloat>, i32)
-declare <4 x bfloat> @llvm.experimental.vector.splice.v4bf16(<4 x bfloat>, <4 x bfloat>, i32)
-declare <8 x bfloat> @llvm.experimental.vector.splice.v8bf16(<8 x bfloat>, <8 x bfloat>, i32)
-declare <16 x bfloat> @llvm.experimental.vector.splice.v16bf16(<16 x bfloat>, <16 x bfloat>, i32)
-declare <2 x float> @llvm.experimental.vector.splice.v2f32(<2 x float>, <2 x float>, i32)
-declare <4 x float> @llvm.experimental.vector.splice.v4f32(<4 x float>, <4 x float>, i32)
-declare <8 x float> @llvm.experimental.vector.splice.v8f32(<8 x float>, <8 x float>, i32)
-declare <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float>, <16 x float>, i32)
-declare <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double>, <2 x double>, i32)
-declare <4 x double> @llvm.experimental.vector.splice.v4f64(<4 x double>, <4 x double>, i32)
-declare <2 x i128> @llvm.experimental.vector.splice.v2i128(<2 x i128>, <2 x i128>, i32)
+declare <2 x i1> @llvm.vector.splice.v2i1(<2 x i1>, <2 x i1>, i32)
+declare <4 x i1> @llvm.vector.splice.v4i1(<4 x i1>, <4 x i1>, i32)
+declare <8 x i1> @llvm.vector.splice.v8i1(<8 x i1>, <8 x i1>, i32)
+declare <16 x i1> @llvm.vector.splice.v16i1(<16 x i1>, <16 x i1>, i32)
+declare <2 x i8> @llvm.vector.splice.v2i8(<2 x i8>, <2 x i8>, i32)
+declare <16 x i8> @llvm.vector.splice.v16i8(<16 x i8>, <16 x i8>, i32)
+declare <32 x i8> @llvm.vector.splice.v32i8(<32 x i8>, <32 x i8>, i32)
+declare <2 x i16> @llvm.vector.splice.v2i16(<2 x i16>, <2 x i16>, i32)
+declare <4 x i16> @llvm.vector.splice.v4i16(<4 x i16>, <4 x i16>, i32)
+declare <8 x i16> @llvm.vector.splice.v8i16(<8 x i16>, <8 x i16>, i32)
+declare <16 x i16> @llvm.vector.splice.v16i16(<16 x i16>, <16 x i16>, i32)
+declare <4 x i32> @llvm.vector.splice.v4i32(<4 x i32>, <4 x i32>, i32)
+declare <8 x i32> @llvm.vector.splice.v8i32(<8 x i32>, <8 x i32>, i32)
+declare <2 x i64> @llvm.vector.splice.v2i64(<2 x i64>, <2 x i64>, i32)
+declare <4 x i64> @llvm.vector.splice.v4i64(<4 x i64>, <4 x i64>, i32)
+declare <2 x half> @llvm.vector.splice.v2f16(<2 x half>, <2 x half>, i32)
+declare <4 x half> @llvm.vector.splice.v4f16(<4 x half>, <4 x half>, i32)
+declare <8 x half> @llvm.vector.splice.v8f16(<8 x half>, <8 x half>, i32)
+declare <16 x half> @llvm.vector.splice.v16f16(<16 x half>, <16 x half>, i32)
+declare <2 x bfloat> @llvm.vector.splice.v2bf16(<2 x bfloat>, <2 x bfloat>, i32)
+declare <4 x bfloat> @llvm.vector.splice.v4bf16(<4 x bfloat>, <4 x bfloat>, i32)
+declare <8 x bfloat> @llvm.vector.splice.v8bf16(<8 x bfloat>, <8 x bfloat>, i32)
+declare <16 x bfloat> @llvm.vector.splice.v16bf16(<16 x bfloat>, <16 x bfloat>, i32)
+declare <2 x float> @llvm.vector.splice.v2f32(<2 x float>, <2 x float>, i32)
+declare <4 x float> @llvm.vector.splice.v4f32(<4 x float>, <4 x float>, i32)
+declare <8 x float> @llvm.vector.splice.v8f32(<8 x float>, <8 x float>, i32)
+declare <16 x float> @llvm.vector.splice.v16f32(<16 x float>, <16 x float>, i32)
+declare <2 x double> @llvm.vector.splice.v2f64(<2 x double>, <2 x double>, i32)
+declare <4 x double> @llvm.vector.splice.v4f64(<4 x double>, <4 x double>, i32)
+declare <2 x i128> @llvm.vector.splice.v2i128(<2 x i128>, <2 x i128>, i32)
attributes #0 = { "target-features"="+bf16" }
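
The same mechanical rename applies here for llvm.vector.splice. For the fixed-width cases in this test (trailing index 1), the splice of two VL-element vectors is the window [1, VL] of their concatenation; the sketch below shows the IRBuilder form and, as a cross-check, the equivalent shufflevector (helper names are illustrative):

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/IRBuilder.h"

    using namespace llvm;

    static Value *spliceByOne(IRBuilder<> &Builder, Value *V1, Value *V2) {
      return Builder.CreateVectorSplice(V1, V2, /*Imm=*/1);
    }

    // For fixed vectors this is exactly a shuffle selecting elements
    // 1..VL of concat(V1, V2).
    static Value *spliceByOneAsShuffle(IRBuilder<> &Builder, Value *V1,
                                       Value *V2) {
      unsigned VL = cast<FixedVectorType>(V1->getType())->getNumElements();
      SmallVector<int> Mask;
      for (unsigned I = 1; I <= VL; ++I)
        Mask.push_back(I);
      return Builder.CreateShuffleVector(V1, V2, Mask);
    }
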
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index 7ce3021b0093..15c278b060c9 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -270,122 +270,122 @@ declare <vscale x 4 x i32> @llvm.cttz.nxv4i32(<vscale x 4 x i32>, i1)
define void @vector_reverse() #0 {
; CHECK-LABEL: 'vector_reverse'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.experimental.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.experimental.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.experimental.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.experimental.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; TYPE_BASED_ONLY-LABEL: 'vector_reverse'
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.experimental.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.experimental.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.experimental.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.experimental.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
- %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
- %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
- %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
- %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
- %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
- %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
- %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
- %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
- %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
- %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
- %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
- %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
- %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
- %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
- %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
- %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
- %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
- %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
- %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.experimental.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
- %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.experimental.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
- %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.experimental.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
- %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.experimental.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
- %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
- %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
- %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
- %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+ %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+ %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+ %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
+ %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
+ %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+ %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+ %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+ %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+ %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+ %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+ %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
+ %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
+ %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
+ %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
+ %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
+ %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
+ %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
+ %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
+ %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
+ %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
+ %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
+ %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
+ %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
+ %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+ %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+ %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+ %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
ret void
}
-declare <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8>)
-declare <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8>)
-declare <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16>)
-declare <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16>)
-declare <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32>)
-declare <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64>)
-declare <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half>)
-declare <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half>)
-declare <vscale x 16 x half> @llvm.experimental.vector.reverse.nxv16f16(<vscale x 16 x half>)
-declare <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.experimental.vector.reverse.nxv8f32(<vscale x 8 x float>)
-declare <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double>)
-declare <vscale x 2 x bfloat> @llvm.experimental.vector.reverse.nxv2bf16(<vscale x 2 x bfloat>)
-declare <vscale x 4 x bfloat> @llvm.experimental.vector.reverse.nxv4bf16(<vscale x 4 x bfloat>)
-declare <vscale x 8 x bfloat> @llvm.experimental.vector.reverse.nxv8bf16(<vscale x 8 x bfloat>)
-declare <vscale x 16 x bfloat> @llvm.experimental.vector.reverse.nxv16bf16(<vscale x 16 x bfloat>)
-declare <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1>)
-declare <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1>)
-declare <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1>)
-declare <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1>)
+declare <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8>)
+declare <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8>)
+declare <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16>)
+declare <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16>)
+declare <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16>)
+declare <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16>)
+declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32>)
+declare <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64>)
+declare <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64>)
+declare <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half>)
+declare <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half>)
+declare <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half>)
+declare <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half>)
+declare <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float>)
+declare <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float>)
+declare <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat>)
+declare <vscale x 16 x bfloat> @llvm.vector.reverse.nxv16bf16(<vscale x 16 x bfloat>)
+declare <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1>)
+declare <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1>)
define void @unsupported_fp_ops(<vscale x 4 x float> %vec, i32 %extraarg) {
; CHECK-LABEL: 'unsupported_fp_ops'
@@ -450,236 +450,236 @@ declare <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float>)
define void @vector_splice() #0 {
; CHECK-LABEL: 'vector_splice'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.experimental.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.experimental.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.experimental.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.experimental.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv16i8_neg = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv32i8_neg = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i16_neg = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2i16_neg = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4i16_neg = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8i16_neg = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv16i16_neg = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4i32_neg = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv8i32_neg = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i64_neg = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2i64_neg = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv4i64_neg = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f16_neg = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2f16_neg = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4f16_neg = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8f16_neg = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv16f16_neg = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f32_neg = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2f32_neg = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4f32_neg = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv8f32_neg = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f64_neg = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2f64_neg = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv4f64_neg = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1bf16_neg = call <vscale x 1 x bfloat> @llvm.experimental.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2bf16_neg = call <vscale x 2 x bfloat> @llvm.experimental.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4bf16_neg = call <vscale x 4 x bfloat> @llvm.experimental.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8bf16_neg = call <vscale x 8 x bfloat> @llvm.experimental.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv16bf16_neg = call <vscale x 16 x bfloat> @llvm.experimental.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv16i1_neg = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv8i1_neg = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv4i1_neg = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv2i1_neg = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i1_neg = call <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x i1> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8f16 = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16f16 = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f32 = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4f32 = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8f32 = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f64 = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4f64 = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv16i8_neg = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv32i8_neg = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i16_neg = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2i16_neg = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4i16_neg = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8i16_neg = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv16i16_neg = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4i32_neg = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv8i32_neg = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i64_neg = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2i64_neg = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv4i64_neg = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f16_neg = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2f16_neg = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4f16_neg = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8f16_neg = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv16f16_neg = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f32_neg = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2f32_neg = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4f32_neg = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv8f32_neg = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f64_neg = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2f64_neg = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv4f64_neg = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1bf16_neg = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2bf16_neg = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4bf16_neg = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8bf16_neg = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv16bf16_neg = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv16i1_neg = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv8i1_neg = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv4i1_neg = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv2i1_neg = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i1_neg = call <vscale x 1 x i1> @llvm.vector.splice.nxv1i1(<vscale x 1 x i1> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 -1)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; TYPE_BASED_ONLY-LABEL: 'vector_splice'
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.experimental.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.experimental.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.experimental.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.experimental.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i8_neg = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv32i8_neg = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i16_neg = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i16_neg = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i16_neg = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i16_neg = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i16_neg = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i32_neg = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i32_neg = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i64_neg = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i64_neg = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i64_neg = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f16_neg = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f16_neg = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f16_neg = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8f16_neg = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16f16_neg = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f32_neg = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f32_neg = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f32_neg = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8f32_neg = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f64_neg = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f64_neg = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f64_neg = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1bf16_neg = call <vscale x 1 x bfloat> @llvm.experimental.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2bf16_neg = call <vscale x 2 x bfloat> @llvm.experimental.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4bf16_neg = call <vscale x 4 x bfloat> @llvm.experimental.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8bf16_neg = call <vscale x 8 x bfloat> @llvm.experimental.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16bf16_neg = call <vscale x 16 x bfloat> @llvm.experimental.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i1_neg = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i1_neg = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i1_neg = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i1_neg = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i1_neg = call <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x i1> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8f16 = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16f16 = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f32 = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f32 = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8f32 = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f64 = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f64 = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i8_neg = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv32i8_neg = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i16_neg = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i16_neg = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i16_neg = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i16_neg = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i16_neg = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i32_neg = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i32_neg = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i64_neg = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i64_neg = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i64_neg = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f16_neg = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f16_neg = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f16_neg = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8f16_neg = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16f16_neg = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f32_neg = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f32_neg = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f32_neg = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8f32_neg = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f64_neg = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f64_neg = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f64_neg = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1bf16_neg = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2bf16_neg = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4bf16_neg = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8bf16_neg = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16bf16_neg = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i1_neg = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i1_neg = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i1_neg = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i1_neg = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i1_neg = call <vscale x 1 x i1> @llvm.vector.splice.nxv1i1(<vscale x 1 x i1> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 -1)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
- %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
- %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
- %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
- %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
- %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
- %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
- %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
- %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
- %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
- %splice_nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 1)
- %splice_nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 1)
- %splice_nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 1)
- %splice_nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 1)
- %splice_nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 1)
- %splice_nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 1)
- %splice_nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 1)
- %splice_nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 1)
- %splice_nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 1)
- %splice_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.experimental.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 1)
- %splice_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.experimental.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 1)
- %splice_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.experimental.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 1)
- %splice_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.experimental.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 1)
- %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
- %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
- %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
- %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
+ %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
+ %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
+ %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
+ %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
+ %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
+ %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
+ %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
+ %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
+ %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
+ %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
+ %splice_nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 1)
+ %splice_nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 1)
+ %splice_nxv8f16 = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 1)
+ %splice_nxv16f16 = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 1)
+ %splice_nxv2f32 = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 1)
+ %splice_nxv4f32 = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 1)
+ %splice_nxv8f32 = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 1)
+ %splice_nxv2f64 = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 1)
+ %splice_nxv4f64 = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 1)
+ %splice_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 1)
+ %splice_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 1)
+ %splice_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 1)
+ %splice_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 1)
+ %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
+ %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
+ %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
+ %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
;; Negative index
- %splice_nxv16i8_neg = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
- %splice_nxv32i8_neg = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
- %splice_nxv1i16_neg = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
- %splice_nxv2i16_neg = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
- %splice_nxv4i16_neg = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
- %splice_nxv8i16_neg = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
- %splice_nxv16i16_neg = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
- %splice_nxv4i32_neg = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
- %splice_nxv8i32_neg = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
- %splice_nxv1i64_neg= call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
- %splice_nxv2i64_neg= call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
- %splice_nxv4i64_neg = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
- %splice_nxv1f16_neg = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
- %splice_nxv2f16_neg = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
- %splice_nxv4f16_neg = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
- %splice_nxv8f16_neg = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
- %splice_nxv16f16_neg = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
- %splice_nxv1f32_neg = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
- %splice_nxv2f32_neg = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
- %splice_nxv4f32_neg = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
- %splice_nxv8f32_neg = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
- %splice_nxv1f64_neg = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
- %splice_nxv2f64_neg = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
- %splice_nxv4f64_neg = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
- %splice_nxv1bf16_neg = call <vscale x 1 x bfloat> @llvm.experimental.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
- %splice_nxv2bf16_neg = call <vscale x 2 x bfloat> @llvm.experimental.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
- %splice_nxv4bf16_neg = call <vscale x 4 x bfloat> @llvm.experimental.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
- %splice_nxv8bf16_neg = call <vscale x 8 x bfloat> @llvm.experimental.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
- %splice_nxv16bf16_neg = call <vscale x 16 x bfloat> @llvm.experimental.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
- %splice_nxv16i1_neg = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 -1)
- %splice_nxv8i1_neg = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 -1)
- %splice_nxv4i1_neg = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 -1)
- %splice_nxv2i1_neg = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 -1)
- %splice_nxv1i1_neg = call <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x i1> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 -1)
+ %splice_nxv16i8_neg = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
+ %splice_nxv32i8_neg = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
+ %splice_nxv1i16_neg = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
+ %splice_nxv2i16_neg = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
+ %splice_nxv4i16_neg = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
+ %splice_nxv8i16_neg = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
+ %splice_nxv16i16_neg = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
+ %splice_nxv4i32_neg = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
+ %splice_nxv8i32_neg = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
+  %splice_nxv1i64_neg = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
+  %splice_nxv2i64_neg = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
+ %splice_nxv4i64_neg = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
+ %splice_nxv1f16_neg = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
+ %splice_nxv2f16_neg = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
+ %splice_nxv4f16_neg = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
+ %splice_nxv8f16_neg = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
+ %splice_nxv16f16_neg = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
+ %splice_nxv1f32_neg = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
+ %splice_nxv2f32_neg = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
+ %splice_nxv4f32_neg = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
+ %splice_nxv8f32_neg = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
+ %splice_nxv1f64_neg = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
+ %splice_nxv2f64_neg = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
+ %splice_nxv4f64_neg = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
+ %splice_nxv1bf16_neg = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
+ %splice_nxv2bf16_neg = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
+ %splice_nxv4bf16_neg = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
+ %splice_nxv8bf16_neg = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
+ %splice_nxv16bf16_neg = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
+ %splice_nxv16i1_neg = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 -1)
+ %splice_nxv8i1_neg = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 -1)
+ %splice_nxv4i1_neg = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 -1)
+ %splice_nxv2i1_neg = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 -1)
+ %splice_nxv1i1_neg = call <vscale x 1 x i1> @llvm.vector.splice.nxv1i1(<vscale x 1 x i1> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 -1)
ret void
}
-declare <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x i1>, <vscale x 1 x i1>, i32)
-declare <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
-declare <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
-declare <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
-declare <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
-declare <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, i32)
-declare <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
-declare <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
-declare <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
-declare <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, i32)
-declare <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
-declare <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
-declare <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
-declare <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
-declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
-declare <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
-declare <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, i32)
-declare <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
-declare <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
-declare <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, i32)
-declare <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
-declare <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
-declare <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
-declare <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>, i32)
-declare <vscale x 1 x bfloat> @llvm.experimental.vector.splice.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, i32)
-declare <vscale x 2 x bfloat> @llvm.experimental.vector.splice.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, i32)
-declare <vscale x 4 x bfloat> @llvm.experimental.vector.splice.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, i32)
-declare <vscale x 8 x bfloat> @llvm.experimental.vector.splice.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
-declare <vscale x 16 x bfloat> @llvm.experimental.vector.splice.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>, i32)
-declare <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float>, <vscale x 1 x float>, i32)
-declare <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
-declare <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
-declare <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>, i32)
-declare <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
-declare <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double>, <vscale x 1 x double>, i32)
-declare <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
-declare <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>, i32)
+declare <vscale x 1 x i1> @llvm.vector.splice.nxv1i1(<vscale x 1 x i1>, <vscale x 1 x i1>, i32)
+declare <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
+declare <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, i32)
+declare <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
+declare <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
+declare <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
+declare <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, i32)
+declare <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
+declare <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
+declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+declare <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
+declare <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, i32)
+declare <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
+declare <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
+declare <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, i32)
+declare <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
+declare <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
+declare <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>, i32)
+declare <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, i32)
+declare <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, i32)
+declare <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, i32)
+declare <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+declare <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>, i32)
+declare <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float>, <vscale x 1 x float>, i32)
+declare <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
+declare <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
+declare <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>, i32)
+declare <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
+declare <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double>, <vscale x 1 x double>, i32)
+declare <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
+declare <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>, i32)
define void @get_lane_mask() #0 {
; CHECK-LABEL: 'get_lane_mask'
diff --git a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
index 306277e46fa5..1dde88f366a3 100644
--- a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-vector-bits-min=128 < %s | FileCheck %s
; Check that we don't crash querying costs when vectors are not enabled.
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64
@@ -252,8 +252,8 @@ define i32 @fdiv() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F16 = fdiv <2 x half> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F16 = fdiv <4 x half> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F16 = fdiv <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F16 = fdiv <16 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32F16 = fdiv <32 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fdiv <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = fdiv <32 x half> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fdiv <vscale x 1 x half> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fdiv <vscale x 2 x half> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fdiv <vscale x 4 x half> undef, undef
@@ -263,8 +263,8 @@ define i32 @fdiv() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1F32 = fdiv <1 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = fdiv <2 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = fdiv <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = fdiv <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = fdiv <16 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fdiv <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fdiv <16 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fdiv <vscale x 1 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fdiv <vscale x 2 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F32 = fdiv <vscale x 4 x float> undef, undef
@@ -272,8 +272,8 @@ define i32 @fdiv() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV16F32 = fdiv <vscale x 16 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = fdiv <1 x double> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = fdiv <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = fdiv <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = fdiv <8 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fdiv <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fdiv <8 x double> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fdiv <vscale x 1 x double> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F64 = fdiv <vscale x 2 x double> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F64 = fdiv <vscale x 4 x double> undef, undef
@@ -332,8 +332,8 @@ define i32 @frem() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F16 = frem <2 x half> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F16 = frem <4 x half> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F16 = frem <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16F16 = frem <16 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32F16 = frem <32 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16F16 = frem <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %V32F16 = frem <32 x half> undef, undef
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16 = frem <vscale x 1 x half> undef, undef
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16 = frem <vscale x 2 x half> undef, undef
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16 = frem <vscale x 4 x half> undef, undef
@@ -343,8 +343,8 @@ define i32 @frem() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = frem <1 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = frem <2 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F32 = frem <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F32 = frem <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16F32 = frem <16 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = frem <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16F32 = frem <16 x float> undef, undef
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1F32 = frem <vscale x 1 x float> undef, undef
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2F32 = frem <vscale x 2 x float> undef, undef
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4F32 = frem <vscale x 4 x float> undef, undef
@@ -352,8 +352,8 @@ define i32 @frem() {
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV16F32 = frem <vscale x 16 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = frem <1 x double> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = frem <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = frem <8 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = frem <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F64 = frem <8 x double> undef, undef
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1F64 = frem <vscale x 1 x double> undef, undef
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2F64 = frem <vscale x 2 x double> undef, undef
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4F64 = frem <vscale x 4 x double> undef, undef
@@ -492,8 +492,8 @@ define i32 @fcopysign() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = call <32 x half> @llvm.copysign.v32f16(<32 x half> undef, <32 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32F16 = call <32 x half> @llvm.copysign.v32f16(<32 x half> undef, <32 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = call <vscale x 1 x half> @llvm.copysign.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = call <vscale x 2 x half> @llvm.copysign.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = call <vscale x 4 x half> @llvm.copysign.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef)
@@ -503,8 +503,8 @@ define i32 @fcopysign() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.copysign.v1f32(<1 x float> undef, <1 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.copysign.v2f32(<2 x float> undef, <2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = call <vscale x 1 x float> @llvm.copysign.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = call <vscale x 2 x float> @llvm.copysign.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F32 = call <vscale x 4 x float> @llvm.copysign.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef)
@@ -512,8 +512,8 @@ define i32 @fcopysign() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV16F32 = call <vscale x 16 x float> @llvm.copysign.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.copysign.v1f64(<1 x double> undef, <1 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = call <vscale x 1 x double> @llvm.copysign.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F64 = call <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F64 = call <vscale x 4 x double> @llvm.copysign.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef)
@@ -572,8 +572,8 @@ define i32 @fma() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = call <8 x half> @llvm.fma.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = call <32 x half> @llvm.fma.v32f16(<32 x half> undef, <32 x half> undef, <32 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32F16 = call <32 x half> @llvm.fma.v32f16(<32 x half> undef, <32 x half> undef, <32 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = call <vscale x 1 x half> @llvm.fma.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef, <vscale x 1 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = call <vscale x 2 x half> @llvm.fma.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef, <vscale x 2 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = call <vscale x 4 x half> @llvm.fma.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
@@ -583,8 +583,8 @@ define i32 @fma() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.fma.v1f32(<1 x float> undef, <1 x float> undef, <1 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = call <vscale x 1 x float> @llvm.fma.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef, <vscale x 1 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = call <vscale x 2 x float> @llvm.fma.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F32 = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef)
@@ -592,8 +592,8 @@ define i32 @fma() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV16F32 = call <vscale x 16 x float> @llvm.fma.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.fma.v1f64(<1 x double> undef, <1 x double> undef, <1 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = call <vscale x 1 x double> @llvm.fma.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef, <vscale x 1 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F64 = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F64 = call <vscale x 4 x double> @llvm.fma.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef)
@@ -651,15 +651,15 @@ define void @fmuladd() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = call <8 x half> @llvm.fmuladd.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %7 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %7 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %11 = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %13 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %14 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %15 = call <16 x double> @llvm.fmuladd.v16f64(<16 x double> undef, <16 x double> undef, <16 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %13 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = call <16 x double> @llvm.fmuladd.v16f64(<16 x double> undef, <16 x double> undef, <16 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call <vscale x 1 x half> @llvm.fmuladd.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef, <vscale x 1 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <vscale x 2 x half> @llvm.fmuladd.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef, <vscale x 2 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = call <vscale x 4 x half> @llvm.fmuladd.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
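The floating-point hunks above all move the wider fixed-length estimates (4 and 8) down to 2. That tracks the RUN-line change visible in the next file, where the -riscv-v-fixed-length-vector-lmul-max=1 cap is dropped: without the cap, a fixed <16 x float> can be legalized into a single LMUL>1 register group instead of being split. A minimal sketch for reproducing one such entry, assuming a local opt build; the file and function names are illustrative and not part of this commit:

; copysign-cost.ll: illustrative reproduction, mirrors the updated RUN lines
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh < %s | FileCheck %s
define void @one_entry(<16 x float> %a, <16 x float> %b) {
; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %c = call <16 x float> @llvm.copysign.v16f32
  %c = call <16 x float> @llvm.copysign.v16f32(<16 x float> %a, <16 x float> %b)
  ret void
}
declare <16 x float> @llvm.copysign.v16f32(<16 x float>, <16 x float>)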
diff --git a/llvm/test/Analysis/CostModel/RISCV/arith-int.ll b/llvm/test/Analysis/CostModel/RISCV/arith-int.ll
index 00f2cd7b63a4..b4afbb513166 100644
--- a/llvm/test/Analysis/CostModel/RISCV/arith-int.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/arith-int.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-fixed-length-vector-lmul-max=1 < %s | FileCheck %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mcpu=sifive-x280 -riscv-v-fixed-length-vector-lmul-max=1 < %s | FileCheck %s --check-prefix=SIFIVE-X280
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mcpu=sifive-x280 < %s | FileCheck %s --check-prefix=SIFIVE-X280
; Check that we don't crash querying costs when vectors are not enabled.
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64
@@ -709,8 +709,8 @@ define i32 @udiv() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = udiv <2 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = udiv <4 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = udiv <8 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = udiv <16 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = udiv <32 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = udiv <16 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = udiv <32 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I16 = udiv <vscale x 1 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I16 = udiv <vscale x 2 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I16 = udiv <vscale x 4 x i16> undef, undef
@@ -720,8 +720,8 @@ define i32 @udiv() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = udiv <1 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = udiv <2 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = udiv <4 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = udiv <8 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = udiv <16 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = udiv <8 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = udiv <16 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I32 = udiv <vscale x 1 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I32 = udiv <vscale x 2 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I32 = udiv <vscale x 4 x i32> undef, undef
@@ -729,8 +729,8 @@ define i32 @udiv() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV16I32 = udiv <vscale x 16 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = udiv <1 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = udiv <2 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = udiv <4 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = udiv <8 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = udiv <4 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = udiv <8 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I64 = udiv <vscale x 1 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I64 = udiv <vscale x 2 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I64 = udiv <vscale x 4 x i64> undef, undef
@@ -825,8 +825,8 @@ define i32 @urem() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = urem <2 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = urem <4 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = urem <8 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = urem <16 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = urem <32 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = urem <16 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = urem <32 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I16 = urem <vscale x 1 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I16 = urem <vscale x 2 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I16 = urem <vscale x 4 x i16> undef, undef
@@ -836,8 +836,8 @@ define i32 @urem() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = urem <1 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = urem <2 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = urem <4 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = urem <8 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = urem <16 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = urem <8 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = urem <16 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I32 = urem <vscale x 1 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I32 = urem <vscale x 2 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I32 = urem <vscale x 4 x i32> undef, undef
@@ -845,8 +845,8 @@ define i32 @urem() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV16I32 = urem <vscale x 16 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = urem <1 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = urem <2 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = urem <4 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = urem <8 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = urem <4 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = urem <8 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I64 = urem <vscale x 1 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I64 = urem <vscale x 2 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I64 = urem <vscale x 4 x i64> undef, undef
@@ -941,8 +941,8 @@ define i32 @sdiv() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = sdiv <2 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = sdiv <4 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = sdiv <8 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = sdiv <16 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = sdiv <32 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = sdiv <16 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = sdiv <32 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I16 = sdiv <vscale x 1 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I16 = sdiv <vscale x 2 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I16 = sdiv <vscale x 4 x i16> undef, undef
@@ -952,8 +952,8 @@ define i32 @sdiv() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = sdiv <1 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = sdiv <2 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = sdiv <4 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = sdiv <8 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = sdiv <16 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = sdiv <8 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = sdiv <16 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I32 = sdiv <vscale x 1 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I32 = sdiv <vscale x 2 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I32 = sdiv <vscale x 4 x i32> undef, undef
@@ -961,8 +961,8 @@ define i32 @sdiv() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV16I32 = sdiv <vscale x 16 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = sdiv <1 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = sdiv <2 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = sdiv <4 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = sdiv <8 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = sdiv <4 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = sdiv <8 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I64 = sdiv <vscale x 1 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I64 = sdiv <vscale x 2 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I64 = sdiv <vscale x 4 x i64> undef, undef
@@ -1057,8 +1057,8 @@ define i32 @srem() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = srem <2 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = srem <4 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = srem <8 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = srem <16 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = srem <32 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = srem <16 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = srem <32 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I16 = srem <vscale x 1 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I16 = srem <vscale x 2 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I16 = srem <vscale x 4 x i16> undef, undef
@@ -1068,8 +1068,8 @@ define i32 @srem() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = srem <1 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = srem <2 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = srem <4 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = srem <8 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = srem <16 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = srem <8 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = srem <16 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I32 = srem <vscale x 1 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I32 = srem <vscale x 2 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I32 = srem <vscale x 4 x i32> undef, undef
@@ -1077,8 +1077,8 @@ define i32 @srem() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV16I32 = srem <vscale x 16 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = srem <1 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = srem <2 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = srem <4 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = srem <8 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = srem <4 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = srem <8 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I64 = srem <vscale x 1 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I64 = srem <vscale x 2 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I64 = srem <vscale x 4 x i64> undef, undef
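The integer udiv/urem/sdiv/srem hunks follow the same pattern as the floating-point ones: with the LMUL cap gone, the 4 and 8 estimates for <8 x i32>-and-wider fixed vectors collapse to 2, while the scalable (vscale) entries stay at 1, presumably because the cap only ever constrained fixed-length lowering. A comparable sketch for one integer entry, again illustrative and assuming a local opt build:

; udiv-cost.ll: illustrative reproduction, mirrors the updated RUN lines
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh < %s | FileCheck %s
define void @one_entry() {
; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %q = udiv <8 x i32>
  %q = udiv <8 x i32> undef, undef
  ret void
}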
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
index 7cc7cff0e6e8..e068ab638d3a 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
@@ -78,148 +78,148 @@ declare <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x
define void @vector_reverse() {
; CHECK-LABEL: 'vector_reverse'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 166 for instruction: %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.experimental.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 332 for instruction: %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.experimental.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 166 for instruction: %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 332 for instruction: %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SIZE-LABEL: 'vector_reverse'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.experimental.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.experimental.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
- %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
- %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
- %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
- %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
- %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
- %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
- %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
- %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
- %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
- %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
- %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
- %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.experimental.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
- %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.experimental.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
- %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
- %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
- %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
- %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+ %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+ %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+ %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
+ %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
+ %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+ %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+ %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+ %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+ %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+ %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+ %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
+ %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
+ %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
+ %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+ %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+ %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+ %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
ret void
}
-declare <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8>)
-declare <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8>)
-declare <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16>)
-declare <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16>)
-declare <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32>)
-declare <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64>)
-declare <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64>)
-declare <vscale x 16 x i64> @llvm.experimental.vector.reverse.nxv16i64(<vscale x 16 x i64>)
-declare <vscale x 32 x i64> @llvm.experimental.vector.reverse.nxv32i64(<vscale x 32 x i64>)
-declare <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1>)
-declare <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1>)
-declare <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1>)
-declare <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1>)
+declare <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8>)
+declare <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8>)
+declare <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16>)
+declare <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16>)
+declare <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16>)
+declare <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16>)
+declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32>)
+declare <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64>)
+declare <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64>)
+declare <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64>)
+declare <vscale x 16 x i64> @llvm.vector.reverse.nxv16i64(<vscale x 16 x i64>)
+declare <vscale x 32 x i64> @llvm.vector.reverse.nxv32i64(<vscale x 32 x i64>)
+declare <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1>)
+declare <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1>)
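; The hunks in this file are a pure rename: @llvm.experimental.vector.reverse.*
; and @llvm.experimental.vector.splice.* become @llvm.vector.reverse.* and
; @llvm.vector.splice.*, with every cost estimate left unchanged. A minimal
; illustrative use of the renamed reverse intrinsic (a sketch, not part of the
; original test):
;   %r = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %v)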
define void @vector_splice() {
; CHECK-LABEL: 'vector_splice'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SIZE-LABEL: 'vector_splice'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
- %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
- %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
- %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
- %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
- %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
- %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
- %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
- %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
- %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
- %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
- %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
- %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
- %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
- %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
+ %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
+ %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
+ %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
+ %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
+ %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
+ %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
+ %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
+ %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
+ %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
+ %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
+ %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
+ %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
+ %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
+ %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
ret void
}
-declare <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
-declare <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
-declare <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
-declare <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
-declare <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
-declare <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
-declare <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
-declare <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
-declare <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
-declare <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
-declare <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
-declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
-declare <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
-declare <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
-declare <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
+declare <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
+declare <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
+declare <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
+declare <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
+declare <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
+declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+declare <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
+declare <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
+declare <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
diff --git a/llvm/test/Analysis/CostModel/RISCV/splice.ll b/llvm/test/Analysis/CostModel/RISCV/splice.ll
index c70c879dba5a..9acccef9c4f6 100644
--- a/llvm/test/Analysis/CostModel/RISCV/splice.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/splice.ll
@@ -4,220 +4,220 @@
define void @vector_splice() {
; CHECK-LABEL: 'vector_splice'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i8 = call <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i8 = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i8 = call <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i8 = call <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8> zeroinitializer, <vscale x 8 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64i8 = call <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8> zeroinitializer, <vscale x 64 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i16 = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv32i16 = call <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16> zeroinitializer, <vscale x 32 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv64i16 = call <vscale x 64 x i16> @llvm.experimental.vector.splice.nxv64i16(<vscale x 64 x i16> zeroinitializer, <vscale x 64 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i32 = call <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32> zeroinitializer, <vscale x 1 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i32 = call <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv16i32 = call <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv32i32 = call <vscale x 32 x i32> @llvm.experimental.vector.splice.nxv32i32(<vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv64i32 = call <vscale x 64 x i32> @llvm.experimental.vector.splice.nxv64i32(<vscale x 64 x i32> zeroinitializer, <vscale x 64 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i64 = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv8i64 = call <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64> zeroinitializer, <vscale x 8 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.experimental.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.experimental.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.experimental.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f16 = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv32f16 = call <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half> zeroinitializer, <vscale x 32 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv64f16 = call <vscale x 64 x half> @llvm.experimental.vector.splice.nxv64f16(<vscale x 64 x half> zeroinitializer, <vscale x 64 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f32 = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv16f32 = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> zeroinitializer, <vscale x 16 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv32f32 = call <vscale x 32 x float> @llvm.experimental.vector.splice.nxv32f32(<vscale x 32 x float> zeroinitializer, <vscale x 32 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv64f32 = call <vscale x 64 x float> @llvm.experimental.vector.splice.nxv64f32(<vscale x 64 x float> zeroinitializer, <vscale x 64 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f64 = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv8f64 = call <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double> zeroinitializer, <vscale x 8 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv16f64 = call <vscale x 16 x double> @llvm.experimental.vector.splice.nxv16f64(<vscale x 16 x double> zeroinitializer, <vscale x 16 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv32f64 = call <vscale x 32 x double> @llvm.experimental.vector.splice.nxv32f64(<vscale x 32 x double> zeroinitializer, <vscale x 32 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %splice.nxv64f64 = call <vscale x 64 x double> @llvm.experimental.vector.splice.nxv64f64(<vscale x 64 x double> zeroinitializer, <vscale x 64 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i8 = call <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i8 = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i8 = call <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i8 = call <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8> zeroinitializer, <vscale x 8 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64i8 = call <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8> zeroinitializer, <vscale x 64 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i16 = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv32i16 = call <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16> zeroinitializer, <vscale x 32 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv64i16 = call <vscale x 64 x i16> @llvm.vector.splice.nxv64i16(<vscale x 64 x i16> zeroinitializer, <vscale x 64 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i32 = call <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32> zeroinitializer, <vscale x 1 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i32 = call <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv16i32 = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv32i32 = call <vscale x 32 x i32> @llvm.vector.splice.nxv32i32(<vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv64i32 = call <vscale x 64 x i32> @llvm.vector.splice.nxv64i32(<vscale x 64 x i32> zeroinitializer, <vscale x 64 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i64 = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv8i64 = call <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64> zeroinitializer, <vscale x 8 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f16 = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv8f16 = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv16f16 = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv32f16 = call <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half> zeroinitializer, <vscale x 32 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv64f16 = call <vscale x 64 x half> @llvm.vector.splice.nxv64f16(<vscale x 64 x half> zeroinitializer, <vscale x 64 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f32 = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f32 = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv4f32 = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv8f32 = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv16f32 = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> zeroinitializer, <vscale x 16 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv32f32 = call <vscale x 32 x float> @llvm.vector.splice.nxv32f32(<vscale x 32 x float> zeroinitializer, <vscale x 32 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv64f32 = call <vscale x 64 x float> @llvm.vector.splice.nxv64f32(<vscale x 64 x float> zeroinitializer, <vscale x 64 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f64 = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv2f64 = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv4f64 = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv8f64 = call <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double> zeroinitializer, <vscale x 8 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv16f64 = call <vscale x 16 x double> @llvm.vector.splice.nxv16f64(<vscale x 16 x double> zeroinitializer, <vscale x 16 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv32f64 = call <vscale x 32 x double> @llvm.vector.splice.nxv32f64(<vscale x 32 x double> zeroinitializer, <vscale x 32 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %splice.nxv64f64 = call <vscale x 64 x double> @llvm.vector.splice.nxv64f64(<vscale x 64 x double> zeroinitializer, <vscale x 64 x double> zeroinitializer, i32 -1)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SIZE-LABEL: 'vector_splice'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i8 = call <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i8 = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x i8> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i8 = call <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x i8> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i8 = call <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8> zeroinitializer, <vscale x 8 x i8> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv64i8 = call <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8> zeroinitializer, <vscale x 64 x i8> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i16 = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv32i16 = call <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16> zeroinitializer, <vscale x 32 x i16> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv64i16 = call <vscale x 64 x i16> @llvm.experimental.vector.splice.nxv64i16(<vscale x 64 x i16> zeroinitializer, <vscale x 64 x i16> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i32 = call <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32> zeroinitializer, <vscale x 1 x i32> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i32 = call <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16i32 = call <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv32i32 = call <vscale x 32 x i32> @llvm.experimental.vector.splice.nxv32i32(<vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv64i32 = call <vscale x 64 x i32> @llvm.experimental.vector.splice.nxv64i32(<vscale x 64 x i32> zeroinitializer, <vscale x 64 x i32> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i64 = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i64 = call <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64> zeroinitializer, <vscale x 8 x i64> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.experimental.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.experimental.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.experimental.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f16 = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv32f16 = call <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half> zeroinitializer, <vscale x 32 x half> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv64f16 = call <vscale x 64 x half> @llvm.experimental.vector.splice.nxv64f16(<vscale x 64 x half> zeroinitializer, <vscale x 64 x half> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f32 = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16f32 = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> zeroinitializer, <vscale x 16 x float> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv32f32 = call <vscale x 32 x float> @llvm.experimental.vector.splice.nxv32f32(<vscale x 32 x float> zeroinitializer, <vscale x 32 x float> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv64f32 = call <vscale x 64 x float> @llvm.experimental.vector.splice.nxv64f32(<vscale x 64 x float> zeroinitializer, <vscale x 64 x float> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f64 = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8f64 = call <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double> zeroinitializer, <vscale x 8 x double> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16f64 = call <vscale x 16 x double> @llvm.experimental.vector.splice.nxv16f64(<vscale x 16 x double> zeroinitializer, <vscale x 16 x double> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32f64 = call <vscale x 32 x double> @llvm.experimental.vector.splice.nxv32f64(<vscale x 32 x double> zeroinitializer, <vscale x 32 x double> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64f64 = call <vscale x 64 x double> @llvm.experimental.vector.splice.nxv64f64(<vscale x 64 x double> zeroinitializer, <vscale x 64 x double> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i8 = call <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i8 = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x i8> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i8 = call <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x i8> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i8 = call <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8> zeroinitializer, <vscale x 8 x i8> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv64i8 = call <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8> zeroinitializer, <vscale x 64 x i8> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i16 = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv32i16 = call <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16> zeroinitializer, <vscale x 32 x i16> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv64i16 = call <vscale x 64 x i16> @llvm.vector.splice.nxv64i16(<vscale x 64 x i16> zeroinitializer, <vscale x 64 x i16> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i32 = call <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32> zeroinitializer, <vscale x 1 x i32> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i32 = call <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16i32 = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv32i32 = call <vscale x 32 x i32> @llvm.vector.splice.nxv32i32(<vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv64i32 = call <vscale x 64 x i32> @llvm.vector.splice.nxv64i32(<vscale x 64 x i32> zeroinitializer, <vscale x 64 x i32> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i64 = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i64 = call <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64> zeroinitializer, <vscale x 8 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f16 = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8f16 = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16f16 = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv32f16 = call <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half> zeroinitializer, <vscale x 32 x half> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv64f16 = call <vscale x 64 x half> @llvm.vector.splice.nxv64f16(<vscale x 64 x half> zeroinitializer, <vscale x 64 x half> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f32 = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f32 = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f32 = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8f32 = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16f32 = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> zeroinitializer, <vscale x 16 x float> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv32f32 = call <vscale x 32 x float> @llvm.vector.splice.nxv32f32(<vscale x 32 x float> zeroinitializer, <vscale x 32 x float> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv64f32 = call <vscale x 64 x float> @llvm.vector.splice.nxv64f32(<vscale x 64 x float> zeroinitializer, <vscale x 64 x float> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f64 = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f64 = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f64 = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8f64 = call <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double> zeroinitializer, <vscale x 8 x double> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16f64 = call <vscale x 16 x double> @llvm.vector.splice.nxv16f64(<vscale x 16 x double> zeroinitializer, <vscale x 16 x double> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32f64 = call <vscale x 32 x double> @llvm.vector.splice.nxv32f64(<vscale x 32 x double> zeroinitializer, <vscale x 32 x double> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64f64 = call <vscale x 64 x double> @llvm.vector.splice.nxv64f64(<vscale x 64 x double> zeroinitializer, <vscale x 64 x double> zeroinitializer, i32 -1)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
- %splice.nxv1i8 = call <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8> zeroinitializer, i32 -1)
- %splice.nxv2i8 = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x i8> zeroinitializer, i32 -1)
- %splice.nxv4i8 = call <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x i8> zeroinitializer, i32 -1)
- %splice.nxv8i8 = call <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8> zeroinitializer, <vscale x 8 x i8> zeroinitializer, i32 -1)
- %splice.nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
- %splice.nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
- %splice.nxv64i8 = call <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8> zeroinitializer, <vscale x 64 x i8> zeroinitializer, i32 -1)
+ %splice.nxv1i8 = call <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8> zeroinitializer, i32 -1)
+ %splice.nxv2i8 = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x i8> zeroinitializer, i32 -1)
+ %splice.nxv4i8 = call <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x i8> zeroinitializer, i32 -1)
+ %splice.nxv8i8 = call <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8> zeroinitializer, <vscale x 8 x i8> zeroinitializer, i32 -1)
+ %splice.nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
+ %splice.nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
+ %splice.nxv64i8 = call <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8> zeroinitializer, <vscale x 64 x i8> zeroinitializer, i32 -1)

- %splice.nxv1i16 = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
- %splice.nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
- %splice.nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
- %splice.nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
- %splice.nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
- %splice.nxv32i16 = call <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16> zeroinitializer, <vscale x 32 x i16> zeroinitializer, i32 -1)
- %splice.nxv64i16 = call <vscale x 64 x i16> @llvm.experimental.vector.splice.nxv64i16(<vscale x 64 x i16> zeroinitializer, <vscale x 64 x i16> zeroinitializer, i32 -1)
+ %splice.nxv1i16 = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
+ %splice.nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
+ %splice.nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
+ %splice.nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
+ %splice.nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
+ %splice.nxv32i16 = call <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16> zeroinitializer, <vscale x 32 x i16> zeroinitializer, i32 -1)
+ %splice.nxv64i16 = call <vscale x 64 x i16> @llvm.vector.splice.nxv64i16(<vscale x 64 x i16> zeroinitializer, <vscale x 64 x i16> zeroinitializer, i32 -1)
- %splice.nxv1i32 = call <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32> zeroinitializer, <vscale x 1 x i32> zeroinitializer, i32 -1)
- %splice.nxv2i32 = call <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> zeroinitializer, i32 -1)
- %splice.nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
- %splice.nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
- %splice.nxv16i32 = call <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> zeroinitializer, i32 -1)
- %splice.nxv32i32 = call <vscale x 32 x i32> @llvm.experimental.vector.splice.nxv32i32(<vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, i32 -1)
- %splice.nxv64i32 = call <vscale x 64 x i32> @llvm.experimental.vector.splice.nxv64i32(<vscale x 64 x i32> zeroinitializer, <vscale x 64 x i32> zeroinitializer, i32 -1)
+ %splice.nxv1i32 = call <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32> zeroinitializer, <vscale x 1 x i32> zeroinitializer, i32 -1)
+ %splice.nxv2i32 = call <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> zeroinitializer, i32 -1)
+ %splice.nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
+ %splice.nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
+ %splice.nxv16i32 = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> zeroinitializer, i32 -1)
+ %splice.nxv32i32 = call <vscale x 32 x i32> @llvm.vector.splice.nxv32i32(<vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, i32 -1)
+ %splice.nxv64i32 = call <vscale x 64 x i32> @llvm.vector.splice.nxv64i32(<vscale x 64 x i32> zeroinitializer, <vscale x 64 x i32> zeroinitializer, i32 -1)
- %splice.nxv1i64 = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
- %splice.nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
- %splice.nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
- %splice.nxv8i64 = call <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64> zeroinitializer, <vscale x 8 x i64> zeroinitializer, i32 -1)
- %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.experimental.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
- %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.experimental.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
- %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.experimental.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
+ %splice.nxv1i64 = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
+ %splice.nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
+ %splice.nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
+ %splice.nxv8i64 = call <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64> zeroinitializer, <vscale x 8 x i64> zeroinitializer, i32 -1)
+ %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
+ %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
+ %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
- %splice.nxv1f16 = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
- %splice.nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
- %splice.nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
- %splice.nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
- %splice.nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
- %splice.nxv32f16 = call <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half> zeroinitializer, <vscale x 32 x half> zeroinitializer, i32 -1)
- %splice.nxv64f16 = call <vscale x 64 x half> @llvm.experimental.vector.splice.nxv64f16(<vscale x 64 x half> zeroinitializer, <vscale x 64 x half> zeroinitializer, i32 -1)
+ %splice.nxv1f16 = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
+ %splice.nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
+ %splice.nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
+ %splice.nxv8f16 = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
+ %splice.nxv16f16 = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
+ %splice.nxv32f16 = call <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half> zeroinitializer, <vscale x 32 x half> zeroinitializer, i32 -1)
+ %splice.nxv64f16 = call <vscale x 64 x half> @llvm.vector.splice.nxv64f16(<vscale x 64 x half> zeroinitializer, <vscale x 64 x half> zeroinitializer, i32 -1)
- %splice.nxv1f32 = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
- %splice.nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
- %splice.nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
- %splice.nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
- %splice.nxv16f32 = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> zeroinitializer, <vscale x 16 x float> zeroinitializer, i32 -1)
- %splice.nxv32f32 = call <vscale x 32 x float> @llvm.experimental.vector.splice.nxv32f32(<vscale x 32 x float> zeroinitializer, <vscale x 32 x float> zeroinitializer, i32 -1)
- %splice.nxv64f32 = call <vscale x 64 x float> @llvm.experimental.vector.splice.nxv64f32(<vscale x 64 x float> zeroinitializer, <vscale x 64 x float> zeroinitializer, i32 -1)
+ %splice.nxv1f32 = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
+ %splice.nxv2f32 = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
+ %splice.nxv4f32 = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
+ %splice.nxv8f32 = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
+ %splice.nxv16f32 = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> zeroinitializer, <vscale x 16 x float> zeroinitializer, i32 -1)
+ %splice.nxv32f32 = call <vscale x 32 x float> @llvm.vector.splice.nxv32f32(<vscale x 32 x float> zeroinitializer, <vscale x 32 x float> zeroinitializer, i32 -1)
+ %splice.nxv64f32 = call <vscale x 64 x float> @llvm.vector.splice.nxv64f32(<vscale x 64 x float> zeroinitializer, <vscale x 64 x float> zeroinitializer, i32 -1)
- %splice.nxv1f64 = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
- %splice.nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
- %splice.nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
- %splice.nxv8f64 = call <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double> zeroinitializer, <vscale x 8 x double> zeroinitializer, i32 -1)
- %splice.nxv16f64 = call <vscale x 16 x double> @llvm.experimental.vector.splice.nxv16f64(<vscale x 16 x double> zeroinitializer, <vscale x 16 x double> zeroinitializer, i32 -1)
- %splice.nxv32f64 = call <vscale x 32 x double> @llvm.experimental.vector.splice.nxv32f64(<vscale x 32 x double> zeroinitializer, <vscale x 32 x double> zeroinitializer, i32 -1)
- %splice.nxv64f64 = call <vscale x 64 x double> @llvm.experimental.vector.splice.nxv64f64(<vscale x 64 x double> zeroinitializer, <vscale x 64 x double> zeroinitializer, i32 -1)
+ %splice.nxv1f64 = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
+ %splice.nxv2f64 = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
+ %splice.nxv4f64 = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
+ %splice.nxv8f64 = call <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double> zeroinitializer, <vscale x 8 x double> zeroinitializer, i32 -1)
+ %splice.nxv16f64 = call <vscale x 16 x double> @llvm.vector.splice.nxv16f64(<vscale x 16 x double> zeroinitializer, <vscale x 16 x double> zeroinitializer, i32 -1)
+ %splice.nxv32f64 = call <vscale x 32 x double> @llvm.vector.splice.nxv32f64(<vscale x 32 x double> zeroinitializer, <vscale x 32 x double> zeroinitializer, i32 -1)
+ %splice.nxv64f64 = call <vscale x 64 x double> @llvm.vector.splice.nxv64f64(<vscale x 64 x double> zeroinitializer, <vscale x 64 x double> zeroinitializer, i32 -1)
ret void
}
-declare <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, i32)
-declare <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
-declare <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, i32)
-declare <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, i32)
-declare <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
-declare <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
-declare <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8>, <vscale x 64 x i8>, i32)
+declare <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, i32)
+declare <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
+declare <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, i32)
+declare <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, i32)
+declare <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
+declare <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
+declare <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8>, <vscale x 64 x i8>, i32)
-declare <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, i32)
-declare <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
-declare <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
-declare <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
-declare <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
-declare <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>, i32)
-declare <vscale x 64 x i16> @llvm.experimental.vector.splice.nxv64i16(<vscale x 64 x i16>, <vscale x 64 x i16>, i32)
+declare <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, i32)
+declare <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
+declare <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
+declare <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>, i32)
+declare <vscale x 64 x i16> @llvm.vector.splice.nxv64i16(<vscale x 64 x i16>, <vscale x 64 x i16>, i32)
-declare <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32>, <vscale x 1 x i32>, i32)
-declare <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, i32)
-declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
-declare <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
-declare <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i32>, i32)
-declare <vscale x 32 x i32> @llvm.experimental.vector.splice.nxv32i32(<vscale x 32 x i32>, <vscale x 32 x i32>, i32)
-declare <vscale x 64 x i32> @llvm.experimental.vector.splice.nxv64i32(<vscale x 64 x i32>, <vscale x 64 x i32>, i32)
+declare <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32>, <vscale x 1 x i32>, i32)
+declare <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, i32)
+declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+declare <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
+declare <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i32>, i32)
+declare <vscale x 32 x i32> @llvm.vector.splice.nxv32i32(<vscale x 32 x i32>, <vscale x 32 x i32>, i32)
+declare <vscale x 64 x i32> @llvm.vector.splice.nxv64i32(<vscale x 64 x i32>, <vscale x 64 x i32>, i32)
-declare <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, i32)
-declare <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
-declare <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
-declare <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, i32)
-declare <vscale x 16 x i64> @llvm.experimental.vector.splice.nxv16i64(<vscale x 16 x i64>, <vscale x 16 x i64>, i32)
-declare <vscale x 32 x i64> @llvm.experimental.vector.splice.nxv32i64(<vscale x 32 x i64>, <vscale x 32 x i64>, i32)
-declare <vscale x 64 x i64> @llvm.experimental.vector.splice.nxv64i64(<vscale x 64 x i64>, <vscale x 64 x i64>, i32)
+declare <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, i32)
+declare <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
+declare <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
+declare <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, i32)
+declare <vscale x 16 x i64> @llvm.vector.splice.nxv16i64(<vscale x 16 x i64>, <vscale x 16 x i64>, i32)
+declare <vscale x 32 x i64> @llvm.vector.splice.nxv32i64(<vscale x 32 x i64>, <vscale x 32 x i64>, i32)
+declare <vscale x 64 x i64> @llvm.vector.splice.nxv64i64(<vscale x 64 x i64>, <vscale x 64 x i64>, i32)
-declare <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, i32)
-declare <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
-declare <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
-declare <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
-declare <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>, i32)
-declare <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half>, <vscale x 32 x half>, i32)
-declare <vscale x 64 x half> @llvm.experimental.vector.splice.nxv64f16(<vscale x 64 x half>, <vscale x 64 x half>, i32)
+declare <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, i32)
+declare <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
+declare <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
+declare <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>, i32)
+declare <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half>, <vscale x 32 x half>, i32)
+declare <vscale x 64 x half> @llvm.vector.splice.nxv64f16(<vscale x 64 x half>, <vscale x 64 x half>, i32)
-declare <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float>, <vscale x 1 x float>, i32)
-declare <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
-declare <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
-declare <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>, i32)
-declare <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
-declare <vscale x 32 x float> @llvm.experimental.vector.splice.nxv32f32(<vscale x 32 x float>, <vscale x 32 x float>, i32)
-declare <vscale x 64 x float> @llvm.experimental.vector.splice.nxv64f32(<vscale x 64 x float>, <vscale x 64 x float>, i32)
+declare <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float>, <vscale x 1 x float>, i32)
+declare <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
+declare <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
+declare <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>, i32)
+declare <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
+declare <vscale x 32 x float> @llvm.vector.splice.nxv32f32(<vscale x 32 x float>, <vscale x 32 x float>, i32)
+declare <vscale x 64 x float> @llvm.vector.splice.nxv64f32(<vscale x 64 x float>, <vscale x 64 x float>, i32)
-declare <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double>, <vscale x 1 x double>, i32)
-declare <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
-declare <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>, i32)
-declare <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double>, <vscale x 8 x double>, i32)
-declare <vscale x 16 x double> @llvm.experimental.vector.splice.nxv16f64(<vscale x 16 x double>, <vscale x 16 x double>, i32)
-declare <vscale x 32 x double> @llvm.experimental.vector.splice.nxv32f64(<vscale x 32 x double>, <vscale x 32 x double>, i32)
-declare <vscale x 64 x double> @llvm.experimental.vector.splice.nxv64f64(<vscale x 64 x double>, <vscale x 64 x double>, i32)
+declare <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double>, <vscale x 1 x double>, i32)
+declare <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
+declare <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>, i32)
+declare <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double>, <vscale x 8 x double>, i32)
+declare <vscale x 16 x double> @llvm.vector.splice.nxv16f64(<vscale x 16 x double>, <vscale x 16 x double>, i32)
+declare <vscale x 32 x double> @llvm.vector.splice.nxv32f64(<vscale x 32 x double>, <vscale x 32 x double>, i32)
+declare <vscale x 64 x double> @llvm.vector.splice.nxv64f64(<vscale x 64 x double>, <vscale x 64 x double>, i32)
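The rename above is purely mechanical: the `experimental` component is dropped from the splice intrinsic's name while types and operands stay untouched. As for what the calls exercise, the trailing i32 immediate selects a window from the concatenation of the two operands, which is why every call above passes -1 to probe the "last element of the first vector" case. A minimal fixed-width sketch, assuming the LangRef splice semantics (illustration only, not part of the patch):

; v4i32 chosen for brevity.
define <4 x i32> @splice_examples(<4 x i32> %a, <4 x i32> %b) {
  ; With %a = <A0,A1,A2,A3> and %b = <B0,B1,B2,B3>:
  ; imm = 1 starts the window at element 1 of the concatenation: <A1,A2,A3,B0>
  %pos = call <4 x i32> @llvm.vector.splice.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1)
  ; imm = -1 takes the last element of %a, then fills from %b: <A3,B0,B1,B2>
  %neg = call <4 x i32> @llvm.vector.splice.v4i32(<4 x i32> %a, <4 x i32> %b, i32 -1)
  ret <4 x i32> %neg
}
declare <4 x i32> @llvm.vector.splice.v4i32(<4 x i32>, <4 x i32>, i32)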
diff --git a/llvm/test/Bitcode/upgrade-vector-interleave2-deinterleave2-intrinsics.ll b/llvm/test/Bitcode/upgrade-vector-interleave2-deinterleave2-intrinsics.ll
new file mode 100644
index 000000000000..f06395945297
--- /dev/null
+++ b/llvm/test/Bitcode/upgrade-vector-interleave2-deinterleave2-intrinsics.ll
@@ -0,0 +1,46 @@
+; RUN: opt -S < %s | FileCheck %s
+; RUN: llvm-as %s -o - | llvm-dis | FileCheck %s
+
+define <8 x i32> @interleave_fixed(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @interleave_fixed
+; CHECK: %res = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b)
+
+ %res = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b)
+ ret <8 x i32> %res
+}
+
+define { <4 x i32>, <4 x i32> } @deinterleave_fixed(<8 x i32> %a) {
+; CHECK-LABEL: @deinterleave_fixed
+; CHECK: %res = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %a)
+
+ %res = call { <4 x i32>, <4 x i32> } @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %a)
+ ret { <4 x i32>, <4 x i32> } %res
+}
+
+define <vscale x 8 x i32> @interleave_scalable(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: @interleave_scalable
+; CHECK: %res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+
+ %res = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+ ret <vscale x 8 x i32> %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32> } @deinterleave_scalable(<vscale x 8 x i32> %a) {
+; CHECK-LABEL: @deinterleave_scalable
+; CHECK: %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %a)
+
+ %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %a)
+ ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
+; CHECK: declare <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
+
+declare { <4 x i32>, <4 x i32> } @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>)
+; CHECK: declare { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32>)
+
+declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+; CHECK: declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+; CHECK: declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
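Both RUN lines in the new test take the rename through a different path: `opt -S` upgrades the textual IR directly, while `llvm-as | llvm-dis` verifies that the bitcode reader applies the same auto-upgrade on a round trip. Semantically, interleave2 zips its two operands lane by lane and deinterleave2 is its inverse; a hedged roundtrip sketch (illustrative, not part of the patch):

; interleave2(<A0,A1>, <B0,B1>) produces <A0,B0,A1,B1>; deinterleave2 splits
; it back into { <A0,A1>, <B0,B1> }, so %even below is just %a again.
define <2 x i32> @roundtrip(<2 x i32> %a, <2 x i32> %b) {
  %zip   = call <4 x i32> @llvm.vector.interleave2.v4i32(<2 x i32> %a, <2 x i32> %b)
  %unzip = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %zip)
  %even  = extractvalue { <2 x i32>, <2 x i32> } %unzip, 0
  ret <2 x i32> %even
}
declare <4 x i32> @llvm.vector.interleave2.v4i32(<2 x i32>, <2 x i32>)
declare { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32>)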
diff --git a/llvm/test/Bitcode/upgrade-vector-reverse-intrinsic.ll b/llvm/test/Bitcode/upgrade-vector-reverse-intrinsic.ll
new file mode 100644
index 000000000000..6b853eaf4175
--- /dev/null
+++ b/llvm/test/Bitcode/upgrade-vector-reverse-intrinsic.ll
@@ -0,0 +1,24 @@
+; RUN: opt -S < %s | FileCheck %s
+; RUN: llvm-as %s -o - | llvm-dis | FileCheck %s
+
+define <16 x i8> @reverse_fixed(<16 x i8> %a) {
+; CHECK-LABEL: @reverse_fixed
+; CHECK: %res = call <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8> %a)
+
+ %res = call <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8> %a)
+ ret <16 x i8> %res
+}
+
+define <vscale x 16 x i8> @reverse_scalable(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: @reverse_scalable
+; CHECK: %res = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> %a)
+
+ %res = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> %a)
+ ret <vscale x 16 x i8> %res
+}
+
+declare <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8>)
+; CHECK: declare <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8>)
+
+declare <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8>)
+; CHECK: declare <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8>)
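The reverse upgrade follows the same declare-and-check pattern. For reference, `llvm.vector.reverse` emits the lanes of its operand in the opposite order, for fixed and scalable types alike; a one-liner sketch (not part of the patch):

define <4 x i32> @reverse_example(<4 x i32> %a) {
  ; reverse(<A0,A1,A2,A3>) yields <A3,A2,A1,A0>
  %rev = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> %a)
  ret <4 x i32> %rev
}
declare <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32>)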
diff --git a/llvm/test/Bitcode/upgrade-vector-splice-intrinsic.ll b/llvm/test/Bitcode/upgrade-vector-splice-intrinsic.ll
new file mode 100644
index 000000000000..1b55da21ecd2
--- /dev/null
+++ b/llvm/test/Bitcode/upgrade-vector-splice-intrinsic.ll
@@ -0,0 +1,24 @@
+; RUN: opt -S < %s | FileCheck %s
+; RUN: llvm-as %s -o - | llvm-dis | FileCheck %s
+
+define <8 x half> @splice_fixed(<8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: @splice_fixed
+; CHECK: %res = call <8 x half> @llvm.vector.splice.v8f16(<8 x half> %a, <8 x half> %b, i32 2)
+
+ %res = call <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half> %a, <8 x half> %b, i32 2)
+ ret <8 x half> %res
+}
+
+define <vscale x 8 x half> @splice_scalable(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: @splice_scalable
+; CHECK: %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 2)
+
+ %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 2)
+ ret <vscale x 8 x half> %res
+}
+
+declare <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half>, <8 x half>, i32 immarg)
+; CHECK: declare <8 x half> @llvm.vector.splice.v8f16(<8 x half>, <8 x half>, i32 immarg)
+
+declare <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32 immarg)
+; CHECK: declare <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32 immarg)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-deinterleave2.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-deinterleave2.ll
index 10882a06af1b..0b7fae47a65a 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-deinterleave2.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-deinterleave2.ll
@@ -11,7 +11,7 @@ define void @vector_deinterleave2_v4i32(<4 x i32> %a) {
; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[DEF]], shufflemask(0, 2)
; CHECK-NEXT: [[SHUF1:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[DEF]], shufflemask(1, 3)
; CHECK-NEXT: RET_ReallyLR
- %res = call {<2 x i32>, <2 x i32>} @llvm.experimental.vector.deinterleave2.v4i32(<4 x i32> %a)
+ %res = call {<2 x i32>, <2 x i32>} @llvm.vector.deinterleave2.v4i32(<4 x i32> %a)
ret void
}
@@ -29,6 +29,6 @@ define void @vector_deinterleave2_v8f32(<8 x float> %a) {
; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<8 x s32>), [[DEF]], shufflemask(0, 2, 4, 6)
; CHECK-NEXT: [[SHUF1:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<8 x s32>), [[DEF]], shufflemask(1, 3, 5, 7)
; CHECK-NEXT: RET_ReallyLR
- %res = call {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %a)
+ %res = call {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float> %a)
ret void
}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-interleave2.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-interleave2.ll
index f51e47a428d1..0d8ac82c1051 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-interleave2.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-interleave2.ll
@@ -10,7 +10,7 @@ define void @vector_interleave2_v4i32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d1
; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s32>), [[COPY1]], shufflemask(0, 2, 1, 3)
; CHECK-NEXT: RET_ReallyLR
- %res = call <4 x i32> @llvm.experimental.vector.interleave2.v4i32(<2 x i32> %a, <2 x i32> %b)
+ %res = call <4 x i32> @llvm.vector.interleave2.v4i32(<2 x i32> %a, <2 x i32> %b)
ret void
}
@@ -25,6 +25,6 @@ define void @vector_interleave2_v8f32(<4 x float> %a, <4 x float> %b) {
; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x s64>)
; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<8 x s32>) = G_SHUFFLE_VECTOR [[BITCAST]](<4 x s32>), [[BITCAST1]], shufflemask(0, 4, 1, 5, 2, 6, 3, 7)
; CHECK-NEXT: RET_ReallyLR
- %res = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b)
+ %res = call <8 x float> @llvm.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b)
ret void
}
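The irtranslator output makes the fixed-width lowering explicit: for non-scalable vectors, interleave2 and deinterleave2 are plain shuffles. An IR-level restatement of the masks appearing in the MIR checks above (illustrative, not part of the patch):

; shufflemask(0, 2, 1, 3) over the concatenation <a0,a1,b0,b1> is the
; interleave; masks (0, 2) and (1, 3) pick out the even and odd lanes
; for the deinterleave.
define <4 x i32> @interleave_as_shuffle(<2 x i32> %a, <2 x i32> %b) {
  %zip = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x i32> %zip
}
define <2 x i32> @deinterleave_even_as_shuffle(<4 x i32> %v) {
  %even = shufflevector <4 x i32> %v, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  ret <2 x i32> %even
}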
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll
index 86b1d5d195ff..0485d530fd06 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll
@@ -25,10 +25,10 @@ define <vscale x 4 x double> @mull_add(<vscale x 4 x double> %a, <vscale x 4 x d
; CHECK-NEXT: zip2 z1.d, z2.d, z1.d
; CHECK-NEXT: ret
entry:
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec29 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec29 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec29, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec29, 1
%4 = fmul contract <vscale x 2 x double> %0, %3
@@ -37,12 +37,12 @@ entry:
%7 = fmul contract <vscale x 2 x double> %0, %2
%8 = fmul contract <vscale x 2 x double> %1, %3
%9 = fsub contract <vscale x 2 x double> %7, %8
- %strided.vec31 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %strided.vec31 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
%10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec31, 0
%11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec31, 1
%12 = fadd contract <vscale x 2 x double> %10, %9
%13 = fadd contract <vscale x 2 x double> %6, %11
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %12, <vscale x 2 x double> %13)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %12, <vscale x 2 x double> %13)
ret <vscale x 4 x double> %interleaved.vec
}
@@ -67,10 +67,10 @@ define <vscale x 4 x double> @mul_add_mull(<vscale x 4 x double> %a, <vscale x 4
; CHECK-NEXT: fadd z0.d, z25.d, z27.d
; CHECK-NEXT: ret
entry:
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 1
%4 = fmul contract <vscale x 2 x double> %0, %3
@@ -79,10 +79,10 @@ entry:
%7 = fmul contract <vscale x 2 x double> %0, %2
%8 = fmul contract <vscale x 2 x double> %1, %3
%9 = fsub contract <vscale x 2 x double> %7, %8
- %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
%10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 0
%11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 1
- %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
%12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 0
%13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 1
%14 = fmul contract <vscale x 2 x double> %10, %13
@@ -93,7 +93,7 @@ entry:
%19 = fsub contract <vscale x 2 x double> %17, %18
%20 = fadd contract <vscale x 2 x double> %9, %19
%21 = fadd contract <vscale x 2 x double> %6, %16
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
ret <vscale x 4 x double> %interleaved.vec
}
@@ -118,10 +118,10 @@ define <vscale x 4 x double> @mul_sub_mull(<vscale x 4 x double> %a, <vscale x 4
; CHECK-NEXT: fsub z0.d, z25.d, z27.d
; CHECK-NEXT: ret
entry:
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 1
%4 = fmul contract <vscale x 2 x double> %0, %3
@@ -130,10 +130,10 @@ entry:
%7 = fmul contract <vscale x 2 x double> %0, %2
%8 = fmul contract <vscale x 2 x double> %1, %3
%9 = fsub contract <vscale x 2 x double> %7, %8
- %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
%10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 0
%11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 1
- %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
%12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 0
%13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 1
%14 = fmul contract <vscale x 2 x double> %10, %13
@@ -144,7 +144,7 @@ entry:
%19 = fsub contract <vscale x 2 x double> %17, %18
%20 = fsub contract <vscale x 2 x double> %9, %19
%21 = fsub contract <vscale x 2 x double> %6, %16
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
ret <vscale x 4 x double> %interleaved.vec
}
@@ -169,10 +169,10 @@ define <vscale x 4 x double> @mul_conj_mull(<vscale x 4 x double> %a, <vscale x
; CHECK-NEXT: fadd z0.d, z25.d, z27.d
; CHECK-NEXT: ret
entry:
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec60 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec60 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec60, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec60, 1
%4 = fmul contract <vscale x 2 x double> %0, %3
@@ -181,10 +181,10 @@ entry:
%7 = fmul contract <vscale x 2 x double> %0, %2
%8 = fmul contract <vscale x 2 x double> %1, %3
%9 = fsub contract <vscale x 2 x double> %7, %8
- %strided.vec62 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %strided.vec62 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
%10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec62, 0
%11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec62, 1
- %strided.vec64 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %strided.vec64 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
%12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec64, 0
%13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec64, 1
%14 = fmul contract <vscale x 2 x double> %10, %13
@@ -195,7 +195,7 @@ entry:
%19 = fadd contract <vscale x 2 x double> %17, %18
%20 = fadd contract <vscale x 2 x double> %9, %19
%21 = fadd contract <vscale x 2 x double> %6, %16
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
ret <vscale x 4 x double> %interleaved.vec
}
@@ -238,10 +238,10 @@ define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, <vscale
; CHECK-NEXT: zip2 z1.d, z2.d, z1.d
; CHECK-NEXT: ret
entry:
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec78 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec78 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec78, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec78, 1
%4 = fmul contract <vscale x 2 x double> %0, %3
@@ -250,14 +250,14 @@ entry:
%7 = fmul contract <vscale x 2 x double> %0, %2
%8 = fmul contract <vscale x 2 x double> %1, %3
%9 = fsub contract <vscale x 2 x double> %7, %8
- %strided.vec80 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %strided.vec80 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
%10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec80, 0
%11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec80, 1
%12 = tail call contract <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> %11)
%13 = fadd contract <vscale x 2 x double> %10, %12
%14 = tail call contract <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> %10)
%15 = fsub contract <vscale x 2 x double> %14, %11
- %strided.vec82 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %strided.vec82 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
%16 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec82, 0
%17 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec82, 1
%18 = fmul contract <vscale x 2 x double> %15, %17
@@ -268,10 +268,10 @@ entry:
%23 = fsub contract <vscale x 2 x double> %21, %22
%24 = fadd contract <vscale x 2 x double> %9, %23
%25 = fadd contract <vscale x 2 x double> %6, %20
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %24, <vscale x 2 x double> %25)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %24, <vscale x 2 x double> %25)
ret <vscale x 4 x double> %interleaved.vec
}
-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
declare <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
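Read as complex arithmetic, each deinterleave2 in this file splits a <vscale x 4 x double> argument into its real and imaginary lanes, and the final interleave2 re-packs the result; `mull_add`, for example, computes c + a*b per complex element. Restating the lane math visible in the IR (%12 and %13 in `mull_add`):

  re(out) = re(a)*re(b) - im(a)*im(b) + re(c)
  im(out) = re(a)*im(b) + im(a)*re(b) + im(c)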
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
index edf580e334e8..c643ae9265c0 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
@@ -16,24 +16,24 @@ define <vscale x 4 x double> @mull_add(<vscale x 4 x double> %a, <vscale x 4 x d
; CHECK-NEXT: mov z1.d, z5.d
; CHECK-NEXT: ret
entry:
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec29 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec29 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec29, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec29, 1
%4 = fmul fast <vscale x 2 x double> %3, %0
%5 = fmul fast <vscale x 2 x double> %2, %1
%6 = fadd fast <vscale x 2 x double> %4, %5
%7 = fmul fast <vscale x 2 x double> %2, %0
- %strided.vec31 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %strided.vec31 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec31, 0
%9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec31, 1
%10 = fadd fast <vscale x 2 x double> %8, %7
%11 = fmul fast <vscale x 2 x double> %3, %1
%12 = fsub fast <vscale x 2 x double> %10, %11
%13 = fadd fast <vscale x 2 x double> %6, %9
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %12, <vscale x 2 x double> %13)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %12, <vscale x 2 x double> %13)
ret <vscale x 4 x double> %interleaved.vec
}
@@ -56,20 +56,20 @@ define <vscale x 4 x double> @mul_add_mull(<vscale x 4 x double> %a, <vscale x 4
; CHECK-NEXT: mov z0.d, z25.d
; CHECK-NEXT: ret
entry:
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 1
%4 = fmul fast <vscale x 2 x double> %3, %0
%5 = fmul fast <vscale x 2 x double> %2, %1
%6 = fmul fast <vscale x 2 x double> %2, %0
%7 = fmul fast <vscale x 2 x double> %3, %1
- %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 0
%9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 1
- %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
%10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 0
%11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 1
%12 = fmul fast <vscale x 2 x double> %11, %8
@@ -82,7 +82,7 @@ entry:
%19 = fadd fast <vscale x 2 x double> %4, %5
%20 = fadd fast <vscale x 2 x double> %19, %13
%21 = fadd fast <vscale x 2 x double> %20, %12
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %18, <vscale x 2 x double> %21)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %18, <vscale x 2 x double> %21)
ret <vscale x 4 x double> %interleaved.vec
}
@@ -105,20 +105,20 @@ define <vscale x 4 x double> @mul_sub_mull(<vscale x 4 x double> %a, <vscale x 4
; CHECK-NEXT: mov z0.d, z25.d
; CHECK-NEXT: ret
entry:
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 1
%4 = fmul fast <vscale x 2 x double> %3, %0
%5 = fmul fast <vscale x 2 x double> %2, %1
%6 = fmul fast <vscale x 2 x double> %2, %0
%7 = fmul fast <vscale x 2 x double> %3, %1
- %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 0
%9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 1
- %strided.vec58 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %strided.vec58 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
%10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec58, 0
%11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec58, 1
%12 = fmul fast <vscale x 2 x double> %11, %9
@@ -131,7 +131,7 @@ entry:
%19 = fadd fast <vscale x 2 x double> %18, %17
%20 = fadd fast <vscale x 2 x double> %4, %5
%21 = fsub fast <vscale x 2 x double> %20, %19
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %16, <vscale x 2 x double> %21)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %16, <vscale x 2 x double> %21)
ret <vscale x 4 x double> %interleaved.vec
}
@@ -154,19 +154,19 @@ define <vscale x 4 x double> @mul_conj_mull(<vscale x 4 x double> %a, <vscale x
; CHECK-NEXT: mov z0.d, z25.d
; CHECK-NEXT: ret
entry:
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec60 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec60 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec60, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec60, 1
%4 = fmul fast <vscale x 2 x double> %3, %0
%5 = fmul fast <vscale x 2 x double> %2, %1
%6 = fmul fast <vscale x 2 x double> %2, %0
- %strided.vec62 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %strided.vec62 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
%7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec62, 0
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec62, 1
- %strided.vec64 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %strided.vec64 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
%9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec64, 0
%10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec64, 1
%11 = fmul fast <vscale x 2 x double> %10, %7
@@ -180,7 +180,7 @@ entry:
%19 = fmul fast <vscale x 2 x double> %9, %8
%20 = fsub fast <vscale x 2 x double> %18, %19
%21 = fadd fast <vscale x 2 x double> %20, %11
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %17, <vscale x 2 x double> %21)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %17, <vscale x 2 x double> %21)
ret <vscale x 4 x double> %interleaved.vec
}
@@ -211,20 +211,20 @@ define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, <vscale
; CHECK-NEXT: zip2 z1.d, z2.d, z1.d
; CHECK-NEXT: ret
entry:
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec80 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec80 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec80, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec80, 1
%4 = fmul fast <vscale x 2 x double> %3, %0
%5 = fmul fast <vscale x 2 x double> %2, %1
%6 = fmul fast <vscale x 2 x double> %2, %0
%7 = fmul fast <vscale x 2 x double> %3, %1
- %strided.vec82 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %strided.vec82 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec82, 0
%9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec82, 1
- %strided.vec84 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %strided.vec84 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
%10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec84, 0
%11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec84, 1
%12 = fmul fast <vscale x 2 x double> %10, %8
@@ -237,9 +237,9 @@ entry:
%19 = fadd fast <vscale x 2 x double> %18, %12
%20 = fmul fast <vscale x 2 x double> %11, %9
%21 = fsub fast <vscale x 2 x double> %19, %20
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %17, <vscale x 2 x double> %21)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %17, <vscale x 2 x double> %21)
ret <vscale x 4 x double> %interleaved.vec
}
-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
index 48b5756b01fb..dae8d9f89e99 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
@@ -24,15 +24,15 @@ define <vscale x 4 x half> @complex_add_v4f16(<vscale x 4 x half> %a, <vscale x
; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %a)
%a.real = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %b)
%b.real = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %b.deinterleaved, 1
%0 = fsub fast <vscale x 2 x half> %b.real, %a.imag
%1 = fadd fast <vscale x 2 x half> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1)
+ %interleaved.vec = tail call <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1)
ret <vscale x 4 x half> %interleaved.vec
}
@@ -45,15 +45,15 @@ define <vscale x 8 x half> @complex_add_v8f16(<vscale x 8 x half> %a, <vscale x
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %a)
%a.real = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %b)
%b.real = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %b.deinterleaved, 1
%0 = fsub fast <vscale x 4 x half> %b.real, %a.imag
%1 = fadd fast <vscale x 4 x half> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1)
+ %interleaved.vec = tail call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1)
ret <vscale x 8 x half> %interleaved.vec
}
@@ -68,15 +68,15 @@ define <vscale x 16 x half> @complex_add_v16f16(<vscale x 16 x half> %a, <vscale
; CHECK-NEXT: mov z1.d, z3.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %a)
+ %a.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %a)
%a.real = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %b)
+ %b.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %b)
%b.real = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %b.deinterleaved, 1
%0 = fsub fast <vscale x 8 x half> %b.real, %a.imag
%1 = fadd fast <vscale x 8 x half> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1)
+ %interleaved.vec = tail call <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1)
ret <vscale x 16 x half> %interleaved.vec
}
@@ -95,26 +95,26 @@ define <vscale x 32 x half> @complex_add_v32f16(<vscale x 32 x half> %a, <vscale
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %a)
+ %a.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %a)
%a.real = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %b)
+ %b.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %b)
%b.real = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %b.deinterleaved, 1
%0 = fsub fast <vscale x 16 x half> %b.real, %a.imag
%1 = fadd fast <vscale x 16 x half> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 32 x half> @llvm.experimental.vector.interleave2.nxv32f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1)
+ %interleaved.vec = tail call <vscale x 32 x half> @llvm.vector.interleave2.nxv32f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1)
ret <vscale x 32 x half> %interleaved.vec
}
-declare { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
-declare <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
+declare { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
+declare <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
-declare { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
+declare { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
+declare <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
-declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
-declare <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
+declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
+declare <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half>)
-declare <vscale x 32 x half> @llvm.experimental.vector.interleave2.nxv32f16(<vscale x 16 x half>, <vscale x 16 x half>)
+declare { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.vector.deinterleave2.nxv32f16(<vscale x 32 x half>)
+declare <vscale x 32 x half> @llvm.vector.interleave2.nxv32f16(<vscale x 16 x half>, <vscale x 16 x half>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
index 7cdb10e7159f..a5c64c0982d0 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
@@ -127,15 +127,15 @@ define <4 x half> @complex_add_v4f16_with_intrinsic(<4 x half> %a, <4 x half> %b
; CHECK-NEXT: fcadd v0.4h, v1.4h, v0.4h, #90
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <2 x half>, <2 x half> } @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %a)
+ %a.deinterleaved = tail call { <2 x half>, <2 x half> } @llvm.vector.deinterleave2.v4f16(<4 x half> %a)
%a.real = extractvalue { <2 x half>, <2 x half> } %a.deinterleaved, 0
%a.imag = extractvalue { <2 x half>, <2 x half> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <2 x half>, <2 x half> } @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %b)
+ %b.deinterleaved = tail call { <2 x half>, <2 x half> } @llvm.vector.deinterleave2.v4f16(<4 x half> %b)
%b.real = extractvalue { <2 x half>, <2 x half> } %b.deinterleaved, 0
%b.imag = extractvalue { <2 x half>, <2 x half> } %b.deinterleaved, 1
%0 = fsub fast <2 x half> %b.real, %a.imag
%1 = fadd fast <2 x half> %b.imag, %a.real
- %interleaved.vec = tail call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %0, <2 x half> %1)
+ %interleaved.vec = tail call <4 x half> @llvm.vector.interleave2.v4f16(<2 x half> %0, <2 x half> %1)
ret <4 x half> %interleaved.vec
}
@@ -146,15 +146,15 @@ define <8 x half> @complex_add_v8f16_with_intrinsic(<8 x half> %a, <8 x half> %b
; CHECK-NEXT: fcadd v0.8h, v1.8h, v0.8h, #90
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <4 x half>, <4 x half> } @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %a)
+ %a.deinterleaved = tail call { <4 x half>, <4 x half> } @llvm.vector.deinterleave2.v8f16(<8 x half> %a)
%a.real = extractvalue { <4 x half>, <4 x half> } %a.deinterleaved, 0
%a.imag = extractvalue { <4 x half>, <4 x half> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <4 x half>, <4 x half> } @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %b)
+ %b.deinterleaved = tail call { <4 x half>, <4 x half> } @llvm.vector.deinterleave2.v8f16(<8 x half> %b)
%b.real = extractvalue { <4 x half>, <4 x half> } %b.deinterleaved, 0
%b.imag = extractvalue { <4 x half>, <4 x half> } %b.deinterleaved, 1
%0 = fsub fast <4 x half> %b.real, %a.imag
%1 = fadd fast <4 x half> %b.imag, %a.real
- %interleaved.vec = tail call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %0, <4 x half> %1)
+ %interleaved.vec = tail call <8 x half> @llvm.vector.interleave2.v8f16(<4 x half> %0, <4 x half> %1)
ret <8 x half> %interleaved.vec
}
@@ -166,15 +166,15 @@ define <16 x half> @complex_add_v16f16_with_intrinsic(<16 x half> %a, <16 x half
; CHECK-NEXT: fcadd v0.8h, v2.8h, v0.8h, #90
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <8 x half>, <8 x half> } @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %a)
+ %a.deinterleaved = tail call { <8 x half>, <8 x half> } @llvm.vector.deinterleave2.v16f16(<16 x half> %a)
%a.real = extractvalue { <8 x half>, <8 x half> } %a.deinterleaved, 0
%a.imag = extractvalue { <8 x half>, <8 x half> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <8 x half>, <8 x half> } @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %b)
+ %b.deinterleaved = tail call { <8 x half>, <8 x half> } @llvm.vector.deinterleave2.v16f16(<16 x half> %b)
%b.real = extractvalue { <8 x half>, <8 x half> } %b.deinterleaved, 0
%b.imag = extractvalue { <8 x half>, <8 x half> } %b.deinterleaved, 1
%0 = fsub fast <8 x half> %b.real, %a.imag
%1 = fadd fast <8 x half> %b.imag, %a.real
- %interleaved.vec = tail call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %0, <8 x half> %1)
+ %interleaved.vec = tail call <16 x half> @llvm.vector.interleave2.v16f16(<8 x half> %0, <8 x half> %1)
ret <16 x half> %interleaved.vec
}
@@ -216,11 +216,11 @@ entry:
}
-declare { <2 x half>, <2 x half> } @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>)
-declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>)
+declare { <2 x half>, <2 x half> } @llvm.vector.deinterleave2.v4f16(<4 x half>)
+declare <4 x half> @llvm.vector.interleave2.v4f16(<2 x half>, <2 x half>)
-declare { <4 x half>, <4 x half> } @llvm.experimental.vector.deinterleave2.v8f16(<8 x half>)
-declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>)
+declare { <4 x half>, <4 x half> } @llvm.vector.deinterleave2.v8f16(<8 x half>)
+declare <8 x half> @llvm.vector.interleave2.v8f16(<4 x half>, <4 x half>)
-declare { <8 x half>, <8 x half> } @llvm.experimental.vector.deinterleave2.v16f16(<16 x half>)
-declare <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>)
+declare { <8 x half>, <8 x half> } @llvm.vector.deinterleave2.v16f16(<16 x half>)
+declare <16 x half> @llvm.vector.interleave2.v16f16(<8 x half>, <8 x half>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
index cb285c05b2e8..c09ec616b015 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
@@ -27,10 +27,10 @@ define <vscale x 4 x half> @complex_mul_v4f16(<vscale x 4 x half> %a, <vscale x
; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %a)
%a.real = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %b)
%b.real = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %b.deinterleaved, 1
%0 = fmul fast <vscale x 2 x half> %b.imag, %a.real
@@ -39,7 +39,7 @@ entry:
%3 = fmul fast <vscale x 2 x half> %b.real, %a.real
%4 = fmul fast <vscale x 2 x half> %a.imag, %b.imag
%5 = fsub fast <vscale x 2 x half> %3, %4
- %interleaved.vec = tail call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %5, <vscale x 2 x half> %2)
+ %interleaved.vec = tail call <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half> %5, <vscale x 2 x half> %2)
ret <vscale x 4 x half> %interleaved.vec
}
@@ -54,10 +54,10 @@ define <vscale x 8 x half> @complex_mul_v8f16(<vscale x 8 x half> %a, <vscale x
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %a)
%a.real = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %b)
%b.real = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %b.deinterleaved, 1
%0 = fmul fast <vscale x 4 x half> %b.imag, %a.real
@@ -66,7 +66,7 @@ entry:
%3 = fmul fast <vscale x 4 x half> %b.real, %a.real
%4 = fmul fast <vscale x 4 x half> %a.imag, %b.imag
%5 = fsub fast <vscale x 4 x half> %3, %4
- %interleaved.vec = tail call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %5, <vscale x 4 x half> %2)
+ %interleaved.vec = tail call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %5, <vscale x 4 x half> %2)
ret <vscale x 8 x half> %interleaved.vec
}
; Expected to transform
@@ -84,10 +84,10 @@ define <vscale x 16 x half> @complex_mul_v16f16(<vscale x 16 x half> %a, <vscale
; CHECK-NEXT: mov z0.d, z5.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %a)
+ %a.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %a)
%a.real = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %b)
+ %b.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %b)
%b.real = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %b.deinterleaved, 1
%0 = fmul fast <vscale x 8 x half> %b.imag, %a.real
@@ -96,7 +96,7 @@ entry:
%3 = fmul fast <vscale x 8 x half> %b.real, %a.real
%4 = fmul fast <vscale x 8 x half> %a.imag, %b.imag
%5 = fsub fast <vscale x 8 x half> %3, %4
- %interleaved.vec = tail call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %5, <vscale x 8 x half> %2)
+ %interleaved.vec = tail call <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half> %5, <vscale x 8 x half> %2)
ret <vscale x 16 x half> %interleaved.vec
}
@@ -123,10 +123,10 @@ define <vscale x 32 x half> @complex_mul_v32f16(<vscale x 32 x half> %a, <vscale
; CHECK-NEXT: mov z2.d, z27.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %a)
+ %a.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %a)
%a.real = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %b)
+ %b.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %b)
%b.real = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %b.deinterleaved, 1
%0 = fmul fast <vscale x 16 x half> %b.imag, %a.real
@@ -135,20 +135,20 @@ entry:
%3 = fmul fast <vscale x 16 x half> %b.real, %a.real
%4 = fmul fast <vscale x 16 x half> %a.imag, %b.imag
%5 = fsub fast <vscale x 16 x half> %3, %4
- %interleaved.vec = tail call <vscale x 32 x half> @llvm.experimental.vector.interleave2.nxv32f16(<vscale x 16 x half> %5, <vscale x 16 x half> %2)
+ %interleaved.vec = tail call <vscale x 32 x half> @llvm.vector.interleave2.nxv32f16(<vscale x 16 x half> %5, <vscale x 16 x half> %2)
ret <vscale x 32 x half> %interleaved.vec
}
-declare { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
-declare <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
+declare { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
+declare <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
-declare { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
+declare { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
+declare <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
-declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
-declare <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
+declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
+declare <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half>)
-declare <vscale x 32 x half> @llvm.experimental.vector.interleave2.nxv32f16(<vscale x 16 x half>, <vscale x 16 x half>)
+declare { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.vector.deinterleave2.nxv32f16(<vscale x 32 x half>)
+declare <vscale x 32 x half> @llvm.vector.interleave2.nxv32f16(<vscale x 16 x half>, <vscale x 16 x half>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll
index ab764a58a770..47ad9ea2451a 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll
@@ -12,15 +12,15 @@ define <vscale x 4 x float> @complex_add_v4f32(<vscale x 4 x float> %a, <vscale
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %a)
%a.real = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %b)
%b.real = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %b.deinterleaved, 1
%0 = fsub fast <vscale x 2 x float> %b.real, %a.imag
%1 = fadd fast <vscale x 2 x float> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1)
+ %interleaved.vec = tail call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1)
ret <vscale x 4 x float> %interleaved.vec
}
@@ -35,15 +35,15 @@ define <vscale x 8 x float> @complex_add_v8f32(<vscale x 8 x float> %a, <vscale
; CHECK-NEXT: mov z1.d, z3.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %a)
%a.real = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %b)
%b.real = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %b.deinterleaved, 1
%0 = fsub fast <vscale x 4 x float> %b.real, %a.imag
%1 = fadd fast <vscale x 4 x float> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1)
+ %interleaved.vec = tail call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1)
ret <vscale x 8 x float> %interleaved.vec
}
; Expected to transform
@@ -61,23 +61,23 @@ define <vscale x 16 x float> @complex_add_v16f32(<vscale x 16 x float> %a, <vsca
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %a)
+ %a.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %a)
%a.real = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %b)
+ %b.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %b)
%b.real = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %b.deinterleaved, 1
%0 = fsub fast <vscale x 8 x float> %b.real, %a.imag
%1 = fadd fast <vscale x 8 x float> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 16 x float> @llvm.experimental.vector.interleave2.nxv16f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1)
+ %interleaved.vec = tail call <vscale x 16 x float> @llvm.vector.interleave2.nxv16f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1)
ret <vscale x 16 x float> %interleaved.vec
}
-declare { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
-declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
+declare { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
-declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float>)
-declare <vscale x 16 x float> @llvm.experimental.vector.interleave2.nxv16f32(<vscale x 8 x float>, <vscale x 8 x float>)
+declare { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.vector.deinterleave2.nxv16f32(<vscale x 16 x float>)
+declare <vscale x 16 x float> @llvm.vector.interleave2.nxv16f32(<vscale x 8 x float>, <vscale x 8 x float>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll
index 1e2afb78de1b..bcd46aa182b5 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll
@@ -14,10 +14,10 @@ define <vscale x 4 x float> @complex_mul_v4f32(<vscale x 4 x float> %a, <vscale
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %a)
%a.real = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %b)
%b.real = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %b.deinterleaved, 1
%0 = fmul fast <vscale x 2 x float> %b.imag, %a.real
@@ -26,7 +26,7 @@ entry:
%3 = fmul fast <vscale x 2 x float> %b.real, %a.real
%4 = fmul fast <vscale x 2 x float> %a.imag, %b.imag
%5 = fsub fast <vscale x 2 x float> %3, %4
- %interleaved.vec = tail call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %5, <vscale x 2 x float> %2)
+ %interleaved.vec = tail call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %5, <vscale x 2 x float> %2)
ret <vscale x 4 x float> %interleaved.vec
}
@@ -45,10 +45,10 @@ define <vscale x 8 x float> @complex_mul_v8f32(<vscale x 8 x float> %a, <vscale
; CHECK-NEXT: mov z0.d, z5.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %a)
%a.real = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %b)
%b.real = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %b.deinterleaved, 1
%0 = fmul fast <vscale x 4 x float> %b.imag, %a.real
@@ -57,7 +57,7 @@ entry:
%3 = fmul fast <vscale x 4 x float> %b.real, %a.real
%4 = fmul fast <vscale x 4 x float> %a.imag, %b.imag
%5 = fsub fast <vscale x 4 x float> %3, %4
- %interleaved.vec = tail call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %5, <vscale x 4 x float> %2)
+ %interleaved.vec = tail call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %5, <vscale x 4 x float> %2)
ret <vscale x 8 x float> %interleaved.vec
}
@@ -84,10 +84,10 @@ define <vscale x 16 x float> @complex_mul_v16f32(<vscale x 16 x float> %a, <vsca
; CHECK-NEXT: mov z2.d, z27.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %a)
+ %a.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %a)
%a.real = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %b)
+ %b.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %b)
%b.real = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %b.deinterleaved, 1
%0 = fmul fast <vscale x 8 x float> %b.imag, %a.real
@@ -96,16 +96,16 @@ entry:
%3 = fmul fast <vscale x 8 x float> %b.real, %a.real
%4 = fmul fast <vscale x 8 x float> %a.imag, %b.imag
%5 = fsub fast <vscale x 8 x float> %3, %4
- %interleaved.vec = tail call <vscale x 16 x float> @llvm.experimental.vector.interleave2.nxv16f32(<vscale x 8 x float> %5, <vscale x 8 x float> %2)
+ %interleaved.vec = tail call <vscale x 16 x float> @llvm.vector.interleave2.nxv16f32(<vscale x 8 x float> %5, <vscale x 8 x float> %2)
ret <vscale x 16 x float> %interleaved.vec
}
-declare { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
-declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
+declare { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
-declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float>)
-declare <vscale x 16 x float> @llvm.experimental.vector.interleave2.nxv16f32(<vscale x 8 x float>, <vscale x 8 x float>)
+declare { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.vector.deinterleave2.nxv16f32(<vscale x 16 x float>)
+declare <vscale x 16 x float> @llvm.vector.interleave2.nxv16f32(<vscale x 8 x float>, <vscale x 8 x float>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll
index 46a15f489d2b..c992d63ca283 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll
@@ -12,15 +12,15 @@ define <vscale x 2 x double> @complex_add_v2f64(<vscale x 2 x double> %a, <vscal
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %a)
+ %a.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %a)
%a.real = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %b)
+ %b.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %b)
%b.real = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %b.deinterleaved, 1
%0 = fsub fast <vscale x 1 x double> %b.real, %a.imag
%1 = fadd fast <vscale x 1 x double> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 2 x double> @llvm.experimental.vector.interleave2.nxv2f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1)
+ %interleaved.vec = tail call <vscale x 2 x double> @llvm.vector.interleave2.nxv2f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1)
ret <vscale x 2 x double> %interleaved.vec
}
@@ -35,15 +35,15 @@ define <vscale x 4 x double> @complex_add_v4f64(<vscale x 4 x double> %a, <vscal
; CHECK-NEXT: mov z1.d, z3.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%a.real = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%b.real = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %b.deinterleaved, 1
%0 = fsub fast <vscale x 2 x double> %b.real, %a.imag
%1 = fadd fast <vscale x 2 x double> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1)
ret <vscale x 4 x double> %interleaved.vec
}
@@ -62,23 +62,23 @@ define <vscale x 8 x double> @complex_add_v8f64(<vscale x 8 x double> %a, <vscal
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %a)
%a.real = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %b)
%b.real = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %b.deinterleaved, 1
%0 = fsub fast <vscale x 4 x double> %b.real, %a.imag
%1 = fadd fast <vscale x 4 x double> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1)
+ %interleaved.vec = tail call <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1)
ret <vscale x 8 x double> %interleaved.vec
}
-declare { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double>)
-declare <vscale x 2 x double> @llvm.experimental.vector.interleave2.nxv2f64(<vscale x 1 x double>, <vscale x 1 x double>)
+declare { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.vector.deinterleave2.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.vector.interleave2.nxv2f64(<vscale x 1 x double>, <vscale x 1 x double>)
-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
-declare { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double>)
-declare <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
+declare { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double>)
+declare <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll
index 17a239a09a03..db28fa3997cb 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll
@@ -14,10 +14,10 @@ define <vscale x 2 x double> @complex_mul_v2f64(<vscale x 2 x double> %a, <vscal
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %a)
+ %a.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %a)
%a.real = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %b)
+ %b.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %b)
%b.real = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %b.deinterleaved, 1
%0 = fmul fast <vscale x 1 x double> %b.imag, %a.real
@@ -26,7 +26,7 @@ entry:
%3 = fmul fast <vscale x 1 x double> %b.real, %a.real
%4 = fmul fast <vscale x 1 x double> %a.imag, %b.imag
%5 = fsub fast <vscale x 1 x double> %3, %4
- %interleaved.vec = tail call <vscale x 2 x double> @llvm.experimental.vector.interleave2.nxv2f64(<vscale x 1 x double> %5, <vscale x 1 x double> %2)
+ %interleaved.vec = tail call <vscale x 2 x double> @llvm.vector.interleave2.nxv2f64(<vscale x 1 x double> %5, <vscale x 1 x double> %2)
ret <vscale x 2 x double> %interleaved.vec
}
@@ -45,10 +45,10 @@ define <vscale x 4 x double> @complex_mul_v4f64(<vscale x 4 x double> %a, <vscal
; CHECK-NEXT: mov z0.d, z5.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%a.real = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%b.real = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %b.deinterleaved, 1
%0 = fmul fast <vscale x 2 x double> %b.imag, %a.real
@@ -57,7 +57,7 @@ entry:
%3 = fmul fast <vscale x 2 x double> %b.real, %a.real
%4 = fmul fast <vscale x 2 x double> %a.imag, %b.imag
%5 = fsub fast <vscale x 2 x double> %3, %4
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %5, <vscale x 2 x double> %2)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %5, <vscale x 2 x double> %2)
ret <vscale x 4 x double> %interleaved.vec
}
@@ -84,10 +84,10 @@ define <vscale x 8 x double> @complex_mul_v8f64(<vscale x 8 x double> %a, <vscal
; CHECK-NEXT: mov z2.d, z27.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %a)
%a.real = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %b)
%b.real = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %b.deinterleaved, 1
%0 = fmul fast <vscale x 4 x double> %b.imag, %a.real
@@ -96,15 +96,15 @@ entry:
%3 = fmul fast <vscale x 4 x double> %b.real, %a.real
%4 = fmul fast <vscale x 4 x double> %a.imag, %b.imag
%5 = fsub fast <vscale x 4 x double> %3, %4
- %interleaved.vec = tail call <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double> %5, <vscale x 4 x double> %2)
+ %interleaved.vec = tail call <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double> %5, <vscale x 4 x double> %2)
ret <vscale x 8 x double> %interleaved.vec
}
-declare { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double>)
-declare <vscale x 2 x double> @llvm.experimental.vector.interleave2.nxv2f64(<vscale x 1 x double>, <vscale x 1 x double>)
+declare { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.vector.deinterleave2.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.vector.interleave2.nxv2f64(<vscale x 1 x double>, <vscale x 1 x double>)
-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
-declare { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double>)
-declare <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
+declare { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double>)
+declare <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll
index 001046f8f397..f0569674c651 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll
@@ -22,15 +22,15 @@ define <vscale x 4 x i16> @complex_add_v4i16(<vscale x 4 x i16> %a, <vscale x 4
; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %a)
%a.real = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %b)
%b.real = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %b.deinterleaved, 1
%0 = sub <vscale x 2 x i16> %b.real, %a.imag
%1 = add <vscale x 2 x i16> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 4 x i16> @llvm.experimental.vector.interleave2.nxv4i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1)
+ %interleaved.vec = tail call <vscale x 4 x i16> @llvm.vector.interleave2.nxv4i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1)
ret <vscale x 4 x i16> %interleaved.vec
}
@@ -42,15 +42,15 @@ define <vscale x 8 x i16> @complex_add_v8i16(<vscale x 8 x i16> %a, <vscale x 8
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %a)
%a.real = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %b)
%b.real = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %b.deinterleaved, 1
%0 = sub <vscale x 4 x i16> %b.real, %a.imag
%1 = add <vscale x 4 x i16> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 8 x i16> @llvm.experimental.vector.interleave2.nxv8i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1)
+ %interleaved.vec = tail call <vscale x 8 x i16> @llvm.vector.interleave2.nxv8i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1)
ret <vscale x 8 x i16> %interleaved.vec
}
@@ -64,15 +64,15 @@ define <vscale x 16 x i16> @complex_add_v16i16(<vscale x 16 x i16> %a, <vscale x
; CHECK-NEXT: mov z1.d, z3.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %a)
+ %a.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %a)
%a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %b)
+ %b.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %b)
%b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
%0 = sub <vscale x 8 x i16> %b.real, %a.imag
%1 = add <vscale x 8 x i16> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1)
+ %interleaved.vec = tail call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1)
ret <vscale x 16 x i16> %interleaved.vec
}
@@ -90,26 +90,26 @@ define <vscale x 32 x i16> @complex_add_v32i16(<vscale x 32 x i16> %a, <vscale x
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %a)
+ %a.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %a)
%a.real = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %b)
+ %b.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %b)
%b.real = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %b.deinterleaved, 1
%0 = sub <vscale x 16 x i16> %b.real, %a.imag
%1 = add <vscale x 16 x i16> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 32 x i16> @llvm.experimental.vector.interleave2.nxv32i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1)
+ %interleaved.vec = tail call <vscale x 32 x i16> @llvm.vector.interleave2.nxv32i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1)
ret <vscale x 32 x i16> %interleaved.vec
}
-declare { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16>)
-declare <vscale x 4 x i16> @llvm.experimental.vector.interleave2.nxv4i16(<vscale x 2 x i16>, <vscale x 2 x i16>)
+declare { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.vector.deinterleave2.nxv4i16(<vscale x 4 x i16>)
+declare <vscale x 4 x i16> @llvm.vector.interleave2.nxv4i16(<vscale x 2 x i16>, <vscale x 2 x i16>)
-declare { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
-declare <vscale x 8 x i16> @llvm.experimental.vector.interleave2.nxv8i16(<vscale x 4 x i16>, <vscale x 4 x i16>)
+declare { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
+declare <vscale x 8 x i16> @llvm.vector.interleave2.nxv8i16(<vscale x 4 x i16>, <vscale x 4 x i16>)
-declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
-declare { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16>)
-declare <vscale x 32 x i16> @llvm.experimental.vector.interleave2.nxv32i16(<vscale x 16 x i16>, <vscale x 16 x i16>)
+declare { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.vector.deinterleave2.nxv32i16(<vscale x 32 x i16>)
+declare <vscale x 32 x i16> @llvm.vector.interleave2.nxv32i16(<vscale x 16 x i16>, <vscale x 16 x i16>)
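; Note on the intrinsics renamed throughout this diff: llvm.vector.deinterleave2
; splits a vector of 2N lanes into its even-indexed and odd-indexed halves, and
; llvm.vector.interleave2 is the exact inverse. The complex_add_* tests above
; use the two halves as real/imaginary planes and compute
;   real = b.real - a.imag,  imag = b.imag + a.real
; i.e. %b plus %a rotated by 90 degrees, the shape the complex-deinterleaving
; pass is expected to match.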
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll
index 07488b623b98..b4cb548f6308 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll
@@ -26,10 +26,10 @@ define <vscale x 4 x i16> @complex_mul_v4i16(<vscale x 4 x i16> %a, <vscale x 4
; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %a)
%a.real = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %b)
%b.real = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %b.deinterleaved, 1
%0 = mul <vscale x 2 x i16> %b.imag, %a.real
@@ -38,7 +38,7 @@ entry:
%3 = mul <vscale x 2 x i16> %b.real, %a.real
%4 = mul <vscale x 2 x i16> %a.imag, %b.imag
%5 = sub <vscale x 2 x i16> %3, %4
- %interleaved.vec = tail call <vscale x 4 x i16> @llvm.experimental.vector.interleave2.nxv4i16(<vscale x 2 x i16> %5, <vscale x 2 x i16> %2)
+ %interleaved.vec = tail call <vscale x 4 x i16> @llvm.vector.interleave2.nxv4i16(<vscale x 2 x i16> %5, <vscale x 2 x i16> %2)
ret <vscale x 4 x i16> %interleaved.vec
}
@@ -52,10 +52,10 @@ define <vscale x 8 x i16> @complex_mul_v8i16(<vscale x 8 x i16> %a, <vscale x 8
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %a)
%a.real = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %b)
%b.real = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %b.deinterleaved, 1
%0 = mul <vscale x 4 x i16> %b.imag, %a.real
@@ -64,7 +64,7 @@ entry:
%3 = mul <vscale x 4 x i16> %b.real, %a.real
%4 = mul <vscale x 4 x i16> %a.imag, %b.imag
%5 = sub <vscale x 4 x i16> %3, %4
- %interleaved.vec = tail call <vscale x 8 x i16> @llvm.experimental.vector.interleave2.nxv8i16(<vscale x 4 x i16> %5, <vscale x 4 x i16> %2)
+ %interleaved.vec = tail call <vscale x 8 x i16> @llvm.vector.interleave2.nxv8i16(<vscale x 4 x i16> %5, <vscale x 4 x i16> %2)
ret <vscale x 8 x i16> %interleaved.vec
}
; Expected to transform
@@ -81,10 +81,10 @@ define <vscale x 16 x i16> @complex_mul_v16i16(<vscale x 16 x i16> %a, <vscale x
; CHECK-NEXT: mov z0.d, z5.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %a)
+ %a.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %a)
%a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %b)
+ %b.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %b)
%b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
%0 = mul <vscale x 8 x i16> %b.imag, %a.real
@@ -93,7 +93,7 @@ entry:
%3 = mul <vscale x 8 x i16> %b.real, %a.real
%4 = mul <vscale x 8 x i16> %a.imag, %b.imag
%5 = sub <vscale x 8 x i16> %3, %4
- %interleaved.vec = tail call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %5, <vscale x 8 x i16> %2)
+ %interleaved.vec = tail call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %5, <vscale x 8 x i16> %2)
ret <vscale x 16 x i16> %interleaved.vec
}
@@ -119,10 +119,10 @@ define <vscale x 32 x i16> @complex_mul_v32i16(<vscale x 32 x i16> %a, <vscale x
; CHECK-NEXT: mov z2.d, z27.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %a)
+ %a.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %a)
%a.real = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %b)
+ %b.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %b)
%b.real = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %b.deinterleaved, 1
%0 = mul <vscale x 16 x i16> %b.imag, %a.real
@@ -131,20 +131,20 @@ entry:
%3 = mul <vscale x 16 x i16> %b.real, %a.real
%4 = mul <vscale x 16 x i16> %a.imag, %b.imag
%5 = sub <vscale x 16 x i16> %3, %4
- %interleaved.vec = tail call <vscale x 32 x i16> @llvm.experimental.vector.interleave2.nxv32i16(<vscale x 16 x i16> %5, <vscale x 16 x i16> %2)
+ %interleaved.vec = tail call <vscale x 32 x i16> @llvm.vector.interleave2.nxv32i16(<vscale x 16 x i16> %5, <vscale x 16 x i16> %2)
ret <vscale x 32 x i16> %interleaved.vec
}
-declare { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16>)
-declare <vscale x 4 x i16> @llvm.experimental.vector.interleave2.nxv4i16(<vscale x 2 x i16>, <vscale x 2 x i16>)
+declare { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.vector.deinterleave2.nxv4i16(<vscale x 4 x i16>)
+declare <vscale x 4 x i16> @llvm.vector.interleave2.nxv4i16(<vscale x 2 x i16>, <vscale x 2 x i16>)
-declare { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
-declare <vscale x 8 x i16> @llvm.experimental.vector.interleave2.nxv8i16(<vscale x 4 x i16>, <vscale x 4 x i16>)
+declare { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
+declare <vscale x 8 x i16> @llvm.vector.interleave2.nxv8i16(<vscale x 4 x i16>, <vscale x 4 x i16>)
-declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
-declare { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16>)
-declare <vscale x 32 x i16> @llvm.experimental.vector.interleave2.nxv32i16(<vscale x 16 x i16>, <vscale x 16 x i16>)
+declare { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.vector.deinterleave2.nxv32i16(<vscale x 32 x i16>)
+declare <vscale x 32 x i16> @llvm.vector.interleave2.nxv32i16(<vscale x 16 x i16>, <vscale x 16 x i16>)
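; Note: the complex_mul_* tests compute the standard complex product,
;   real = a.real*b.real - a.imag*b.imag   (%5 above),
;   imag = a.real*b.imag + a.imag*b.real   (%2, formed in context lines that
;   fall between the hunks shown),
; then re-interleave the two halves into the wide result vector.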
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll
index 1ce480bbf3d8..458cd62269f8 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll
@@ -11,15 +11,15 @@ define <vscale x 4 x i32> @complex_add_v4i32(<vscale x 4 x i32> %a, <vscale x 4
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %a)
%a.real = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %b)
%b.real = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %b.deinterleaved, 1
%0 = sub <vscale x 2 x i32> %b.real, %a.imag
%1 = add <vscale x 2 x i32> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1)
+ %interleaved.vec = tail call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1)
ret <vscale x 4 x i32> %interleaved.vec
}
@@ -33,15 +33,15 @@ define <vscale x 8 x i32> @complex_add_v8i32(<vscale x 8 x i32> %a, <vscale x 8
; CHECK-NEXT: mov z1.d, z3.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %a)
%a.real = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %b)
%b.real = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %b.deinterleaved, 1
%0 = sub <vscale x 4 x i32> %b.real, %a.imag
%1 = add <vscale x 4 x i32> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1)
+ %interleaved.vec = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1)
ret <vscale x 8 x i32> %interleaved.vec
}
@@ -59,23 +59,23 @@ define <vscale x 16 x i32> @complex_add_v16i32(<vscale x 16 x i32> %a, <vscale x
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %a)
+ %a.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %a)
%a.real = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %b)
+ %b.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %b)
%b.real = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %b.deinterleaved, 1
%0 = sub <vscale x 8 x i32> %b.real, %a.imag
%1 = add <vscale x 8 x i32> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1)
+ %interleaved.vec = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1)
ret <vscale x 16 x i32> %interleaved.vec
}
-declare { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32>, <vscale x 2 x i32>)
+declare { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32>, <vscale x 2 x i32>)
-declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
-declare { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32>)
-declare <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32>, <vscale x 8 x i32>)
+declare { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32>)
+declare <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32>, <vscale x 8 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll
index d88eef9800d7..4cfe4707b9a9 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll
@@ -13,10 +13,10 @@ define <vscale x 4 x i32> @complex_mul_v4i32(<vscale x 4 x i32> %a, <vscale x 4
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %a)
%a.real = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %b)
%b.real = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %b.deinterleaved, 1
%0 = mul <vscale x 2 x i32> %b.imag, %a.real
@@ -25,7 +25,7 @@ entry:
%3 = mul <vscale x 2 x i32> %b.real, %a.real
%4 = mul <vscale x 2 x i32> %a.imag, %b.imag
%5 = sub <vscale x 2 x i32> %3, %4
- %interleaved.vec = tail call <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32> %5, <vscale x 2 x i32> %2)
+ %interleaved.vec = tail call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %5, <vscale x 2 x i32> %2)
ret <vscale x 4 x i32> %interleaved.vec
}
@@ -43,10 +43,10 @@ define <vscale x 8 x i32> @complex_mul_v8i32(<vscale x 8 x i32> %a, <vscale x 8
; CHECK-NEXT: mov z0.d, z5.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %a)
%a.real = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %b)
%b.real = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %b.deinterleaved, 1
%0 = mul <vscale x 4 x i32> %b.imag, %a.real
@@ -55,7 +55,7 @@ entry:
%3 = mul <vscale x 4 x i32> %b.real, %a.real
%4 = mul <vscale x 4 x i32> %a.imag, %b.imag
%5 = sub <vscale x 4 x i32> %3, %4
- %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %5, <vscale x 4 x i32> %2)
+ %interleaved.vec = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %5, <vscale x 4 x i32> %2)
ret <vscale x 8 x i32> %interleaved.vec
}
@@ -81,10 +81,10 @@ define <vscale x 16 x i32> @complex_mul_v16i32(<vscale x 16 x i32> %a, <vscale x
; CHECK-NEXT: mov z2.d, z27.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %a)
+ %a.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %a)
%a.real = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %b)
+ %b.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %b)
%b.real = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %b.deinterleaved, 1
%0 = mul <vscale x 8 x i32> %b.imag, %a.real
@@ -93,16 +93,16 @@ entry:
%3 = mul <vscale x 8 x i32> %b.real, %a.real
%4 = mul <vscale x 8 x i32> %a.imag, %b.imag
%5 = sub <vscale x 8 x i32> %3, %4
- %interleaved.vec = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %5, <vscale x 8 x i32> %2)
+ %interleaved.vec = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %5, <vscale x 8 x i32> %2)
ret <vscale x 16 x i32> %interleaved.vec
}
-declare { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32>, <vscale x 2 x i32>)
+declare { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32>, <vscale x 2 x i32>)
-declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
-declare { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32>)
-declare <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32>, <vscale x 8 x i32>)
+declare { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32>)
+declare <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32>, <vscale x 8 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll
index 0b59be9414fa..f06b55c68b7e 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll
@@ -11,15 +11,15 @@ define <vscale x 2 x i64> @complex_add_v2i64(<vscale x 2 x i64> %a, <vscale x 2
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %a)
+ %a.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %a)
%a.real = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %b)
+ %b.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %b)
%b.real = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %b.deinterleaved, 1
%0 = sub <vscale x 1 x i64> %b.real, %a.imag
%1 = add <vscale x 1 x i64> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 2 x i64> @llvm.experimental.vector.interleave2.nxv2i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1)
+ %interleaved.vec = tail call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1)
ret <vscale x 2 x i64> %interleaved.vec
}
@@ -33,15 +33,15 @@ define <vscale x 4 x i64> @complex_add_v4i64(<vscale x 4 x i64> %a, <vscale x 4
; CHECK-NEXT: mov z1.d, z3.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %a)
%a.real = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %b)
%b.real = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %b.deinterleaved, 1
%0 = sub <vscale x 2 x i64> %b.real, %a.imag
%1 = add <vscale x 2 x i64> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1)
+ %interleaved.vec = tail call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1)
ret <vscale x 4 x i64> %interleaved.vec
}
@@ -59,23 +59,23 @@ define <vscale x 8 x i64> @complex_add_v8i64(<vscale x 8 x i64> %a, <vscale x 8
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %a)
%a.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %b)
%b.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 1
%0 = sub <vscale x 4 x i64> %b.real, %a.imag
%1 = add <vscale x 4 x i64> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1)
+ %interleaved.vec = tail call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1)
ret <vscale x 8 x i64> %interleaved.vec
}
-declare { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64>)
-declare <vscale x 2 x i64> @llvm.experimental.vector.interleave2.nxv2i64(<vscale x 1 x i64>, <vscale x 1 x i64>)
+declare { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64>, <vscale x 1 x i64>)
-declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+declare <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
-declare { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
-declare <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
+declare { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
+declare <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll
index 16e1f3e63dce..5975f3b491d4 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll
@@ -13,10 +13,10 @@ define <vscale x 2 x i64> @complex_mul_v2i64(<vscale x 2 x i64> %a, <vscale x 2
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %a)
+ %a.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %a)
%a.real = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %b)
+ %b.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %b)
%b.real = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %b.deinterleaved, 1
%0 = mul <vscale x 1 x i64> %b.imag, %a.real
@@ -25,7 +25,7 @@ entry:
%3 = mul <vscale x 1 x i64> %b.real, %a.real
%4 = mul <vscale x 1 x i64> %a.imag, %b.imag
%5 = sub <vscale x 1 x i64> %3, %4
- %interleaved.vec = tail call <vscale x 2 x i64> @llvm.experimental.vector.interleave2.nxv2i64(<vscale x 1 x i64> %5, <vscale x 1 x i64> %2)
+ %interleaved.vec = tail call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> %5, <vscale x 1 x i64> %2)
ret <vscale x 2 x i64> %interleaved.vec
}
@@ -43,10 +43,10 @@ define <vscale x 4 x i64> @complex_mul_v4i64(<vscale x 4 x i64> %a, <vscale x 4
; CHECK-NEXT: mov z0.d, z5.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %a)
%a.real = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %b)
%b.real = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %b.deinterleaved, 1
%0 = mul <vscale x 2 x i64> %b.imag, %a.real
@@ -55,7 +55,7 @@ entry:
%3 = mul <vscale x 2 x i64> %b.real, %a.real
%4 = mul <vscale x 2 x i64> %a.imag, %b.imag
%5 = sub <vscale x 2 x i64> %3, %4
- %interleaved.vec = tail call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %5, <vscale x 2 x i64> %2)
+ %interleaved.vec = tail call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %5, <vscale x 2 x i64> %2)
ret <vscale x 4 x i64> %interleaved.vec
}
@@ -81,10 +81,10 @@ define <vscale x 8 x i64> @complex_mul_v8i64(<vscale x 8 x i64> %a, <vscale x 8
; CHECK-NEXT: mov z2.d, z27.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %a)
%a.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %b)
%b.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 1
%0 = mul <vscale x 4 x i64> %b.imag, %a.real
@@ -93,7 +93,7 @@ entry:
%3 = mul <vscale x 4 x i64> %b.real, %a.real
%4 = mul <vscale x 4 x i64> %a.imag, %b.imag
%5 = sub <vscale x 4 x i64> %3, %4
- %interleaved.vec = tail call <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64> %5, <vscale x 4 x i64> %2)
+ %interleaved.vec = tail call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> %5, <vscale x 4 x i64> %2)
ret <vscale x 8 x i64> %interleaved.vec
}
@@ -119,11 +119,11 @@ define <vscale x 8 x i64> @complex_minus_mul_v8i64(<vscale x 8 x i64> %a, <vscal
; CHECK-NEXT: mov z2.d, z27.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %a)
%a.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 1
%0 = sub <vscale x 4 x i64> zeroinitializer, %a.real
- %b.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %b)
%b.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 1
%1 = mul <vscale x 4 x i64> %b.real, %0
@@ -132,15 +132,15 @@ entry:
%4 = mul <vscale x 4 x i64> %b.real, %a.imag
%5 = mul <vscale x 4 x i64> %b.imag, %0
%6 = sub <vscale x 4 x i64> %5, %4
- %interleaved.vec = tail call <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64> %3, <vscale x 4 x i64> %6)
+ %interleaved.vec = tail call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> %3, <vscale x 4 x i64> %6)
ret <vscale x 8 x i64> %interleaved.vec
}
-declare { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64>)
-declare <vscale x 2 x i64> @llvm.experimental.vector.interleave2.nxv2i64(<vscale x 1 x i64>, <vscale x 1 x i64>)
+declare { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64>, <vscale x 1 x i64>)
-declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+declare <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
-declare { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
-declare <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
+declare { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
+declare <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
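; Note: complex_minus_mul_v8i64 first negates the real plane of %a
; (%0 = sub zeroinitializer, %a.real) and feeds %0 into both partial products,
; so the interleaved result works out to -(a * b); this exercises the pass on
; a negate-then-multiply chain rather than a plain product.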
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll
index b631486137e6..81872c1723f2 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll
@@ -22,15 +22,15 @@ define <vscale x 8 x i8> @complex_add_v8i8(<vscale x 8 x i8> %a, <vscale x 8 x i
; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.experimental.vector.deinterleave2.nxv8i8(<vscale x 8 x i8> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.vector.deinterleave2.nxv8i8(<vscale x 8 x i8> %a)
%a.real = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.experimental.vector.deinterleave2.nxv8i8(<vscale x 8 x i8> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.vector.deinterleave2.nxv8i8(<vscale x 8 x i8> %b)
%b.real = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } %b.deinterleaved, 1
%0 = sub <vscale x 4 x i8> %b.real, %a.imag
%1 = add <vscale x 4 x i8> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 8 x i8> @llvm.experimental.vector.interleave2.nxv8i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1)
+ %interleaved.vec = tail call <vscale x 8 x i8> @llvm.vector.interleave2.nxv8i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1)
ret <vscale x 8 x i8> %interleaved.vec
}
@@ -42,15 +42,15 @@ define <vscale x 16 x i8> @complex_add_v16i8(<vscale x 16 x i8> %a, <vscale x 16
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.experimental.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %a)
+ %a.deinterleaved = tail call { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %a)
%a.real = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.experimental.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %b)
+ %b.deinterleaved = tail call { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %b)
%b.real = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8> } %b.deinterleaved, 1
%0 = sub <vscale x 8 x i8> %b.real, %a.imag
%1 = add <vscale x 8 x i8> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 16 x i8> @llvm.experimental.vector.interleave2.nxv16i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1)
+ %interleaved.vec = tail call <vscale x 16 x i8> @llvm.vector.interleave2.nxv16i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1)
ret <vscale x 16 x i8> %interleaved.vec
}
@@ -64,23 +64,23 @@ define <vscale x 32 x i8> @complex_add_v32i8(<vscale x 32 x i8> %a, <vscale x 32
; CHECK-NEXT: mov z1.d, z3.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %a)
+ %a.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %a)
%a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %b)
+ %b.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %b)
%b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
%0 = sub <vscale x 16 x i8> %b.real, %a.imag
%1 = add <vscale x 16 x i8> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1)
+ %interleaved.vec = tail call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1)
ret <vscale x 32 x i8> %interleaved.vec
}
-declare { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.experimental.vector.deinterleave2.nxv8i8(<vscale x 8 x i8>)
-declare <vscale x 8 x i8> @llvm.experimental.vector.interleave2.nxv8i8(<vscale x 4 x i8>, <vscale x 4 x i8>)
+declare { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.vector.deinterleave2.nxv8i8(<vscale x 8 x i8>)
+declare <vscale x 8 x i8> @llvm.vector.interleave2.nxv8i8(<vscale x 4 x i8>, <vscale x 4 x i8>)
-declare { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.experimental.vector.deinterleave2.nxv16i8(<vscale x 16 x i8>)
-declare <vscale x 16 x i8> @llvm.experimental.vector.interleave2.nxv16i8(<vscale x 8 x i8>, <vscale x 8 x i8>)
+declare { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8>)
+declare <vscale x 16 x i8> @llvm.vector.interleave2.nxv16i8(<vscale x 8 x i8>, <vscale x 8 x i8>)
-declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
-declare <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
+declare <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
index 19318fdeeca7..ac2b21af29ab 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
@@ -69,14 +69,14 @@ vector.body: ; preds = %vector.body, %entry
%vec.phi27 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %16, %vector.body ]
%scevgep = getelementptr i8, ptr %a, i64 %lsr.iv
%scevgep34 = getelementptr i8, ptr %b, i64 %lsr.iv
- %interleaved.mask = tail call <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %active.lane.mask)
+ %interleaved.mask = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %active.lane.mask)
%wide.masked.vec = tail call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %scevgep, i32 8, <vscale x 4 x i1> %interleaved.mask, <vscale x 4 x double> poison)
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec)
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %interleaved.mask28 = tail call <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %active.lane.mask)
+ %interleaved.mask28 = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %active.lane.mask)
%wide.masked.vec29 = tail call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %scevgep34, i32 8, <vscale x 4 x i1> %interleaved.mask28, <vscale x 4 x double> poison)
- %strided.vec30 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec29)
+ %strided.vec30 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec29)
%5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec30, 0
%6 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec30, 1
%7 = fmul fast <vscale x 2 x double> %6, %3
@@ -175,13 +175,13 @@ vector.body: ; preds = %vector.body, %entry
%4 = icmp ne <vscale x 2 x i32> %wide.load, zeroinitializer
%scevgep49 = getelementptr i8, ptr %a, i64 %lsr.iv48
%scevgep50 = getelementptr i8, ptr %b, i64 %lsr.iv48
- %interleaved.mask = tail call <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1> %4, <vscale x 2 x i1> %4)
+ %interleaved.mask = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %4, <vscale x 2 x i1> %4)
%wide.masked.vec = tail call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %scevgep49, i32 8, <vscale x 4 x i1> %interleaved.mask, <vscale x 4 x double> poison)
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec)
%5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%6 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
%wide.masked.vec32 = tail call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %scevgep50, i32 8, <vscale x 4 x i1> %interleaved.mask, <vscale x 4 x double> poison)
- %strided.vec33 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec32)
+ %strided.vec33 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec32)
%7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec33, 0
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec33, 1
%9 = fmul fast <vscale x 2 x double> %8, %5
@@ -279,14 +279,14 @@ vector.body: ; preds = %vector.body, %entry
%scevgep38 = getelementptr i8, ptr %a, i64 %lsr.iv
%scevgep39 = getelementptr i8, ptr %b, i64 %lsr.iv
%5 = select <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %4, <vscale x 2 x i1> zeroinitializer
- %interleaved.mask = tail call <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1> %5, <vscale x 2 x i1> %5)
+ %interleaved.mask = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %5, <vscale x 2 x i1> %5)
%wide.masked.vec = tail call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %scevgep38, i32 8, <vscale x 4 x i1> %interleaved.mask, <vscale x 4 x double> poison)
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec)
%6 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %interleaved.mask31 = tail call <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1> %5, <vscale x 2 x i1> %5)
+ %interleaved.mask31 = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %5, <vscale x 2 x i1> %5)
%wide.masked.vec32 = tail call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %scevgep39, i32 8, <vscale x 4 x i1> %interleaved.mask31, <vscale x 4 x double> poison)
- %strided.vec33 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec32)
+ %strided.vec33 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec32)
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec33, 0
%9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec33, 1
%10 = fmul fast <vscale x 2 x double> %9, %6
@@ -320,6 +320,6 @@ declare i64 @llvm.vscale.i64()
declare <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64, i64)
declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr nocapture, i32 immarg, <vscale x 2 x i1>, <vscale x 2 x i32>)
declare <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr nocapture, i32 immarg, <vscale x 4 x i1>, <vscale x 4 x double>)
-declare <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1>, <vscale x 2 x i1>)
-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1>, <vscale x 2 x i1>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
declare double @llvm.vector.reduce.fadd.nxv2f64(double, <vscale x 2 x double>)
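; Note: in these predicated reductions the <vscale x 2 x i1> loop mask is
; doubled up with llvm.vector.interleave2.nxv4i1 so that one predicate guards
; both the real and imaginary lanes of each masked <vscale x 4 x double> load
; before the loaded vector is deinterleaved back into the two planes.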
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
index 5bef95910d90..af07519ad53d 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
@@ -64,11 +64,11 @@ vector.body: ; preds = %vector.body, %entry
%scevgep46 = getelementptr i8, ptr %a, i64 %lsr.iv27
%scevgep47 = getelementptr i8, ptr %b, i64 %lsr.iv27
%wide.vec = load <vscale x 4 x double>, ptr %scevgep46, align 8
- %3 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
+ %3 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
%4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 0
%5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 1
%wide.vec30 = load <vscale x 4 x double>, ptr %scevgep47, align 8
- %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec30)
+ %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec30)
%7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 0
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 1
%9 = fmul fast <vscale x 2 x double> %8, %4
@@ -156,11 +156,11 @@ vector.body: ; preds = %vector.body, %entry
%scevgep46 = getelementptr i8, ptr %a, i64 %lsr.iv27
%scevgep47 = getelementptr i8, ptr %b, i64 %lsr.iv27
%wide.vec = load <vscale x 4 x double>, ptr %scevgep46, align 8
- %3 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
+ %3 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
%4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 0
%5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 1
%wide.vec30 = load <vscale x 4 x double>, ptr %scevgep47, align 8
- %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec30)
+ %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec30)
%7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 0
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 1
%9 = fmul fast <vscale x 2 x double> %8, %4
@@ -266,16 +266,16 @@ vector.body: ; preds = %vector.body, %entry
%scevgep62 = getelementptr i8, ptr %scevgep61, i64 %lsr.iv34
%wide.vec = load <vscale x 4 x double>, ptr %scevgep57, align 8
%wide.vec32 = load <vscale x 4 x double>, ptr %scevgep64, align 8
- %4 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
- %5 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec32)
+ %4 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
+ %5 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec32)
%6 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %4, 0
%7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %5, 0
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %4, 1
%9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %5, 1
%wide.vec34 = load <vscale x 4 x double>, ptr %scevgep58, align 8
%wide.vec35 = load <vscale x 4 x double>, ptr %scevgep62, align 8
- %10 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec34)
- %11 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec35)
+ %10 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec34)
+ %11 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec35)
%12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %10, 0
%13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %11, 0
%14 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %10, 1
@@ -375,7 +375,7 @@ vector.body: ; preds = %vector.body, %entry
%5 = add <vscale x 2 x i32> %wide.load, %vec.phi
%6 = getelementptr inbounds %"class.std::complex", ptr %a, i64 %index
%wide.vec = load <vscale x 4 x double>, ptr %6, align 8
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
%7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
%9 = fadd fast <vscale x 2 x double> %7, %vec.phi13
@@ -396,6 +396,6 @@ middle.block: ; preds = %vector.body
declare i64 @llvm.vscale.i64()
-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
declare double @llvm.vector.reduce.fadd.nxv2f64(double, <vscale x 2 x double>)
declare i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32>)
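; Note: the unpredicated variants follow the same loop shape with plain loads:
; each <vscale x 4 x double> is deinterleaved, the partial complex products
; accumulate into the vector phis, and the accumulators are reduced with
; llvm.vector.reduce.fadd once the loop exits.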
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
index 17bf5ba6eb48..b4425c0c01e1 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
@@ -28,10 +28,10 @@ define <vscale x 4 x double> @complex_mul_const(<vscale x 4 x double> %a, <vscal
; CHECK-NEXT: mov z1.d, z4.d
; CHECK-NEXT: ret
entry:
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec48 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec48 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec48, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec48, 1
%4 = fmul fast <vscale x 2 x double> %3, %0
@@ -46,7 +46,7 @@ entry:
%13 = fmul fast <vscale x 2 x double> %9, splat (double 1.100000e+01)
%14 = fmul fast <vscale x 2 x double> %6, splat (double 3.000000e+00)
%15 = fsub fast <vscale x 2 x double> %13, %14
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %15, <vscale x 2 x double> %12)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %15, <vscale x 2 x double> %12)
ret <vscale x 4 x double> %interleaved.vec
}
@@ -83,10 +83,10 @@ entry:
%broadcast.splat = shufflevector <vscale x 2 x double> %broadcast.splatinsert, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
%broadcast.splatinsert49 = insertelement <vscale x 2 x double> poison, double %c.coerce.fca.0.extract, i64 0
%broadcast.splat50 = shufflevector <vscale x 2 x double> %broadcast.splatinsert49, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec48 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec48 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec48, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec48, 1
%4 = fmul fast <vscale x 2 x double> %3, %0
@@ -101,9 +101,9 @@ entry:
%13 = fmul fast <vscale x 2 x double> %9, %broadcast.splat50
%14 = fmul fast <vscale x 2 x double> %6, %broadcast.splat
%15 = fsub fast <vscale x 2 x double> %13, %14
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %15, <vscale x 2 x double> %12)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %15, <vscale x 2 x double> %12)
ret <vscale x 4 x double> %interleaved.vec
}
-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
index 2ad5623b6551..c58db8290c87 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
@@ -25,7 +25,7 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 killed $q1
; CHECK-GI-NEXT: fmov d0, d2
; CHECK-GI-NEXT: ret
- %retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec)
+ %retval = call {<2 x half>, <2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half> %vec)
ret {<2 x half>, <2 x half>} %retval
}
@@ -45,7 +45,7 @@ define {<4 x half>, <4 x half>} @vector_deinterleave_v4f16_v8f16(<8 x half> %vec
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 killed $q1
; CHECK-GI-NEXT: fmov d0, d2
; CHECK-GI-NEXT: ret
- %retval = call {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %vec)
+ %retval = call {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half> %vec)
ret {<4 x half>, <4 x half>} %retval
}
@@ -56,7 +56,7 @@ define {<8 x half>, <8 x half>} @vector_deinterleave_v8f16_v16f16(<16 x half> %v
; CHECK-NEXT: uzp2 v1.8h, v0.8h, v1.8h
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %vec)
+ %retval = call {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half> %vec)
ret {<8 x half>, <8 x half>} %retval
}
@@ -76,7 +76,7 @@ define {<2 x float>, <2 x float>} @vector_deinterleave_v2f32_v4f32(<4 x float> %
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 killed $q1
; CHECK-GI-NEXT: fmov d0, d2
; CHECK-GI-NEXT: ret
- %retval = call {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float> %vec)
+ %retval = call {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float> %vec)
ret {<2 x float>, <2 x float>} %retval
}
@@ -87,7 +87,7 @@ define {<4 x float>, <4 x float>} @vector_deinterleave_v4f32_v8f32(<8 x float> %
; CHECK-NEXT: uzp2 v1.4s, v0.4s, v1.4s
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %vec)
+ %retval = call {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float> %vec)
ret {<4 x float>, <4 x float>} %retval
}
@@ -98,7 +98,7 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double
; CHECK-NEXT: zip2 v1.2d, v0.2d, v1.2d
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec)
+ %retval = call {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double> %vec)
ret {<2 x double>, <2 x double>} %retval
}
@@ -111,7 +111,7 @@ define {<16 x i8>, <16 x i8>} @vector_deinterleave_v16i8_v32i8(<32 x i8> %vec) {
; CHECK-NEXT: uzp2 v1.16b, v0.16b, v1.16b
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %vec)
+ %retval = call {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8> %vec)
ret {<16 x i8>, <16 x i8>} %retval
}
@@ -122,7 +122,7 @@ define {<8 x i16>, <8 x i16>} @vector_deinterleave_v8i16_v16i16(<16 x i16> %vec)
; CHECK-NEXT: uzp2 v1.8h, v0.8h, v1.8h
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %vec)
+ %retval = call {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16> %vec)
ret {<8 x i16>, <8 x i16>} %retval
}
@@ -133,7 +133,7 @@ define {<4 x i32>, <4 x i32>} @vector_deinterleave_v4i32_v8i32(<8 x i32> %vec) {
; CHECK-NEXT: uzp2 v1.4s, v0.4s, v1.4s
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %vec)
+ %retval = call {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32> %vec)
ret {<4 x i32>, <4 x i32>} %retval
}
@@ -144,22 +144,22 @@ define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) {
; CHECK-NEXT: zip2 v1.2d, v0.2d, v1.2d
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec)
+ %retval = call {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64> %vec)
ret {<2 x i64>, <2 x i64>} %retval
}
; Floating declarations
-declare {<2 x half>,<2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>)
-declare {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half>)
-declare {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float>)
-declare {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half>)
-declare {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>)
-declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>)
+declare {<2 x half>,<2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half>)
+declare {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half>)
+declare {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float>)
+declare {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half>)
+declare {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float>)
+declare {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double>)
; Integer declarations
-declare {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>)
-declare {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>)
-declare {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>)
-declare {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>)
+declare {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8>)
+declare {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16>)
+declare {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32>)
+declare {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64>)
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
index eb81aff33e49..2e992964f598 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
@@ -7,7 +7,7 @@ define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) {
; CHECK: // %bb.0:
; CHECK-NEXT: zip1 v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
- %retval = call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %vec0, <2 x half> %vec1)
+ %retval = call <4 x half> @llvm.vector.interleave2.v4f16(<2 x half> %vec0, <2 x half> %vec1)
ret <4 x half> %retval
}
@@ -28,7 +28,7 @@ define <8 x half> @interleave2_v8f16(<4 x half> %vec0, <4 x half> %vec1) {
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: zip1 v0.8h, v0.8h, v1.8h
; CHECK-GI-NEXT: ret
- %retval = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %vec0, <4 x half> %vec1)
+ %retval = call <8 x half> @llvm.vector.interleave2.v8f16(<4 x half> %vec0, <4 x half> %vec1)
ret <8 x half> %retval
}
@@ -39,7 +39,7 @@ define <16 x half> @interleave2_v16f16(<8 x half> %vec0, <8 x half> %vec1) {
; CHECK-NEXT: zip2 v1.8h, v0.8h, v1.8h
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %vec0, <8 x half> %vec1)
+ %retval = call <16 x half> @llvm.vector.interleave2.v16f16(<8 x half> %vec0, <8 x half> %vec1)
ret <16 x half> %retval
}
@@ -59,7 +59,7 @@ define <4 x float> @interleave2_v4f32(<2 x float> %vec0, <2 x float> %vec1) {
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: zip1 v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: ret
- %retval = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %vec0, <2 x float> %vec1)
+ %retval = call <4 x float> @llvm.vector.interleave2.v4f32(<2 x float> %vec0, <2 x float> %vec1)
ret <4 x float> %retval
}
@@ -70,7 +70,7 @@ define <8 x float> @interleave2_v8f32(<4 x float> %vec0, <4 x float> %vec1) {
; CHECK-NEXT: zip2 v1.4s, v0.4s, v1.4s
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %vec0, <4 x float> %vec1)
+ %retval = call <8 x float> @llvm.vector.interleave2.v8f32(<4 x float> %vec0, <4 x float> %vec1)
ret <8 x float> %retval
}
@@ -81,7 +81,7 @@ define <4 x double> @interleave2_v4f64(<2 x double> %vec0, <2 x double> %vec1) {
; CHECK-NEXT: zip2 v1.2d, v0.2d, v1.2d
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call <4 x double>@llvm.experimental.vector.interleave2.v4f64(<2 x double> %vec0, <2 x double> %vec1)
+ %retval = call <4 x double>@llvm.vector.interleave2.v4f64(<2 x double> %vec0, <2 x double> %vec1)
ret <4 x double> %retval
}
@@ -94,7 +94,7 @@ define <32 x i8> @interleave2_v32i8(<16 x i8> %vec0, <16 x i8> %vec1) {
; CHECK-NEXT: zip2 v1.16b, v0.16b, v1.16b
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8> %vec0, <16 x i8> %vec1)
+ %retval = call <32 x i8> @llvm.vector.interleave2.v32i8(<16 x i8> %vec0, <16 x i8> %vec1)
ret <32 x i8> %retval
}
@@ -105,7 +105,7 @@ define <16 x i16> @interleave2_v16i16(<8 x i16> %vec0, <8 x i16> %vec1) {
; CHECK-NEXT: zip2 v1.8h, v0.8h, v1.8h
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %vec0, <8 x i16> %vec1)
+ %retval = call <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16> %vec0, <8 x i16> %vec1)
ret <16 x i16> %retval
}
@@ -116,7 +116,7 @@ define <8 x i32> @interleave2_v8i32(<4 x i32> %vec0, <4 x i32> %vec1) {
; CHECK-NEXT: zip2 v1.4s, v0.4s, v1.4s
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %vec0, <4 x i32> %vec1)
+ %retval = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %vec0, <4 x i32> %vec1)
ret <8 x i32> %retval
}
@@ -127,22 +127,22 @@ define <4 x i64> @interleave2_v4i64(<2 x i64> %vec0, <2 x i64> %vec1) {
; CHECK-NEXT: zip2 v1.2d, v0.2d, v1.2d
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %vec0, <2 x i64> %vec1)
+ %retval = call <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64> %vec0, <2 x i64> %vec1)
ret <4 x i64> %retval
}
; Float declarations
-declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>)
-declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>)
-declare <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>)
-declare <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float>, <2 x float>)
-declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>)
-declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>)
+declare <4 x half> @llvm.vector.interleave2.v4f16(<2 x half>, <2 x half>)
+declare <8 x half> @llvm.vector.interleave2.v8f16(<4 x half>, <4 x half>)
+declare <16 x half> @llvm.vector.interleave2.v16f16(<8 x half>, <8 x half>)
+declare <4 x float> @llvm.vector.interleave2.v4f32(<2 x float>, <2 x float>)
+declare <8 x float> @llvm.vector.interleave2.v8f32(<4 x float>, <4 x float>)
+declare <4 x double> @llvm.vector.interleave2.v4f64(<2 x double>, <2 x double>)
; Integer declarations
-declare <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8>, <16 x i8>)
-declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
-declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
-declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
+declare <32 x i8> @llvm.vector.interleave2.v32i8(<16 x i8>, <16 x i8>)
+declare <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
+declare <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-subadd2.mir b/llvm/test/CodeGen/AArch64/machine-combiner-subadd2.mir
index 0b09e8a4b5cd..9da0808345a0 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner-subadd2.mir
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-subadd2.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=aarch64-linux-gnu -run-pass machine-combiner -o - %s | FileCheck %s
+# RUN: llc -mtriple=aarch64-linux-gnu -run-pass machine-combiner -verify-machineinstrs -o - %s | FileCheck %s
# The test cases in this file check the following transformation if the right
# form can reduce latency.
@@ -241,8 +241,8 @@ body: |
# Drop nowrap flags in SUB
# CHECK-LABEL: name: test8
-# CHECK: %7:gpr64 = SUBXrr %1, %0
-# CHECK-NEXT: %4:gpr64common = SUBXrr killed %7, killed %2
+# CHECK: [[SUBXrr:%[0-9]+]]:gpr64 = SUBXrr %1, %0
+# CHECK-NEXT: %4:gpr64common = SUBXrr killed [[SUBXrr]], killed %2
name: test8
registers:
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-neon.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-neon.ll
index 0eee19ad2adb..cff7759c72c9 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-neon.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-neon.ll
@@ -15,7 +15,7 @@ define <16 x i8> @reverse_v16i8(<16 x i8> %a) #0 {
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: ret
- %res = call <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8> %a)
+ %res = call <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8> %a)
ret <16 x i8> %res
}
@@ -26,7 +26,7 @@ define <8 x i16> @reverse_v8i16(<8 x i16> %a) #0 {
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: ret
- %res = call <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16> %a)
+ %res = call <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16> %a)
ret <8 x i16> %res
}
@@ -35,7 +35,7 @@ define <2 x i16> @reverse_v2i16(<2 x i16> %a) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: rev64 v0.2s, v0.2s
; CHECK-NEXT: ret
- %res = call <2 x i16> @llvm.experimental.vector.reverse.v2i16(<2 x i16> %a)
+ %res = call <2 x i16> @llvm.vector.reverse.v2i16(<2 x i16> %a)
ret <2 x i16> %res
}
@@ -44,7 +44,7 @@ define <2 x i32> @reverse_v2i32(<2 x i32> %a) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: rev64 v0.2s, v0.2s
; CHECK-NEXT: ret
- %res = call <2 x i32> @llvm.experimental.vector.reverse.v2i32(<2 x i32> %a)
+ %res = call <2 x i32> @llvm.vector.reverse.v2i32(<2 x i32> %a)
ret <2 x i32> %res
}
@@ -55,7 +55,7 @@ define <4 x i32> @reverse_v4i32(<4 x i32> %a) #0 {
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: ret
- %res = call <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32> %a)
+ %res = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> %a)
ret <4 x i32> %res
}
@@ -65,7 +65,7 @@ define <2 x i64> @reverse_v2i64(<2 x i64> %a) #0 {
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: ret
- %res = call <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64> %a)
+ %res = call <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64> %a)
ret <2 x i64> %res
}
@@ -76,7 +76,7 @@ define <8 x half> @reverse_v8f16(<8 x half> %a) #0 {
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: ret
- %res = call <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half> %a)
+ %res = call <8 x half> @llvm.vector.reverse.v8f16(<8 x half> %a)
ret <8 x half> %res
}
@@ -85,7 +85,7 @@ define <2 x float> @reverse_v2f32(<2 x float> %a) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: rev64 v0.2s, v0.2s
; CHECK-NEXT: ret
- %res = call <2 x float> @llvm.experimental.vector.reverse.v2f32(<2 x float> %a)
+ %res = call <2 x float> @llvm.vector.reverse.v2f32(<2 x float> %a)
ret <2 x float> %res
}
@@ -96,7 +96,7 @@ define <4 x float> @reverse_v4f32(<4 x float> %a) #0 {
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: ret
- %res = call <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float> %a)
+ %res = call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> %a)
ret <4 x float> %res
}
@@ -106,7 +106,7 @@ define <2 x double> @reverse_v2f64(<2 x double> %a) #0 {
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: ret
- %res = call <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double> %a)
+ %res = call <2 x double> @llvm.vector.reverse.v2f64(<2 x double> %a)
ret <2 x double> %res
}
@@ -117,7 +117,7 @@ define <2 x i8> @reverse_v2i8(<2 x i8> %a) #0 {
; CHECK-NEXT: rev64 v0.2s, v0.2s
; CHECK-NEXT: ret
- %res = call <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8> %a)
+ %res = call <2 x i8> @llvm.vector.reverse.v2i8(<2 x i8> %a)
ret <2 x i8> %res
}
@@ -144,7 +144,7 @@ define <8 x i32> @reverse_v8i32(<8 x i32> %a) #0 {
; CHECK-FASTISEL-NEXT: add sp, sp, #16
; CHECK-FASTISEL-NEXT: ret
- %res = call <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32> %a)
+ %res = call <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32> %a)
ret <8 x i32> %res
}
@@ -182,23 +182,23 @@ define <16 x float> @reverse_v16f32(<16 x float> %a) #0 {
; CHECK-FASTISEL-NEXT: add sp, sp, #32
; CHECK-FASTISEL-NEXT: ret
- %res = call <16 x float> @llvm.experimental.vector.reverse.v16f32(<16 x float> %a)
+ %res = call <16 x float> @llvm.vector.reverse.v16f32(<16 x float> %a)
ret <16 x float> %res
}
-declare <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8>)
-declare <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8>)
-declare <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16>)
-declare <2 x i16> @llvm.experimental.vector.reverse.v2i16(<2 x i16>)
-declare <2 x i32> @llvm.experimental.vector.reverse.v2i32(<2 x i32>)
-declare <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32>)
-declare <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32>)
-declare <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64>)
-declare <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half>)
-declare <2 x float> @llvm.experimental.vector.reverse.v2f32(<2 x float>)
-declare <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float>)
-declare <16 x float> @llvm.experimental.vector.reverse.v16f32(<16 x float>)
-declare <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double>)
+declare <2 x i8> @llvm.vector.reverse.v2i8(<2 x i8>)
+declare <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8>)
+declare <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16>)
+declare <2 x i16> @llvm.vector.reverse.v2i16(<2 x i16>)
+declare <2 x i32> @llvm.vector.reverse.v2i32(<2 x i32>)
+declare <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32>)
+declare <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32>)
+declare <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64>)
+declare <8 x half> @llvm.vector.reverse.v8f16(<8 x half>)
+declare <2 x float> @llvm.vector.reverse.v2f32(<2 x float>)
+declare <4 x float> @llvm.vector.reverse.v4f32(<4 x float>)
+declare <16 x float> @llvm.vector.reverse.v16f32(<16 x float>)
+declare <2 x double> @llvm.vector.reverse.v2f64(<2 x double>)
attributes #0 = { nounwind "target-features"="+neon" }
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-sve.ll
index 4d5045feca08..a84e6e7bcae8 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-sve.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-sve.ll
@@ -14,7 +14,7 @@ define <vscale x 2 x i1> @reverse_nxv2i1(<vscale x 2 x i1> %a) #0 {
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %a)
+ %res = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %a)
ret <vscale x 2 x i1> %res
}
@@ -24,7 +24,7 @@ define <vscale x 4 x i1> @reverse_nxv4i1(<vscale x 4 x i1> %a) #0 {
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %res = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
ret <vscale x 4 x i1> %res
}
@@ -34,7 +34,7 @@ define <vscale x 8 x i1> @reverse_nxv8i1(<vscale x 8 x i1> %a) #0 {
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %a)
+ %res = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %a)
ret <vscale x 8 x i1> %res
}
@@ -44,7 +44,7 @@ define <vscale x 16 x i1> @reverse_nxv16i1(<vscale x 16 x i1> %a) #0 {
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %a)
+ %res = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %a)
ret <vscale x 16 x i1> %res
}
@@ -70,7 +70,7 @@ define <vscale x 32 x i1> @reverse_nxv32i1(<vscale x 32 x i1> %a) #0 {
; CHECK-FASTISEL-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-FASTISEL-NEXT: ret
- %res = call <vscale x 32 x i1> @llvm.experimental.vector.reverse.nxv32i1(<vscale x 32 x i1> %a)
+ %res = call <vscale x 32 x i1> @llvm.vector.reverse.nxv32i1(<vscale x 32 x i1> %a)
ret <vscale x 32 x i1> %res
}
@@ -84,7 +84,7 @@ define <vscale x 16 x i8> @reverse_nxv16i8(<vscale x 16 x i8> %a) #0 {
; CHECK-NEXT: rev z0.b, z0.b
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> %a)
+ %res = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> %a)
ret <vscale x 16 x i8> %res
}
@@ -94,7 +94,7 @@ define <vscale x 8 x i16> @reverse_nxv8i16(<vscale x 8 x i16> %a) #0 {
; CHECK-NEXT: rev z0.h, z0.h
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> %a)
+ %res = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> %a)
ret <vscale x 8 x i16> %res
}
@@ -104,7 +104,7 @@ define <vscale x 4 x i32> @reverse_nxv4i32(<vscale x 4 x i32> %a) #0 {
; CHECK-NEXT: rev z0.s, z0.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %res = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
ret <vscale x 4 x i32> %res
}
@@ -114,7 +114,7 @@ define <vscale x 2 x i64> @reverse_nxv2i64(<vscale x 2 x i64> %a) #0 {
; CHECK-NEXT: rev z0.d, z0.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> %a)
+ %res = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> %a)
ret <vscale x 2 x i64> %res
}
@@ -124,7 +124,7 @@ define <vscale x 2 x half> @reverse_nxv2f16(<vscale x 2 x half> %a) #0 {
; CHECK-NEXT: rev z0.d, z0.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half> %a)
+ %res = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> %a)
ret <vscale x 2 x half> %res
}
@@ -134,7 +134,7 @@ define <vscale x 4 x half> @reverse_nxv4f16(<vscale x 4 x half> %a) #0 {
; CHECK-NEXT: rev z0.s, z0.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half> %a)
+ %res = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> %a)
ret <vscale x 4 x half> %res
}
@@ -144,7 +144,7 @@ define <vscale x 8 x half> @reverse_nxv8f16(<vscale x 8 x half> %a) #0 {
; CHECK-NEXT: rev z0.h, z0.h
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half> %a)
+ %res = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> %a)
ret <vscale x 8 x half> %res
}
@@ -154,7 +154,7 @@ define <vscale x 2 x bfloat> @reverse_nxv2bf16(<vscale x 2 x bfloat> %a) #1 {
; CHECK-NEXT: rev z0.d, z0.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x bfloat> @llvm.experimental.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> %a)
+ %res = call <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> %a)
ret <vscale x 2 x bfloat> %res
}
@@ -164,7 +164,7 @@ define <vscale x 4 x bfloat> @reverse_nxv4bf16(<vscale x 4 x bfloat> %a) #1 {
; CHECK-NEXT: rev z0.s, z0.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x bfloat> @llvm.experimental.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> %a)
+ %res = call <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> %a)
ret <vscale x 4 x bfloat> %res
}
@@ -174,7 +174,7 @@ define <vscale x 8 x bfloat> @reverse_nxv8bf16(<vscale x 8 x bfloat> %a) #1 {
; CHECK-NEXT: rev z0.h, z0.h
; CHECK-NEXT: ret
- %res = call <vscale x 8 x bfloat> @llvm.experimental.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> %a)
+ %res = call <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> %a)
ret <vscale x 8 x bfloat> %res
}
@@ -184,7 +184,7 @@ define <vscale x 2 x float> @reverse_nxv2f32(<vscale x 2 x float> %a) #0 {
; CHECK-NEXT: rev z0.d, z0.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float> %a) ret <vscale x 2 x float> %res
+ %res = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> %a) ret <vscale x 2 x float> %res
}
define <vscale x 4 x float> @reverse_nxv4f32(<vscale x 4 x float> %a) #0 {
@@ -193,7 +193,7 @@ define <vscale x 4 x float> @reverse_nxv4f32(<vscale x 4 x float> %a) #0 {
; CHECK-NEXT: rev z0.s, z0.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a) ret <vscale x 4 x float> %res
+ %res = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a) ret <vscale x 4 x float> %res
}
define <vscale x 2 x double> @reverse_nxv2f64(<vscale x 2 x double> %a) #0 {
@@ -202,7 +202,7 @@ define <vscale x 2 x double> @reverse_nxv2f64(<vscale x 2 x double> %a) #0 {
; CHECK-NEXT: rev z0.d, z0.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %a)
+ %res = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> %a)
ret <vscale x 2 x double> %res
}
@@ -213,7 +213,7 @@ define <vscale x 2 x i8> @reverse_nxv2i8(<vscale x 2 x i8> %a) #0 {
; CHECK-NEXT: rev z0.d, z0.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i8> @llvm.experimental.vector.reverse.nxv2i8(<vscale x 2 x i8> %a)
+ %res = call <vscale x 2 x i8> @llvm.vector.reverse.nxv2i8(<vscale x 2 x i8> %a)
ret <vscale x 2 x i8> %res
}
@@ -239,7 +239,7 @@ define <vscale x 8 x i32> @reverse_nxv8i32(<vscale x 8 x i32> %a) #0 {
; CHECK-FASTISEL-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-FASTISEL-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> %a)
+ %res = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> %a)
ret <vscale x 8 x i32> %res
}
@@ -273,32 +273,32 @@ define <vscale x 16 x float> @reverse_nxv16f32(<vscale x 16 x float> %a) #0 {
; CHECK-FASTISEL-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-FASTISEL-NEXT: ret
- %res = call <vscale x 16 x float> @llvm.experimental.vector.reverse.nxv16f32(<vscale x 16 x float> %a)
+ %res = call <vscale x 16 x float> @llvm.vector.reverse.nxv16f32(<vscale x 16 x float> %a)
ret <vscale x 16 x float> %res
}
-declare <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1>)
-declare <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1>)
-declare <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1>)
-declare <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1>)
-declare <vscale x 32 x i1> @llvm.experimental.vector.reverse.nxv32i1(<vscale x 32 x i1>)
-declare <vscale x 2 x i8> @llvm.experimental.vector.reverse.nxv2i8(<vscale x 2 x i8>)
-declare <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8>)
-declare <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32>)
-declare <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64>)
-declare <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half>)
-declare <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half>)
-declare <vscale x 2 x bfloat> @llvm.experimental.vector.reverse.nxv2bf16(<vscale x 2 x bfloat>)
-declare <vscale x 4 x bfloat> @llvm.experimental.vector.reverse.nxv4bf16(<vscale x 4 x bfloat>)
-declare <vscale x 8 x bfloat> @llvm.experimental.vector.reverse.nxv8bf16(<vscale x 8 x bfloat>)
-declare <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float>)
-declare <vscale x 16 x float> @llvm.experimental.vector.reverse.nxv16f32(<vscale x 16 x float>)
-declare <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1>)
+declare <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1>)
+declare <vscale x 32 x i1> @llvm.vector.reverse.nxv32i1(<vscale x 32 x i1>)
+declare <vscale x 2 x i8> @llvm.vector.reverse.nxv2i8(<vscale x 2 x i8>)
+declare <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32>)
+declare <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half>)
+declare <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half>)
+declare <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half>)
+declare <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat>)
+declare <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float>)
+declare <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 16 x float> @llvm.vector.reverse.nxv16f32(<vscale x 16 x float>)
+declare <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double>)
attributes #0 = { nounwind "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll
index 9210a5ec1c8b..f2e62bc4f3c8 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll
@@ -12,7 +12,7 @@ define <16 x i8> @splice_v16i8_idx(<16 x i8> %a, <16 x i8> %b) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #1
; CHECK-NEXT: ret
- %res = call <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1)
+ %res = call <16 x i8> @llvm.vector.splice.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1)
ret <16 x i8> %res
}
@@ -21,7 +21,7 @@ define <2 x double> @splice_v2f64_idx(<2 x double> %a, <2 x double> %b) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8
; CHECK-NEXT: ret
- %res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 1)
+ %res = call <2 x double> @llvm.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 1)
ret <2 x double> %res
}
@@ -31,7 +31,7 @@ define <2 x i8> @splice_v2i8_idx(<2 x i8> %a, <2 x i8> %b) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4
; CHECK-NEXT: ret
- %res = call <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8> %a, <2 x i8> %b, i32 1)
+ %res = call <2 x i8> @llvm.vector.splice.v2i8(<2 x i8> %a, <2 x i8> %b, i32 1)
ret <2 x i8> %res
}
@@ -42,7 +42,7 @@ define <8 x i32> @splice_v8i32_idx(<8 x i32> %a, <8 x i32> %b) #0 {
; CHECK-NEXT: ext v0.16b, v1.16b, v2.16b, #4
; CHECK-NEXT: ext v1.16b, v2.16b, v3.16b, #4
; CHECK-NEXT: ret
- %res = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> %a, <8 x i32> %b, i32 5)
+ %res = call <8 x i32> @llvm.vector.splice.v8i32(<8 x i32> %a, <8 x i32> %b, i32 5)
ret <8 x i32> %res
}
@@ -56,7 +56,7 @@ define <16 x float> @splice_v16f32_idx(<16 x float> %a, <16 x float> %b) #0 {
; CHECK-NEXT: ext v3.16b, v4.16b, v5.16b, #12
; CHECK-NEXT: mov v2.16b, v6.16b
; CHECK-NEXT: ret
- %res = call <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 7)
+ %res = call <16 x float> @llvm.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 7)
ret <16 x float> %res
}
@@ -69,7 +69,7 @@ define <16 x i8> @splice_v16i8(<16 x i8> %a, <16 x i8> %b) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #1
; CHECK-NEXT: ret
- %res = call <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8> %a, <16 x i8> %b, i32 -15)
+ %res = call <16 x i8> @llvm.vector.splice.v16i8(<16 x i8> %a, <16 x i8> %b, i32 -15)
ret <16 x i8> %res
}
@@ -78,7 +78,7 @@ define <2 x double> @splice_v2f64(<2 x double> %a, <2 x double> %b) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8
; CHECK-NEXT: ret
- %res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 -1)
+ %res = call <2 x double> @llvm.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 -1)
ret <2 x double> %res
}
@@ -88,7 +88,7 @@ define <2 x i8> @splice_v2i8(<2 x i8> %a, <2 x i8> %b) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4
; CHECK-NEXT: ret
- %res = call <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8> %a, <2 x i8> %b, i32 -1)
+ %res = call <2 x i8> @llvm.vector.splice.v2i8(<2 x i8> %a, <2 x i8> %b, i32 -1)
ret <2 x i8> %res
}
@@ -99,7 +99,7 @@ define <8 x i32> @splice_v8i32(<8 x i32> %a, <8 x i32> %b) #0 {
; CHECK-NEXT: ext v0.16b, v1.16b, v2.16b, #4
; CHECK-NEXT: ext v1.16b, v2.16b, v3.16b, #4
; CHECK-NEXT: ret
- %res = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> %a, <8 x i32> %b, i32 -3)
+ %res = call <8 x i32> @llvm.vector.splice.v8i32(<8 x i32> %a, <8 x i32> %b, i32 -3)
ret <8 x i32> %res
}
@@ -113,14 +113,14 @@ define <16 x float> @splice_v16f32(<16 x float> %a, <16 x float> %b) #0 {
; CHECK-NEXT: ext v3.16b, v4.16b, v5.16b, #12
; CHECK-NEXT: mov v2.16b, v6.16b
; CHECK-NEXT: ret
- %res = call <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 -9)
+ %res = call <16 x float> @llvm.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 -9)
ret <16 x float> %res
}
-declare <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8>, <2 x i8>, i32)
-declare <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8>, <16 x i8>, i32)
-declare <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32>, <8 x i32>, i32)
-declare <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float>, <16 x float>, i32)
-declare <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double>, <2 x double>, i32)
+declare <2 x i8> @llvm.vector.splice.v2i8(<2 x i8>, <2 x i8>, i32)
+declare <16 x i8> @llvm.vector.splice.v16i8(<16 x i8>, <16 x i8>, i32)
+declare <8 x i32> @llvm.vector.splice.v8i32(<8 x i32>, <8 x i32>, i32)
+declare <16 x float> @llvm.vector.splice.v16f32(<16 x float>, <16 x float>, i32)
+declare <2 x double> @llvm.vector.splice.v2f64(<2 x double>, <2 x double>, i32)
attributes #0 = { nounwind "target-features"="+neon" }
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
index fac96e07de54..f5763cd61033 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
@@ -11,7 +11,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_zero_idx(<vscale x 16 x i8> %a, <vscal
; CHECK-LABEL: splice_nxv16i8_zero_idx:
; CHECK: // %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 0)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 0)
ret <vscale x 16 x i8> %res
}
@@ -20,7 +20,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_first_idx(<vscale x 16 x i8> %a, <vsca
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #1
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 1)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 1)
ret <vscale x 16 x i8> %res
}
@@ -29,7 +29,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_last_idx(<vscale x 16 x i8> %a, <vscal
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #255
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 255)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 255)
ret <vscale x 16 x i8> %res
}
@@ -38,7 +38,7 @@ define <vscale x 8 x i16> @splice_nxv8i16_first_idx(<vscale x 8 x i16> %a, <vsca
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #2
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 1)
+ %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 1)
ret <vscale x 8 x i16> %res
}
@@ -47,7 +47,7 @@ define <vscale x 4 x i32> @splice_nxv4i32_first_idx(<vscale x 4 x i32> %a, <vsca
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #4
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 1)
+ %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 1)
ret <vscale x 4 x i32> %res
}
@@ -56,7 +56,7 @@ define <vscale x 4 x i32> @splice_nxv4i32_last_idx(<vscale x 4 x i32> %a, <vscal
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #252
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 63)
+ %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 63)
ret <vscale x 4 x i32> %res
}
@@ -65,7 +65,7 @@ define <vscale x 2 x i64> @splice_nxv2i64_first_idx(<vscale x 2 x i64> %a, <vsca
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #8
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 1)
+ %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 1)
ret <vscale x 2 x i64> %res
}
@@ -74,7 +74,7 @@ define <vscale x 2 x i64> @splice_nxv2i64_last_idx(<vscale x 2 x i64> %a, <vscal
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #248
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 31)
+ %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 31)
ret <vscale x 2 x i64> %res
}
@@ -85,7 +85,7 @@ define <vscale x 2 x half> @splice_nxv2f16_neg_idx(<vscale x 2 x half> %a, <vsca
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -1)
+ %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -1)
ret <vscale x 2 x half> %res
}
@@ -96,7 +96,7 @@ define <vscale x 2 x half> @splice_nxv2f16_neg2_idx(<vscale x 2 x half> %a, <vsc
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -2)
+ %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -2)
ret <vscale x 2 x half> %res
}
@@ -105,7 +105,7 @@ define <vscale x 2 x half> @splice_nxv2f16_first_idx(<vscale x 2 x half> %a, <vs
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #8
; CHECK-NEXT: ret
- %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 1)
+ %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 1)
ret <vscale x 2 x half> %res
}
@@ -114,7 +114,7 @@ define <vscale x 2 x half> @splice_nxv2f16_last_idx(<vscale x 2 x half> %a, <vsc
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #248
; CHECK-NEXT: ret
- %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 31)
+ %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 31)
ret <vscale x 2 x half> %res
}
@@ -125,7 +125,7 @@ define <vscale x 4 x half> @splice_nxv4f16_neg_idx(<vscale x 4 x half> %a, <vsca
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -1)
+ %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -1)
ret <vscale x 4 x half> %res
}
@@ -136,7 +136,7 @@ define <vscale x 4 x half> @splice_nxv4f16_neg3_idx(<vscale x 4 x half> %a, <vsc
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -3)
+ %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -3)
ret <vscale x 4 x half> %res
}
@@ -145,7 +145,7 @@ define <vscale x 4 x half> @splice_nxv4f16_first_idx(<vscale x 4 x half> %a, <vs
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #4
; CHECK-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 1)
+ %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 1)
ret <vscale x 4 x half> %res
}
@@ -154,7 +154,7 @@ define <vscale x 4 x half> @splice_nxv4f16_last_idx(<vscale x 4 x half> %a, <vsc
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #252
; CHECK-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 63)
+ %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 63)
ret <vscale x 4 x half> %res
}
@@ -163,7 +163,7 @@ define <vscale x 8 x half> @splice_nxv8f16_first_idx(<vscale x 8 x half> %a, <vs
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #2
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 1)
+ %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 1)
ret <vscale x 8 x half> %res
}
@@ -172,7 +172,7 @@ define <vscale x 8 x half> @splice_nxv8f16_last_idx(<vscale x 8 x half> %a, <vsc
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #254
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 127)
+ %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 127)
ret <vscale x 8 x half> %res
}
@@ -183,7 +183,7 @@ define <vscale x 2 x float> @splice_nxv2f32_neg_idx(<vscale x 2 x float> %a, <vs
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -1)
+ %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -1)
ret <vscale x 2 x float> %res
}
@@ -194,7 +194,7 @@ define <vscale x 2 x float> @splice_nxv2f32_neg2_idx(<vscale x 2 x float> %a, <v
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -2)
+ %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -2)
ret <vscale x 2 x float> %res
}
@@ -203,7 +203,7 @@ define <vscale x 2 x float> @splice_nxv2f32_first_idx(<vscale x 2 x float> %a, <
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #8
; CHECK-NEXT: ret
- %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 1)
+ %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 1)
ret <vscale x 2 x float> %res
}
@@ -212,7 +212,7 @@ define <vscale x 2 x float> @splice_nxv2f32_last_idx(<vscale x 2 x float> %a, <v
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #248
; CHECK-NEXT: ret
- %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 31)
+ %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 31)
ret <vscale x 2 x float> %res
}
@@ -221,7 +221,7 @@ define <vscale x 4 x float> @splice_nxv4f32_first_idx(<vscale x 4 x float> %a, <
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #4
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 1)
+ %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 1)
ret <vscale x 4 x float> %res
}
@@ -230,7 +230,7 @@ define <vscale x 4 x float> @splice_nxv4f32_last_idx(<vscale x 4 x float> %a, <v
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #252
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 63)
+ %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 63)
ret <vscale x 4 x float> %res
}
@@ -239,7 +239,7 @@ define <vscale x 2 x double> @splice_nxv2f64_first_idx(<vscale x 2 x double> %a,
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #8
; CHECK-NEXT: ret
- %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 1)
+ %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 1)
ret <vscale x 2 x double> %res
}
@@ -248,7 +248,7 @@ define <vscale x 2 x double> @splice_nxv2f64_last_idx(<vscale x 2 x double> %a,
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #248
; CHECK-NEXT: ret
- %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 31)
+ %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 31)
ret <vscale x 2 x double> %res
}
@@ -263,7 +263,7 @@ define <vscale x 2 x i1> @splice_nxv2i1_idx(<vscale x 2 x i1> %a, <vscale x 2 x
; CHECK-NEXT: and z1.d, z1.d, #0x1
; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 1)
+ %res = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 1)
ret <vscale x 2 x i1> %res
}
@@ -278,7 +278,7 @@ define <vscale x 4 x i1> @splice_nxv4i1_idx(<vscale x 4 x i1> %a, <vscale x 4 x
; CHECK-NEXT: and z1.s, z1.s, #0x1
; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 2)
+ %res = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 2)
ret <vscale x 4 x i1> %res
}
@@ -293,7 +293,7 @@ define <vscale x 8 x i1> @splice_nxv8i1_idx(<vscale x 8 x i1> %a, <vscale x 8 x
; CHECK-NEXT: and z1.h, z1.h, #0x1
; CHECK-NEXT: cmpne p0.h, p0/z, z1.h, #0
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 4)
+ %res = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 4)
ret <vscale x 8 x i1> %res
}
@@ -308,7 +308,7 @@ define <vscale x 16 x i1> @splice_nxv16i1_idx(<vscale x 16 x i1> %a, <vscale x 1
; CHECK-NEXT: and z1.b, z1.b, #0x1
; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 8)
+ %res = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 8)
ret <vscale x 16 x i1> %res
}
@@ -318,7 +318,7 @@ define <vscale x 2 x i8> @splice_nxv2i8_idx(<vscale x 2 x i8> %a, <vscale x 2 x
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #8
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 1)
+ %res = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 1)
ret <vscale x 2 x i8> %res
}
@@ -340,7 +340,7 @@ define <vscale x 8 x i32> @splice_nxv8i32_idx(<vscale x 8 x i32> %a, <vscale x 8
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 2)
+ %res = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 2)
ret <vscale x 8 x i32> %res
}
@@ -373,7 +373,7 @@ define <vscale x 16 x float> @splice_nxv16f32_16(<vscale x 16 x float> %a, <vsca
; CHECK-NEXT: addvl sp, sp, #8
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %res = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 16)
+ %res = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 16)
ret <vscale x 16 x float> %res
}
@@ -388,7 +388,7 @@ define <vscale x 16 x i8> @splice_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -16)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -16)
ret <vscale x 16 x i8> %res
}
@@ -399,7 +399,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_neg32(<vscale x 16 x i8> %a, <vscale x
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -32)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -32)
ret <vscale x 16 x i8> %res
}
@@ -410,7 +410,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_neg64(<vscale x 16 x i8> %a, <vscale x
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -64)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -64)
ret <vscale x 16 x i8> %res
}
@@ -421,7 +421,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_neg128(<vscale x 16 x i8> %a, <vscale
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -128)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -128)
ret <vscale x 16 x i8> %res
}
@@ -432,7 +432,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_neg256(<vscale x 16 x i8> %a, <vscale
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -256)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -256)
ret <vscale x 16 x i8> %res
}
@@ -443,7 +443,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_1(<vscale x 16 x i8> %a, <vscale x 16
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -1)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -1)
ret <vscale x 16 x i8> %res
}
@@ -466,7 +466,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_neg17(<vscale x 16 x i8> %a, <vscale x
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -17)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -17)
ret <vscale x 16 x i8> %res
}
@@ -477,7 +477,7 @@ define <vscale x 8 x i16> @splice_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -8)
+ %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -8)
ret <vscale x 8 x i16> %res
}
@@ -488,7 +488,7 @@ define <vscale x 8 x i16> @splice_nxv8i16_1(<vscale x 8 x i16> %a, <vscale x 8 x
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -1)
+ %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -1)
ret <vscale x 8 x i16> %res
}
@@ -511,7 +511,7 @@ define <vscale x 8 x i16> @splice_nxv8i16_neg9(<vscale x 8 x i16> %a, <vscale x
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -9)
+ %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -9)
ret <vscale x 8 x i16> %res
}
@@ -522,7 +522,7 @@ define <vscale x 4 x i32> @splice_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -4)
+ %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -4)
ret <vscale x 4 x i32> %res
}
@@ -533,7 +533,7 @@ define <vscale x 4 x i32> @splice_nxv4i32_1(<vscale x 4 x i32> %a, <vscale x 4 x
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -1)
+ %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -1)
ret <vscale x 4 x i32> %res
}
@@ -544,7 +544,7 @@ define <vscale x 4 x i32> @splice_nxv4i32_neg5(<vscale x 4 x i32> %a, <vscale x
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -5)
+ %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -5)
ret <vscale x 4 x i32> %res
}
@@ -555,7 +555,7 @@ define <vscale x 2 x i64> @splice_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -2)
+ %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -2)
ret <vscale x 2 x i64> %res
}
@@ -566,7 +566,7 @@ define <vscale x 2 x i64> @splice_nxv2i64_1(<vscale x 2 x i64> %a, <vscale x 2 x
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -1)
+ %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -1)
ret <vscale x 2 x i64> %res
}
@@ -577,7 +577,7 @@ define <vscale x 2 x i64> @splice_nxv2i64_neg3(<vscale x 2 x i64> %a, <vscale x
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -3)
+ %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -3)
ret <vscale x 2 x i64> %res
}
@@ -588,7 +588,7 @@ define <vscale x 8 x half> @splice_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -8)
+ %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -8)
ret <vscale x 8 x half> %res
}
@@ -599,7 +599,7 @@ define <vscale x 8 x half> @splice_nxv8f16_1(<vscale x 8 x half> %a, <vscale x 8
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -1)
+ %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -1)
ret <vscale x 8 x half> %res
}
@@ -622,7 +622,7 @@ define <vscale x 8 x half> @splice_nxv8f16_neg9(<vscale x 8 x half> %a, <vscale
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -9)
+ %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -9)
ret <vscale x 8 x half> %res
}
@@ -633,7 +633,7 @@ define <vscale x 4 x float> @splice_nxv4f32(<vscale x 4 x float> %a, <vscale x 4
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -4)
+ %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -4)
ret <vscale x 4 x float> %res
}
@@ -644,7 +644,7 @@ define <vscale x 4 x float> @splice_nxv4f32_1(<vscale x 4 x float> %a, <vscale x
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -1)
+ %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -1)
ret <vscale x 4 x float> %res
}
@@ -655,7 +655,7 @@ define <vscale x 4 x float> @splice_nxv4f32_neg5(<vscale x 4 x float> %a, <vscal
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -5)
+ %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -5)
ret <vscale x 4 x float> %res
}
@@ -666,7 +666,7 @@ define <vscale x 2 x double> @splice_nxv2f64(<vscale x 2 x double> %a, <vscale x
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -2)
+ %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -2)
ret <vscale x 2 x double> %res
}
@@ -677,7 +677,7 @@ define <vscale x 2 x double> @splice_nxv2f64_1(<vscale x 2 x double> %a, <vscale
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -1)
+ %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -1)
ret <vscale x 2 x double> %res
}
@@ -688,7 +688,7 @@ define <vscale x 2 x double> @splice_nxv2f64_neg3(<vscale x 2 x double> %a, <vsc
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -3)
+ %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -3)
ret <vscale x 2 x double> %res
}
@@ -705,7 +705,7 @@ define <vscale x 2 x i1> @splice_nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1>
; CHECK-NEXT: and z1.d, z1.d, #0x1
; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 -1)
+ %res = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 -1)
ret <vscale x 2 x i1> %res
}
@@ -722,7 +722,7 @@ define <vscale x 4 x i1> @splice_nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1>
; CHECK-NEXT: and z1.s, z1.s, #0x1
; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 -1)
+ %res = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 -1)
ret <vscale x 4 x i1> %res
}
@@ -739,7 +739,7 @@ define <vscale x 8 x i1> @splice_nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1>
; CHECK-NEXT: and z1.h, z1.h, #0x1
; CHECK-NEXT: cmpne p0.h, p0/z, z1.h, #0
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 -1)
+ %res = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 -1)
ret <vscale x 8 x i1> %res
}
@@ -756,7 +756,7 @@ define <vscale x 16 x i1> @splice_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x
; CHECK-NEXT: and z1.b, z1.b, #0x1
; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 -1)
+ %res = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 -1)
ret <vscale x 16 x i1> %res
}
@@ -768,7 +768,7 @@ define <vscale x 2 x i8> @splice_nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8>
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 -2)
+ %res = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 -2)
ret <vscale x 2 x i8> %res
}
@@ -793,7 +793,7 @@ define <vscale x 8 x i32> @splice_nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 -8)
+ %res = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 -8)
ret <vscale x 8 x i32> %res
}
@@ -826,26 +826,26 @@ define <vscale x 16 x float> @splice_nxv16f32_neg17(<vscale x 16 x float> %a, <v
; CHECK-NEXT: addvl sp, sp, #8
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %res = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 -17)
+ %res = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 -17)
ret <vscale x 16 x float> %res
}
-declare <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
-declare <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
-declare <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
-declare <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
-declare <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
-declare <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
-declare <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
-declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
-declare <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
-declare <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
-declare <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
-declare <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
-declare <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
-declare <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
-declare <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
-declare <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
-declare <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
+declare <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
+declare <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
+declare <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+declare <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
+declare <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
+declare <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
+declare <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
+declare <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
+declare <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
+declare <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
+declare <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
attributes #0 = { nounwind "target-features"="+sve" }
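The hunks above are a mechanical rename: llvm.experimental.vector.splice.* becomes llvm.vector.splice.* with operands and semantics unchanged. A minimal standalone sketch of the new spelling (illustrative only, not part of the patch; the function name is hypothetical):

; The trailing i32 immediate picks the splice point in the concatenation of %a
; and %b: non-negative values index from the start of %a, negative values count
; back from the end of %a.
define <vscale x 4 x i32> @splice_sketch(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
  %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 2)
  ret <vscale x 4 x i32> %res
}
declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)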
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
index 9920bc6048e8..478f4a689d3c 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
@@ -9,7 +9,7 @@ define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_n
; CHECK-NEXT: uzp1 z0.d, z2.d, z1.d
; CHECK-NEXT: uzp2 z1.d, z2.d, z1.d
; CHECK-NEXT: ret
- %retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
+ %retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
ret {<vscale x 2 x half>, <vscale x 2 x half>} %retval
}
@@ -21,7 +21,7 @@ define {<vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_n
; CHECK-NEXT: uzp1 z0.s, z2.s, z1.s
; CHECK-NEXT: uzp2 z1.s, z2.s, z1.s
; CHECK-NEXT: ret
- %retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
+ %retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
ret {<vscale x 4 x half>, <vscale x 4 x half>} %retval
}
@@ -32,7 +32,7 @@ define {<vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_n
; CHECK-NEXT: uzp2 z1.h, z0.h, z1.h
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
+ %retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
ret {<vscale x 8 x half>, <vscale x 8 x half>} %retval
}
@@ -44,7 +44,7 @@ define {<vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32
; CHECK-NEXT: uzp1 z0.d, z2.d, z1.d
; CHECK-NEXT: uzp2 z1.d, z2.d, z1.d
; CHECK-NEXT: ret
- %retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
+ %retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
ret {<vscale x 2 x float>, <vscale x 2 x float>} %retval
}
@@ -55,7 +55,7 @@ define {<vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32
; CHECK-NEXT: uzp2 z1.s, z0.s, z1.s
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
+ %retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
ret {<vscale x 4 x float>, <vscale x 4 x float>} %retval
}
@@ -66,7 +66,7 @@ define {<vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f
; CHECK-NEXT: uzp2 z1.d, z0.d, z1.d
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
+ %retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
ret {<vscale x 2 x double>, <vscale x 2 x double>} %retval
}
@@ -79,7 +79,7 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv
; CHECK-NEXT: uzp2 z1.b, z0.b, z1.b
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
+ %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %retval
}
@@ -90,7 +90,7 @@ define {<vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv
; CHECK-NEXT: uzp2 z1.h, z0.h, z1.h
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
+ %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %retval
}
@@ -101,7 +101,7 @@ define {<vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxv
; CHECK-NEXT: uzp2 z1.s, z0.s, z1.s
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
+ %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %retval
}
@@ -112,7 +112,7 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv
; CHECK-NEXT: uzp2 z1.d, z0.d, z1.d
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
+ %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
}
@@ -124,7 +124,7 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv
; CHECK-NEXT: uzp2 p1.b, p0.b, p1.b
; CHECK-NEXT: mov p0.b, p2.b
; CHECK-NEXT: ret
- %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
+ %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
ret {<vscale x 16 x i1>, <vscale x 16 x i1>} %retval
}
@@ -136,7 +136,7 @@ define {<vscale x 8 x i1>, <vscale x 8 x i1>} @vector_deinterleave_nxv8i1_nxv16i
; CHECK-NEXT: uzp1 p0.h, p2.h, p1.h
; CHECK-NEXT: uzp2 p1.h, p2.h, p1.h
; CHECK-NEXT: ret
- %retval = call {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.experimental.vector.deinterleave2.nxv16i1(<vscale x 16 x i1> %vec)
+ %retval = call {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.vector.deinterleave2.nxv16i1(<vscale x 16 x i1> %vec)
ret {<vscale x 8 x i1>, <vscale x 8 x i1>} %retval
}
@@ -148,7 +148,7 @@ define {<vscale x 4 x i1>, <vscale x 4 x i1>} @vector_deinterleave_nxv4i1_nxv8i1
; CHECK-NEXT: uzp1 p0.s, p2.s, p1.s
; CHECK-NEXT: uzp2 p1.s, p2.s, p1.s
; CHECK-NEXT: ret
- %retval = call {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.experimental.vector.deinterleave2.nxv8i1(<vscale x 8 x i1> %vec)
+ %retval = call {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.vector.deinterleave2.nxv8i1(<vscale x 8 x i1> %vec)
ret {<vscale x 4 x i1>, <vscale x 4 x i1>} %retval
}
@@ -160,7 +160,7 @@ define {<vscale x 2 x i1>, <vscale x 2 x i1>} @vector_deinterleave_nxv2i1_nxv4i1
; CHECK-NEXT: uzp1 p0.d, p2.d, p1.d
; CHECK-NEXT: uzp2 p1.d, p2.d, p1.d
; CHECK-NEXT: ret
- %retval = call {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.experimental.vector.deinterleave2.nxv4i1(<vscale x 4 x i1> %vec)
+ %retval = call {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.vector.deinterleave2.nxv4i1(<vscale x 4 x i1> %vec)
ret {<vscale x 2 x i1>, <vscale x 2 x i1>} %retval
}
@@ -178,7 +178,7 @@ define {<vscale x 4 x i64>, <vscale x 4 x i64>} @vector_deinterleave_nxv4i64_nxv
; CHECK-NEXT: mov z1.d, z4.d
; CHECK-NEXT: mov z2.d, z6.d
; CHECK-NEXT: ret
-%retval = call {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %vec)
+%retval = call {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %vec)
ret {<vscale x 4 x i64>, <vscale x 4 x i64>} %retval
}
@@ -201,7 +201,7 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_nxv8i64_nx
; CHECK-NEXT: mov z5.d, z29.d
; CHECK-NEXT: mov z6.d, z30.d
; CHECK-NEXT: ret
-%retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.experimental.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec)
+%retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec)
ret {<vscale x 8 x i64>, <vscale x 8 x i64>} %retval
}
@@ -216,7 +216,7 @@ define {<vscale x 8 x i8>, <vscale x 8 x i8>} @vector_deinterleave_nxv8i8_nxv16i
; CHECK-NEXT: uzp1 z0.h, z2.h, z1.h
; CHECK-NEXT: uzp2 z1.h, z2.h, z1.h
; CHECK-NEXT: ret
-%retval = call {<vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.experimental.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %vec)
+%retval = call {<vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %vec)
ret {<vscale x 8 x i8>, <vscale x 8 x i8>} %retval
}
@@ -228,7 +228,7 @@ define {<vscale x 4 x i16>, <vscale x 4 x i16>} @vector_deinterleave_nxv4i16_nxv
; CHECK-NEXT: uzp1 z0.s, z2.s, z1.s
; CHECK-NEXT: uzp2 z1.s, z2.s, z1.s
; CHECK-NEXT: ret
-%retval = call {<vscale x 4 x i16>, <vscale x 4 x i16>} @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %vec)
+%retval = call {<vscale x 4 x i16>, <vscale x 4 x i16>} @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %vec)
ret {<vscale x 4 x i16>, <vscale x 4 x i16>} %retval
}
@@ -240,35 +240,35 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>} @vector_deinterleave_nxv2i32_nxv
; CHECK-NEXT: uzp1 z0.d, z2.d, z1.d
; CHECK-NEXT: uzp2 z1.d, z2.d, z1.d
; CHECK-NEXT: ret
-%retval = call {<vscale x 2 x i32>,<vscale x 2 x i32>} @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %vec)
+%retval = call {<vscale x 2 x i32>,<vscale x 2 x i32>} @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %vec)
ret {<vscale x 2 x i32>, <vscale x 2 x i32>} %retval
}
; Floating declarations
-declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
-declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
-declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
-declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
-declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
-declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
+declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
+declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
+declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
+declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
; Integer declarations
-declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
-declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
-declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
-declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
+declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
; Predicated declarations
-declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
-declare {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.experimental.vector.deinterleave2.nxv16i1(<vscale x 16 x i1>)
-declare {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.experimental.vector.deinterleave2.nxv8i1(<vscale x 8 x i1>)
-declare {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.experimental.vector.deinterleave2.nxv4i1(<vscale x 4 x i1>)
+declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
+declare {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.vector.deinterleave2.nxv16i1(<vscale x 16 x i1>)
+declare {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.vector.deinterleave2.nxv8i1(<vscale x 8 x i1>)
+declare {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.vector.deinterleave2.nxv4i1(<vscale x 4 x i1>)
; Illegal size type
-declare {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
-declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.experimental.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
+declare {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
+declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
-declare {<vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.experimental.vector.deinterleave2.nxv16i8(<vscale x 16 x i8>)
-declare {<vscale x 4 x i16>, <vscale x 4 x i16>} @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
-declare {<vscale x 2 x i32>, <vscale x 2 x i32>} @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
+declare {<vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8>)
+declare {<vscale x 4 x i16>, <vscale x 4 x i16>} @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
+declare {<vscale x 2 x i32>, <vscale x 2 x i32>} @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
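The same rename runs through this file: llvm.experimental.vector.deinterleave2.* becomes llvm.vector.deinterleave2.*. For reference, a minimal sketch of the new spelling (illustrative only, not part of the patch; the function name is hypothetical):

; deinterleave2 splits its input into two half-width results holding the
; even-indexed and odd-indexed lanes respectively.
define {<vscale x 4 x i32>, <vscale x 4 x i32>} @deinterleave_sketch(<vscale x 8 x i32> %vec) {
  %res = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
  ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %res
}
declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)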
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
index 23bf5065286e..e2c3b0abe21a 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
@@ -8,7 +8,7 @@ define <vscale x 4 x half> @interleave2_nxv4f16(<vscale x 2 x half> %vec0, <vsca
; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s
; CHECK-NEXT: ret
- %retval = call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1)
+ %retval = call <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1)
ret <vscale x 4 x half> %retval
}
@@ -19,7 +19,7 @@ define <vscale x 8 x half> @interleave2_nxv8f16(<vscale x 4 x half> %vec0, <vsca
; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
; CHECK-NEXT: ret
- %retval = call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1)
+ %retval = call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1)
ret <vscale x 8 x half> %retval
}
@@ -30,7 +30,7 @@ define <vscale x 16 x half> @interleave2_nxv16f16(<vscale x 8 x half> %vec0, <vs
; CHECK-NEXT: zip2 z1.h, z0.h, z1.h
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1)
+ %retval = call <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1)
ret <vscale x 16 x half> %retval
}
@@ -41,7 +41,7 @@ define <vscale x 4 x float> @interleave2_nxv4f32(<vscale x 2 x float> %vec0, <vs
; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s
; CHECK-NEXT: ret
- %retval = call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1)
+ %retval = call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1)
ret <vscale x 4 x float> %retval
}
@@ -52,7 +52,7 @@ define <vscale x 8 x float> @interleave2_nxv8f32(<vscale x 4 x float> %vec0, <vs
; CHECK-NEXT: zip2 z1.s, z0.s, z1.s
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1)
+ %retval = call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1)
ret <vscale x 8 x float> %retval
}
@@ -63,7 +63,7 @@ define <vscale x 4 x double> @interleave2_nxv4f64(<vscale x 2 x double> %vec0, <
; CHECK-NEXT: zip2 z1.d, z0.d, z1.d
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call <vscale x 4 x double>@llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %vec0, <vscale x 2 x double> %vec1)
+ %retval = call <vscale x 4 x double>@llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %vec0, <vscale x 2 x double> %vec1)
ret <vscale x 4 x double> %retval
}
@@ -76,7 +76,7 @@ define <vscale x 32 x i8> @interleave2_nxv32i8(<vscale x 16 x i8> %vec0, <vscale
; CHECK-NEXT: zip2 z1.b, z0.b, z1.b
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1)
+ %retval = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1)
ret <vscale x 32 x i8> %retval
}
@@ -87,7 +87,7 @@ define <vscale x 16 x i16> @interleave2_nxv16i16(<vscale x 8 x i16> %vec0, <vsca
; CHECK-NEXT: zip2 z1.h, z0.h, z1.h
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1)
+ %retval = call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1)
ret <vscale x 16 x i16> %retval
}
@@ -98,7 +98,7 @@ define <vscale x 8 x i32> @interleave2_nxv8i32(<vscale x 4 x i32> %vec0, <vscale
; CHECK-NEXT: zip2 z1.s, z0.s, z1.s
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1)
+ %retval = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1)
ret <vscale x 8 x i32> %retval
}
@@ -109,7 +109,7 @@ define <vscale x 4 x i64> @interleave2_nxv4i64(<vscale x 2 x i64> %vec0, <vscale
; CHECK-NEXT: zip2 z1.d, z0.d, z1.d
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1)
+ %retval = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1)
ret <vscale x 4 x i64> %retval
}
@@ -122,7 +122,7 @@ define <vscale x 32 x i1> @interleave2_nxv32i1(<vscale x 16 x i1> %vec0, <vscale
; CHECK-NEXT: zip2 p1.b, p0.b, p1.b
; CHECK-NEXT: mov p0.b, p2.b
; CHECK-NEXT: ret
- %retval = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> %vec0, <vscale x 16 x i1> %vec1)
+ %retval = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %vec0, <vscale x 16 x i1> %vec1)
ret <vscale x 32 x i1> %retval
}
@@ -133,7 +133,7 @@ define <vscale x 16 x i1> @interleave2_nxv16i1(<vscale x 8 x i1> %vec0, <vscale
; CHECK-NEXT: zip1 p0.h, p0.h, p1.h
; CHECK-NEXT: uzp1 p0.b, p0.b, p2.b
; CHECK-NEXT: ret
- %retval = call <vscale x 16 x i1> @llvm.experimental.vector.interleave2.nxv16i1(<vscale x 8 x i1> %vec0, <vscale x 8 x i1> %vec1)
+ %retval = call <vscale x 16 x i1> @llvm.vector.interleave2.nxv16i1(<vscale x 8 x i1> %vec0, <vscale x 8 x i1> %vec1)
ret <vscale x 16 x i1> %retval
}
@@ -144,7 +144,7 @@ define <vscale x 8 x i1> @interleave2_nxv8i1(<vscale x 4 x i1> %vec0, <vscale x
; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
; CHECK-NEXT: uzp1 p0.h, p0.h, p2.h
; CHECK-NEXT: ret
- %retval = call <vscale x 8 x i1> @llvm.experimental.vector.interleave2.nxv8i1(<vscale x 4 x i1> %vec0, <vscale x 4 x i1> %vec1)
+ %retval = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> %vec0, <vscale x 4 x i1> %vec1)
ret <vscale x 8 x i1> %retval
}
@@ -155,7 +155,7 @@ define <vscale x 4 x i1> @interleave2_nxv4i1(<vscale x 2 x i1> %vec0, <vscale x
; CHECK-NEXT: zip1 p0.d, p0.d, p1.d
; CHECK-NEXT: uzp1 p0.s, p0.s, p2.s
; CHECK-NEXT: ret
- %retval = call <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1> %vec0, <vscale x 2 x i1> %vec1)
+ %retval = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %vec0, <vscale x 2 x i1> %vec1)
ret <vscale x 4 x i1> %retval
}
@@ -172,7 +172,7 @@ define <vscale x 16 x i32> @interleave2_nxv16i32(<vscale x 8 x i32> %vec0, <vsca
; CHECK-NEXT: mov z1.d, z2.d
; CHECK-NEXT: mov z2.d, z4.d
; CHECK-NEXT: ret
- %retval = call <vscale x 16 x i32>@llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %vec0, <vscale x 8 x i32> %vec1)
+ %retval = call <vscale x 16 x i32>@llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %vec0, <vscale x 8 x i32> %vec1)
ret <vscale x 16 x i32> %retval
}
@@ -187,7 +187,7 @@ define <vscale x 8 x i64> @interleave2_nxv8i64(<vscale x 4 x i64> %vec0, <vscale
; CHECK-NEXT: mov z1.d, z2.d
; CHECK-NEXT: mov z2.d, z4.d
; CHECK-NEXT: ret
- %retval = call <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64> %vec0, <vscale x 4 x i64> %vec1)
+ %retval = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> %vec0, <vscale x 4 x i64> %vec1)
ret <vscale x 8 x i64> %retval
}
@@ -200,7 +200,7 @@ define <vscale x 16 x i8> @interleave2_nxv8i8(<vscale x 8 x i8> %vec0, <vscale x
; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
; CHECK-NEXT: uzp1 z0.b, z0.b, z2.b
; CHECK-NEXT: ret
- %retval = call <vscale x 16 x i8> @llvm.experimental.vector.interleave2.nxv16i8(<vscale x 8 x i8> %vec0, <vscale x 8 x i8> %vec1)
+ %retval = call <vscale x 16 x i8> @llvm.vector.interleave2.nxv16i8(<vscale x 8 x i8> %vec0, <vscale x 8 x i8> %vec1)
ret <vscale x 16 x i8> %retval
}
@@ -211,7 +211,7 @@ define <vscale x 8 x i16> @interleave2_nxv4i16(<vscale x 4 x i16> %vec0, <vscale
; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
; CHECK-NEXT: ret
- %retval = call <vscale x 8 x i16> @llvm.experimental.vector.interleave2.nxv8i16(<vscale x 4 x i16> %vec0, <vscale x 4 x i16> %vec1)
+ %retval = call <vscale x 8 x i16> @llvm.vector.interleave2.nxv8i16(<vscale x 4 x i16> %vec0, <vscale x 4 x i16> %vec1)
ret <vscale x 8 x i16> %retval
}
@@ -222,34 +222,34 @@ define <vscale x 4 x i32> @interleave2_nxv2i32(<vscale x 2 x i32> %vec0, <vscale
; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s
; CHECK-NEXT: ret
- %retval = call <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32> %vec0, <vscale x 2 x i32> %vec1)
+ %retval = call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %vec0, <vscale x 2 x i32> %vec1)
ret <vscale x 4 x i32> %retval
}
; Float declarations
-declare <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
-declare <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
+declare <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
+declare <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
+declare <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
; Integer declarations
-declare <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
; Predicated
-declare <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
-declare <vscale x 16 x i1> @llvm.experimental.vector.interleave2.nxv16i1(<vscale x 8 x i1>, <vscale x 8 x i1>)
-declare <vscale x 8 x i1> @llvm.experimental.vector.interleave2.nxv8i1(<vscale x 4 x i1>, <vscale x 4 x i1>)
-declare <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1>, <vscale x 2 x i1>)
+declare <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+declare <vscale x 16 x i1> @llvm.vector.interleave2.nxv16i1(<vscale x 8 x i1>, <vscale x 8 x i1>)
+declare <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1>, <vscale x 4 x i1>)
+declare <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1>, <vscale x 2 x i1>)
; Illegal type size
-declare <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32>, <vscale x 8 x i32>)
-declare <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
+declare <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32>, <vscale x 8 x i32>)
+declare <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
-declare <vscale x 16 x i8> @llvm.experimental.vector.interleave2.nxv16i8(<vscale x 8 x i8>, <vscale x 8 x i8>)
-declare <vscale x 8 x i16> @llvm.experimental.vector.interleave2.nxv8i16(<vscale x 4 x i16>, <vscale x 4 x i16>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32>, <vscale x 2 x i32>)
+declare <vscale x 16 x i8> @llvm.vector.interleave2.nxv16i8(<vscale x 8 x i8>, <vscale x 8 x i8>)
+declare <vscale x 8 x i16> @llvm.vector.interleave2.nxv8i16(<vscale x 4 x i16>, <vscale x 4 x i16>)
+declare <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32>, <vscale x 2 x i32>)
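And the matching rename for interleave2, the inverse of deinterleave2. A minimal sketch of the new spelling (illustrative only, not part of the patch; the function name is hypothetical):

; interleave2 zips two half-width inputs into one double-width result whose
; lanes alternate a0, b0, a1, b1, ...
define <vscale x 8 x i32> @interleave_sketch(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
  %res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
  ret <vscale x 8 x i32> %res
}
declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)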
diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-while-reversed.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-while-reversed.ll
index cb74cd8032ab..5f7476397891 100644
--- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-while-reversed.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-while-reversed.ll
@@ -16,7 +16,7 @@ define <vscale x 16 x i1> @whilege_b_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilele.nxv16i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -28,7 +28,7 @@ define <vscale x 16 x i1> @whilege_b_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilele.nxv16i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -40,7 +40,7 @@ define <vscale x 8 x i1> @whilege_h_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilele.nxv8i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -52,7 +52,7 @@ define <vscale x 8 x i1> @whilege_h_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilele.nxv8i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -64,7 +64,7 @@ define <vscale x 4 x i1> @whilege_s_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilele.nxv4i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -76,7 +76,7 @@ define <vscale x 4 x i1> @whilege_s_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilele.nxv4i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -88,7 +88,7 @@ define <vscale x 2 x i1> @whilege_d_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilele.nxv2i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -100,7 +100,7 @@ define <vscale x 2 x i1> @whilege_d_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilele.nxv2i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -116,7 +116,7 @@ define <vscale x 16 x i1> @whilehs_b_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilels.nxv16i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -128,7 +128,7 @@ define <vscale x 16 x i1> @whilehs_b_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilels.nxv16i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -140,7 +140,7 @@ define <vscale x 8 x i1> @whilehs_h_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilels.nxv8i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -152,7 +152,7 @@ define <vscale x 8 x i1> @whilehs_h_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilels.nxv8i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -164,7 +164,7 @@ define <vscale x 4 x i1> @whilehs_s_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilels.nxv4i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -176,7 +176,7 @@ define <vscale x 4 x i1> @whilehs_s_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilels.nxv4i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -188,7 +188,7 @@ define <vscale x 2 x i1> @whilehs_d_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilels.nxv2i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -200,7 +200,7 @@ define <vscale x 2 x i1> @whilehs_d_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilels.nxv2i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -214,7 +214,7 @@ define <vscale x 16 x i1> @whilegt_b_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilegt p0.b, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelt.nxv16i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -224,7 +224,7 @@ define <vscale x 16 x i1> @whilegt_b_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilegt p0.b, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelt.nxv16i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -234,7 +234,7 @@ define <vscale x 8 x i1> @whilegt_h_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilegt p0.h, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilelt.nxv8i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -244,7 +244,7 @@ define <vscale x 8 x i1> @whilegt_h_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilegt p0.h, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilelt.nxv8i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -254,7 +254,7 @@ define <vscale x 4 x i1> @whilegt_s_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilegt p0.s, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -264,7 +264,7 @@ define <vscale x 4 x i1> @whilegt_s_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilegt p0.s, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilelt.nxv4i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -274,7 +274,7 @@ define <vscale x 2 x i1> @whilegt_d_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilegt p0.d, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -284,7 +284,7 @@ define <vscale x 2 x i1> @whilegt_d_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilegt p0.d, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -298,7 +298,7 @@ define <vscale x 16 x i1> @whilehi_b_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilehi p0.b, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelo.nxv16i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -308,7 +308,7 @@ define <vscale x 16 x i1> @whilehi_b_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilehi p0.b, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelo.nxv16i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -318,7 +318,7 @@ define <vscale x 8 x i1> @whilehi_h_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilehi p0.h, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilelo.nxv8i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -328,7 +328,7 @@ define <vscale x 8 x i1> @whilehi_h_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilehi p0.h, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilelo.nxv8i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -338,7 +338,7 @@ define <vscale x 4 x i1> @whilehi_s_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilehi p0.s, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilelo.nxv4i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -348,7 +348,7 @@ define <vscale x 4 x i1> @whilehi_s_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilehi p0.s, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilelo.nxv4i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -358,7 +358,7 @@ define <vscale x 2 x i1> @whilehi_d_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilehi p0.d, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelo.nxv2i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -368,7 +368,7 @@ define <vscale x 2 x i1> @whilehi_d_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilehi p0.d, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelo.nxv2i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -384,7 +384,7 @@ define <vscale x 16 x i1> @whilele_b_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilege.nxv16i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -396,7 +396,7 @@ define <vscale x 16 x i1> @whilele_b_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilege.nxv16i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -408,7 +408,7 @@ define <vscale x 8 x i1> @whilele_h_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilege.nxv8i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -420,7 +420,7 @@ define <vscale x 8 x i1> @whilele_h_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilege.nxv8i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -432,7 +432,7 @@ define <vscale x 4 x i1> @whilele_s_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilege.nxv4i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -444,7 +444,7 @@ define <vscale x 4 x i1> @whilele_s_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilege.nxv4i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -456,7 +456,7 @@ define <vscale x 2 x i1> @whilele_d_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilege.nxv2i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -468,7 +468,7 @@ define <vscale x 2 x i1> @whilele_d_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilege.nxv2i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -482,7 +482,7 @@ define <vscale x 16 x i1> @whilelo_b_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilelo p0.b, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilehi.nxv16i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -492,7 +492,7 @@ define <vscale x 16 x i1> @whilelo_b_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilelo p0.b, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilehi.nxv16i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -502,7 +502,7 @@ define <vscale x 8 x i1> @whilelo_h_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilelo p0.h, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilehi.nxv8i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -512,7 +512,7 @@ define <vscale x 8 x i1> @whilelo_h_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilelo p0.h, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilehi.nxv8i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -522,7 +522,7 @@ define <vscale x 4 x i1> @whilelo_s_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilelo p0.s, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilehi.nxv4i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -532,7 +532,7 @@ define <vscale x 4 x i1> @whilelo_s_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilelo p0.s, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilehi.nxv4i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -542,7 +542,7 @@ define <vscale x 2 x i1> @whilelo_d_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilelo p0.d, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilehi.nxv2i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -552,7 +552,7 @@ define <vscale x 2 x i1> @whilelo_d_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilelo p0.d, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilehi.nxv2i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -568,7 +568,7 @@ define <vscale x 16 x i1> @whilels_b_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilehs.nxv16i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -580,7 +580,7 @@ define <vscale x 16 x i1> @whilels_b_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilehs.nxv16i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -592,7 +592,7 @@ define <vscale x 8 x i1> @whilels_h_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilehs.nxv8i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -604,7 +604,7 @@ define <vscale x 8 x i1> @whilels_h_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilehs.nxv8i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -616,7 +616,7 @@ define <vscale x 4 x i1> @whilels_s_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilehs.nxv4i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -628,7 +628,7 @@ define <vscale x 4 x i1> @whilels_s_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilehs.nxv4i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -640,7 +640,7 @@ define <vscale x 2 x i1> @whilels_d_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilehs.nxv2i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -652,7 +652,7 @@ define <vscale x 2 x i1> @whilels_d_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilehs.nxv2i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -666,7 +666,7 @@ define <vscale x 16 x i1> @whilelt_b_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilelt p0.b, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilegt.nxv16i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -676,7 +676,7 @@ define <vscale x 16 x i1> @whilelt_b_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilelt p0.b, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilegt.nxv16i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -686,7 +686,7 @@ define <vscale x 8 x i1> @whilelt_h_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilelt p0.h, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilegt.nxv8i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -696,7 +696,7 @@ define <vscale x 8 x i1> @whilelt_h_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilelt p0.h, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilegt.nxv8i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -706,7 +706,7 @@ define <vscale x 4 x i1> @whilelt_s_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilelt p0.s, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilegt.nxv4i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -716,7 +716,7 @@ define <vscale x 4 x i1> @whilelt_s_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilelt p0.s, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilegt.nxv4i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -726,7 +726,7 @@ define <vscale x 2 x i1> @whilelt_d_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilelt p0.d, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilegt.nxv2i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -736,6 +736,6 @@ define <vscale x 2 x i1> @whilelt_d_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilelt p0.d, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilegt.nxv2i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index b7b2cb22c1b6..9d4f9434aa31 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -142,8 +142,8 @@ attributes #0 = { nounwind }
; GCN: amdpal.pipelines:
; GCN-NEXT: - .registers:
-; GCN-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}}
-; GCN-NEXT: 0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}}
+; GCN-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
+; GCN-NEXT: '0x2e13 (COMPUTE_PGM_RSRC2)': 0x8001{{$}}
; GCN-NEXT: .shader_functions:
; GCN-NEXT: dynamic_stack:
; GCN-NEXT: .backend_stack_size: 0x10{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll
index 98aa04f6d26e..a3fd2a942bc2 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll
@@ -11,8 +11,8 @@
; GCN-NEXT: .entry_point: cs_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
-; GCN-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1):
-; GCN-NEXT: 0x2e13 (COMPUTE_PGM_RSRC2):
+; GCN-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)':
+; GCN-NEXT: '0x2e13 (COMPUTE_PGM_RSRC2)':
; GCN-NEXT: ...
; GCN-NEXT: .end_amdgpu_pal_metadata
define amdgpu_cs half @cs_amdpal(half %arg0) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-es.ll b/llvm/test/CodeGen/AMDGPU/amdpal-es.ll
index 012b2061756b..679e0858819e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-es.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-es.ll
@@ -10,7 +10,7 @@
; GCN-NEXT: .entry_point: es_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
-; GCN-NEXT: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0
+; GCN-NEXT: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0
; GCN-NEXT: ...
; GCN-NEXT: .end_amdgpu_pal_metadata
define amdgpu_es half @es_amdpal(half %arg0) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll
index e2f67398d18a..75f7a1dc266d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll
@@ -11,7 +11,7 @@
; GCN-NEXT: .entry_point: gs_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
-; GCN-NEXT: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0
+; GCN-NEXT: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0
; GCN-NEXT: ...
; GCN-NEXT: .end_amdgpu_pal_metadata
define amdgpu_gs half @gs_amdpal(half %arg0) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll
index 9ad47c1d604f..c61578a967b6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll
@@ -11,7 +11,7 @@
; GCN-NEXT: .entry_point: hs_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
-; GCN-NEXT: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0
+; GCN-NEXT: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0
; GCN-NEXT: ...
; GCN-NEXT: .end_amdgpu_pal_metadata
define amdgpu_hs half @hs_amdpal(half %arg0) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll b/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll
index 8ee6f7283ce7..8162c824dc2c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll
@@ -10,7 +10,7 @@
; GCN-NEXT: .entry_point: ls_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
-; GCN-NEXT: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0
+; GCN-NEXT: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0
; GCN-NEXT: ...
; GCN-NEXT: .end_amdgpu_pal_metadata
define amdgpu_ls half @ls_amdpal(half %arg0) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-cs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-cs.ll
index 0d0c70c38ace..5e21ba494df1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-cs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-cs.ll
@@ -5,7 +5,7 @@
; amdpal compute shader: check for 0x2e12 (COMPUTE_PGM_RSRC1) in pal metadata
; GCN-LABEL: {{^}}cs_amdpal:
; GCN: .amdgpu_pal_metadata
-; GCN: 0x2e12 (COMPUTE_PGM_RSRC1)
+; GCN: '0x2e12 (COMPUTE_PGM_RSRC1)'
define amdgpu_cs half @cs_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-default.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-default.ll
index b82e3ebdde4b..dc9a33ac0141 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-default.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-default.ll
@@ -3,45 +3,45 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -enable-var-scope %s
; amdpal compute shader: check for 0x2e12 (COMPUTE_PGM_RSRC1) in pal metadata
-; SI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0x2f0000{{$}}
-; VI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0x2f02c0{{$}}
-; GFX9-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0x2f0000{{$}}
+; SI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x2f0000{{$}}
+; VI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x2f02c0{{$}}
+; GFX9-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x2f0000{{$}}
define amdgpu_cs half @cs_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal evaluation shader: check for 0x2cca (SPI_SHADER_PGM_RSRC1_ES) in pal metadata
-; SI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0x2f0000{{$}}
-; VI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0x2f02c0{{$}}
-; GFX9-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0x2f0000{{$}}
+; SI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0x2f0000{{$}}
+; VI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0x2f02c0{{$}}
+; GFX9-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0x2f0000{{$}}
define amdgpu_es half @es_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal geometry shader: check for 0x2c8a (SPI_SHADER_PGM_RSRC1_GS) in pal metadata
-; SI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0x2f0000{{$}}
-; VI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0x2f02c0{{$}}
-; GFX9-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0x2f0000{{$}}
+; SI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0x2f0000{{$}}
+; VI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0x2f02c0{{$}}
+; GFX9-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0x2f0000{{$}}
define amdgpu_gs half @gs_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal hull shader: check for 0x2d0a (SPI_SHADER_PGM_RSRC1_HS) in pal metadata
-; SI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0x2f0000{{$}}
-; VI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0x2f02c0{{$}}
-; GFX9-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0x2f0000{{$}}
+; SI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0x2f0000{{$}}
+; VI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0x2f02c0{{$}}
+; GFX9-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0x2f0000{{$}}
define amdgpu_hs half @hs_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal load shader: check for 0x2d4a (SPI_SHADER_PGM_RSRC1_LS) in pal metadata
-; SI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0x2f0000{{$}}
-; VI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0x2f02c0{{$}}
-; GFX9-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0x2f0000{{$}}
+; SI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0x2f0000{{$}}
+; VI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0x2f02c0{{$}}
+; GFX9-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0x2f0000{{$}}
define amdgpu_ls half @ls_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
@@ -49,18 +49,18 @@ define amdgpu_ls half @ls_amdpal(half %arg0) {
; amdpal pixel shader: check for 0x2c0a (SPI_SHADER_PGM_RSRC1_PS) in pal metadata
; below.
-; SI-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0x2f0000{{$}}
-; VI-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0x2f02c0{{$}}
-; GFX9-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0x2f0000{{$}}
+; SI-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0x2f0000{{$}}
+; VI-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0x2f02c0{{$}}
+; GFX9-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0x2f0000{{$}}
define amdgpu_ps half @ps_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal vertex shader: check for 45352 (SPI_SHADER_PGM_RSRC1_VS) in pal metadata
-; SI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0x2f0000{{$}}
-; VI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0x2f02c0{{$}}
-; GFX9-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0x2f0000{{$}}
+; SI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0x2f0000{{$}}
+; VI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0x2f02c0{{$}}
+; GFX9-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0x2f0000{{$}}
define amdgpu_vs half @vs_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
@@ -75,7 +75,7 @@ define amdgpu_vs half @vs_amdpal(half %arg0) {
; - 0x123456789abcdef0
; - 0xfedcba9876543210
; .registers:
-; 0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42000000
+; '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42000000
; ...
; .end_amdgpu_pal_metadata
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-denormal.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-denormal.ll
index b86b42868005..ffce3ed08509 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-denormal.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-denormal.ll
@@ -3,45 +3,45 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -enable-var-scope %s
; amdpal compute shader: check for 0x2e12 (COMPUTE_PGM_RSRC1) in pal metadata
-; SI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0x2c0000{{$}}
-; VI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0x2c02c0{{$}}
-; GFX9-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0x2c0000{{$}}
+; SI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x2c0000{{$}}
+; VI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x2c02c0{{$}}
+; GFX9-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x2c0000{{$}}
define amdgpu_cs half @cs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal evaluation shader: check for 0x2cca (SPI_SHADER_PGM_RSRC1_ES) in pal metadata
-; SI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0x2c0000{{$}}
-; VI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0x2c02c0{{$}}
-; GFX9-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0x2c0000{{$}}
+; SI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0x2c0000{{$}}
+; VI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0x2c02c0{{$}}
+; GFX9-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0x2c0000{{$}}
define amdgpu_es half @es_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal geometry shader: check for 0x2c8a (SPI_SHADER_PGM_RSRC1_GS) in pal metadata
-; SI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0x2c0000{{$}}
-; VI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0x2c02c0{{$}}
-; GFX9-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0x2c0000{{$}}
+; SI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0x2c0000{{$}}
+; VI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0x2c02c0{{$}}
+; GFX9-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0x2c0000{{$}}
define amdgpu_gs half @gs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal hull shader: check for 0x2d0a (SPI_SHADER_PGM_RSRC1_HS) in pal metadata
-; SI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0x2c0000{{$}}
-; VI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0x2c02c0{{$}}
-; GFX9-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0x2c0000{{$}}
+; SI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0x2c0000{{$}}
+; VI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0x2c02c0{{$}}
+; GFX9-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0x2c0000{{$}}
define amdgpu_hs half @hs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal load shader: check for 0x2d4a (SPI_SHADER_PGM_RSRC1_LS) in pal metadata
-; SI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0x2c0000{{$}}
-; VI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0x2c02c0{{$}}
-; GFX9-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0x2c0000{{$}}
+; SI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0x2c0000{{$}}
+; VI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0x2c02c0{{$}}
+; GFX9-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0x2c0000{{$}}
define amdgpu_ls half @ls_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
@@ -49,18 +49,18 @@ define amdgpu_ls half @ls_amdpal(half %arg0) #0 {
; amdpal pixel shader: check for 0x2c0a (SPI_SHADER_PGM_RSRC1_PS) in pal metadata
; below.
-; SI-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0x2c0000{{$}}
-; VI-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0x2c02c0{{$}}
-; GFX9-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0x2c0000{{$}}
+; SI-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0x2c0000{{$}}
+; VI-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0x2c02c0{{$}}
+; GFX9-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0x2c0000{{$}}
define amdgpu_ps half @ps_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal vertex shader: check for 45352 (SPI_SHADER_PGM_RSRC1_VS) in pal metadata
-; SI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0x2c0000{{$}}
-; VI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0x2c02c0{{$}}
-; GFX9-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0x2c0000{{$}}
+; SI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0x2c0000{{$}}
+; VI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0x2c02c0{{$}}
+; GFX9-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0x2c0000{{$}}
define amdgpu_vs half @vs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
@@ -77,7 +77,7 @@ attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
; - 0x123456789abcdef0
; - 0xfedcba9876543210
; .registers:
-; 0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42000000
+; '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42000000
; ...
; .end_amdgpu_pal_metadata
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-dx10-clamp.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-dx10-clamp.ll
index b1db7aafacab..3ea3064fa743 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-dx10-clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-dx10-clamp.ll
@@ -3,45 +3,45 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -enable-var-scope %s
; amdpal compute shader: check for 0x2e12 (COMPUTE_PGM_RSRC1) in pal metadata
-; SI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0xf0000{{$}}
-; VI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0xf02c0{{$}}
-; GFX9-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0xf0000{{$}}
+; SI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xf0000{{$}}
+; VI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xf02c0{{$}}
+; GFX9-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xf0000{{$}}
define amdgpu_cs half @cs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal evaluation shader: check for 0x2cca (SPI_SHADER_PGM_RSRC1_ES) in pal metadata
-; SI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0xf0000{{$}}
-; VI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0xf02c0{{$}}
-; GFX9-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0xf0000{{$}}
+; SI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0xf0000{{$}}
+; VI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0xf02c0{{$}}
+; GFX9-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0xf0000{{$}}
define amdgpu_es half @es_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal geometry shader: check for 0x2c8a (SPI_SHADER_PGM_RSRC1_GS) in pal metadata
-; SI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0xf0000{{$}}
-; VI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0xf02c0{{$}}
-; GFX9-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0xf0000{{$}}
+; SI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0xf0000{{$}}
+; VI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0xf02c0{{$}}
+; GFX9-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0xf0000{{$}}
define amdgpu_gs half @gs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal hull shader: check for 0x2d0a (SPI_SHADER_PGM_RSRC1_HS) in pal metadata
-; SI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0xf0000{{$}}
-; VI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0xf02c0{{$}}
-; GFX9-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0xf0000{{$}}
+; SI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0xf0000{{$}}
+; VI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0xf02c0{{$}}
+; GFX9-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0xf0000{{$}}
define amdgpu_hs half @hs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal load shader: check for 0x2d4a (SPI_SHADER_PGM_RSRC1_LS) in pal metadata
-; SI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0xf0000{{$}}
-; VI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0xf02c0{{$}}
-; GFX9-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0xf0000{{$}}
+; SI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0xf0000{{$}}
+; VI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0xf02c0{{$}}
+; GFX9-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0xf0000{{$}}
define amdgpu_ls half @ls_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
@@ -49,18 +49,18 @@ define amdgpu_ls half @ls_amdpal(half %arg0) #0 {
; amdpal pixel shader: check for 0x2c0a (SPI_SHADER_PGM_RSRC1_PS) in pal metadata
; below.
-; SI-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0xf0000{{$}}
-; VI-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0xf02c0{{$}}
-; GFX9-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0xf0000{{$}}
+; SI-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0xf0000{{$}}
+; VI-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0xf02c0{{$}}
+; GFX9-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0xf0000{{$}}
define amdgpu_ps half @ps_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal vertex shader: check for 45352 (SPI_SHADER_PGM_RSRC1_VS) in pal metadata
-; SI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0xf0000{{$}}
-; VI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0xf02c0{{$}}
-; GFX9-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0xf0000{{$}}
+; SI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0xf0000{{$}}
+; VI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0xf02c0{{$}}
+; GFX9-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0xf0000{{$}}
define amdgpu_vs half @vs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
@@ -77,7 +77,7 @@ attributes #0 = { "amdgpu-dx10-clamp"="false" }
; - 0x123456789abcdef0
; - 0xfedcba9876543210
; .registers:
-; 0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42000000
+; '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42000000
; ...
; .end_amdgpu_pal_metadata
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-es.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-es.ll
index f97117f3d909..bcc8da6e1bf4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-es.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-es.ll
@@ -4,7 +4,7 @@
; amdpal evaluation shader: check for 0x2cca (SPI_SHADER_PGM_RSRC1_ES) in pal metadata
; GCN-LABEL: {{^}}es_amdpal:
; GCN: .amdgpu_pal_metadata
-; GCN: 0x2cca (SPI_SHADER_PGM_RSRC1_ES)
+; GCN: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)'
define amdgpu_es half @es_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-gs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-gs.ll
index a32d10390b98..ef4c9cbd5006 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-gs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-gs.ll
@@ -5,7 +5,7 @@
; amdpal geometry shader: check for 0x2c8a (SPI_SHADER_PGM_RSRC1_GS) in pal metadata
; GCN-LABEL: {{^}}gs_amdpal:
; GCN: .amdgpu_pal_metadata
-; GCN: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS)
+; GCN: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)'
define amdgpu_gs half @gs_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-hs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-hs.ll
index be08c93cdb31..eb814c11bceb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-hs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-hs.ll
@@ -5,7 +5,7 @@
; amdpal hull shader: check for 0x2d0a (SPI_SHADER_PGM_RSRC1_HS) in pal metadata
; GCN-LABEL: {{^}}hs_amdpal:
; GCN: .amdgpu_pal_metadata
-; GCN: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS)
+; GCN: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)'
define amdgpu_hs half @hs_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll
index 95d533544c30..d4826a22db79 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll
@@ -4,50 +4,50 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 -enable-var-scope %s
; amdpal compute shader: check for 0x2e12 (COMPUTE_PGM_RSRC1) in pal metadata
-; SI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf0000{{$}}
-; VI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf02c0{{$}}
-; GFX9-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf0000{{$}}
-; GFX12-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0x600f0000{{$}}
+; SI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf0000{{$}}
+; VI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf02c0{{$}}
+; GFX9-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf0000{{$}}
+; GFX12-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x600f0000{{$}}
define amdgpu_cs half @cs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal evaluation shader: check for 0x2cca (SPI_SHADER_PGM_RSRC1_ES) in pal metadata
-; SI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0xaf0000{{$}}
-; VI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0xaf02c0{{$}}
-; GFX9-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0xaf0000{{$}}
-; GFX12-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0xf0000{{$}}
+; SI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0xaf0000{{$}}
+; VI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0xaf02c0{{$}}
+; GFX9-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0xaf0000{{$}}
+; GFX12-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0xf0000{{$}}
define amdgpu_es half @es_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal geometry shader: check for 0x2c8a (SPI_SHADER_PGM_RSRC1_GS) in pal metadata
-; SI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0xaf0000{{$}}
-; VI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0xaf02c0{{$}}
-; GFX9-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0xaf0000{{$}}
-; GFX12-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0xa0f0000{{$}}
+; SI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0xaf0000{{$}}
+; VI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0xaf02c0{{$}}
+; GFX9-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0xaf0000{{$}}
+; GFX12-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0xa0f0000{{$}}
define amdgpu_gs half @gs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal hull shader: check for 0x2d0a (SPI_SHADER_PGM_RSRC1_HS) in pal metadata
-; SI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0xaf0000{{$}}
-; VI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0xaf02c0{{$}}
-; GFX9-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0xaf0000{{$}}
-; GFX12-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0x50f0000{{$}}
+; SI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0xaf0000{{$}}
+; VI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0xaf02c0{{$}}
+; GFX9-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0xaf0000{{$}}
+; GFX12-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0x50f0000{{$}}
define amdgpu_hs half @hs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal load shader: check for 0x2d4a (SPI_SHADER_PGM_RSRC1_LS) in pal metadata
-; SI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0xaf0000{{$}}
-; VI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0xaf02c0{{$}}
-; GFX9-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0xaf0000{{$}}
-; GFX12-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0xf0000{{$}}
+; SI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0xaf0000{{$}}
+; VI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0xaf02c0{{$}}
+; GFX9-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0xaf0000{{$}}
+; GFX12-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0xf0000{{$}}
define amdgpu_ls half @ls_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
@@ -55,20 +55,20 @@ define amdgpu_ls half @ls_amdpal(half %arg0) #0 {
; amdpal pixel shader: check for 0x2c0a (SPI_SHADER_PGM_RSRC1_PS) in pal metadata
; below.
-; SI-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0xaf0000{{$}}
-; VI-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0xaf02c0{{$}}
-; GFX9-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0xaf0000{{$}}
-; GFX12-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0x20f0000{{$}}
+; SI-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0xaf0000{{$}}
+; VI-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0xaf02c0{{$}}
+; GFX9-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0xaf0000{{$}}
+; GFX12-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0x20f0000{{$}}
define amdgpu_ps half @ps_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal vertex shader: check for 45352 (SPI_SHADER_PGM_RSRC1_VS) in pal metadata
-; SI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0xaf0000{{$}}
-; VI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0xaf02c0{{$}}
-; GFX9-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0xaf0000{{$}}
-; GFX12-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0x80f0000{{$}}
+; SI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0xaf0000{{$}}
+; VI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0xaf02c0{{$}}
+; GFX9-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0xaf0000{{$}}
+; GFX12-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0x80f0000{{$}}
define amdgpu_vs half @vs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
@@ -85,7 +85,7 @@ attributes #0 = { "amdgpu-ieee"="true" }
; - 0x123456789abcdef0
; - 0xfedcba9876543210
; .registers:
-; 0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42000000
+; '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42000000
; ...
; .end_amdgpu_pal_metadata
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ls.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ls.ll
index 46097fa20608..0d81e70b2e4f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ls.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ls.ll
@@ -4,7 +4,7 @@
; amdpal load shader: check for 0x2d4a (SPI_SHADER_PGM_RSRC1_LS) in pal metadata
; GCN-LABEL: {{^}}ls_amdpal:
; GCN: .amdgpu_pal_metadata
-; GCN: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS)
+; GCN: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)'
define amdgpu_ls half @ls_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ps.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ps.ll
index 9169c651f129..d31732f995b1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ps.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ps.ll
@@ -12,8 +12,8 @@
; GCN-NEXT: - 0x123456789abcdef0
; GCN-NEXT: - 0xfedcba9876543210
; GCN: .registers:
-; GCN: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS):
-; GCN: 0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42
+; GCN: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)':
+; GCN: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42
define amdgpu_ps half @ps_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
@@ -23,12 +23,12 @@ define amdgpu_ps half @ps_amdpal(half %arg0) {
;
; .amdgpu_pal_metadata
; ---
-; amdpal.pipelines:
+; amdpal.pipelines:
; - .internal_pipeline_hash:
; - 0x123456789abcdef0
; - 0xfedcba9876543210
; .registers:
-; 0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42000000
+; '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42000000
; ...
; .end_amdgpu_pal_metadata
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-psenable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-psenable.ll
index d6322e2b4d3e..15b1a652077e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-psenable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-psenable.ll
@@ -7,8 +7,8 @@
; the workaround that ensures that an interpolation mode is also set in PSEnable.
; GCN-LABEL: {{^}}amdpal_psenable:
; GCN: .amdgpu_pal_metadata
-; GCN: 0xa1b3 (SPI_PS_INPUT_ENA): 0x2
-; GCN: 0xa1b4 (SPI_PS_INPUT_ADDR): 0x2
+; GCN: '0xa1b3 (SPI_PS_INPUT_ENA)': 0x2
+; GCN: '0xa1b4 (SPI_PS_INPUT_ADDR)': 0x2
define amdgpu_ps void @amdpal_psenable(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <2 x float> %pos) #6 {
%inst23 = extractelement <2 x float> %pos, i32 0
%inst24 = extractelement <2 x float> %pos, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-vs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-vs.ll
index 7c47129c28ce..42de6007f7e2 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-vs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-vs.ll
@@ -5,7 +5,7 @@
; amdpal vertex shader: check for 45352 (SPI_SHADER_PGM_RSRC1_VS) in pal metadata
; GCN-LABEL: {{^}}vs_amdpal:
; GCN: .amdgpu_pal_metadata
-; GCN: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS)
+; GCN: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)'
define amdgpu_vs half @vs_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll
index 13d2050c491f..ace21207a7eb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll
@@ -14,10 +14,10 @@
; GCN-NEXT: .entry_point: amdpal_psenable
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
-; GCN-NEXT: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS):
-; GCN-NEXT: 0x2c0b (SPI_SHADER_PGM_RSRC2_PS):
-; GCN-NEXT: 0xa1b3 (SPI_PS_INPUT_ENA): 0x2
-; GCN-NEXT: 0xa1b4 (SPI_PS_INPUT_ADDR): 0x2
+; GCN-NEXT: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)':
+; GCN-NEXT: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)':
+; GCN-NEXT: '0xa1b3 (SPI_PS_INPUT_ENA)': 0x2
+; GCN-NEXT: '0xa1b4 (SPI_PS_INPUT_ADDR)': 0x2
; GCN-NEXT: ...
; GCN-NEXT: .end_amdgpu_pal_metadata
define amdgpu_ps void @amdpal_psenable(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <2 x float> %pos) #6 {
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-usersgpr-init.ll b/llvm/test/CodeGen/AMDGPU/amdpal-usersgpr-init.ll
index 52a9d57244c2..086a126b1ddc 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-usersgpr-init.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-usersgpr-init.ll
@@ -1,7 +1,7 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
; We want to make sure that RSRC2 is left untouched
-; GCN: 0x2e13 (COMPUTE_PGM_RSRC2): 0x78a
+; GCN: '0x2e13 (COMPUTE_PGM_RSRC2)': 0x78a
define amdgpu_cs half @cs_amdpal(half %arg0, half inreg %arg1) {
%add = fadd half %arg0, 1.0
ret half %add
@@ -9,4 +9,4 @@ define amdgpu_cs half @cs_amdpal(half %arg0, half inreg %arg1) {
!amdgpu.pal.metadata.msgpack = !{!0}
-!0 = !{!"\82\B0amdpal.pipelines\91\89\A4.api\A6Vulkan\B0.hardware_stages\81\A3.cs\83\AB.sgpr_limith\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E83\B3\C2\D1)\7FG\CF[\8A\DF\EE[\7FD,\AA.registers\8A\CD.\07\01\CD.\08\01\CD.\09\01\CD.\12\CE@,\00\00\CD.\13\CD\07\8A\CD.(\00\CD.*\CE\16\0B\22Y\CD.@\CE\10\00\00\00\CD.B\CE\10\00\00\06\CD.D\00\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\D3s\A6\8D\C5x\84\D4\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\E5\A0\EB\F9}\C6\C1\13\CF\1A_\E7\F7\F2.mR\AD.llpc_version\A454.5\AEamdpal.version\92\02\03"} \ No newline at end of file
+!0 = !{!"\82\B0amdpal.pipelines\91\89\A4.api\A6Vulkan\B0.hardware_stages\81\A3.cs\83\AB.sgpr_limith\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E83\B3\C2\D1)\7FG\CF[\8A\DF\EE[\7FD,\AA.registers\8A\CD.\07\01\CD.\08\01\CD.\09\01\CD.\12\CE@,\00\00\CD.\13\CD\07\8A\CD.(\00\CD.*\CE\16\0B\22Y\CD.@\CE\10\00\00\00\CD.B\CE\10\00\00\06\CD.D\00\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\D3s\A6\8D\C5x\84\D4\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\E5\A0\EB\F9}\C6\C1\13\CF\1A_\E7\F7\F2.mR\AD.llpc_version\A454.5\AEamdpal.version\92\02\03"}
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll
index ec8f698d69c2..c300ba187740 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll
@@ -11,7 +11,7 @@
; GCN-NEXT: .entry_point: vs_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
-; GCN-NEXT: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0
+; GCN-NEXT: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0
; GCN-NEXT: ...
; GCN-NEXT: .end_amdgpu_pal_metadata
define amdgpu_vs half @vs_amdpal(half %arg0) {
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index cf99b5d80e13..b2f9bf89d9ec 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -282,21 +282,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v16
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v14
; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v4
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v10, v4, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v6, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v10, v4, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5
+; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1
+; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
; GFX9-O0-NEXT: v_xor_b32_e64 v1, v5, v1
@@ -312,21 +312,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr7_vgpr8 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12
; GFX9-O0-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v3
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v8, v3, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v8, v5, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v7, v3, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v2, v5, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7
+; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7
; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, v6
; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v4
; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
@@ -339,18 +339,26 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2
; GFX9-O0-NEXT: v_or_b32_e64 v3, v8, v7
@@ -403,7 +411,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[6:7]
+; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
@@ -439,7 +448,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[6:7]
+; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
@@ -690,10 +700,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
@@ -903,14 +913,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(9)
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
@@ -1028,10 +1038,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
index e376c3df1ac9..96ec90b1f4d0 100644
--- a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
@@ -5,12 +5,12 @@
; Check EXTRA_LDS_SIZE in SPI_SHADER_PGM_RSRC2_PS.
-; GFX10-PAL: 0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x800
+; GFX10-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x800
; GFX10-MESA: .long 45100
; GFX10-MESA-NEXT: .long 2048
-; GFX11-PAL: 0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x400
+; GFX11-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x400
; GFX11-MESA: .long 45100
; GFX11-MESA-NEXT: .long 1024
diff --git a/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp32_to_bf16.ll b/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp32_to_bf16.ll
new file mode 100644
index 000000000000..2ccc0337b8ae
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp32_to_bf16.ll
@@ -0,0 +1,481 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s -o - | FileCheck %s
+
+define void @scalar(float %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: scalar:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v3, v2
+; CHECK-NEXT: v_mov_b32_e32 v2, v1
+; CHECK-NEXT: v_bfe_u32 v1, v0, 16, 1
+; CHECK-NEXT: s_movk_i32 s4, 0x7fff
+; CHECK-NEXT: v_add3_u32 v1, v1, v0, s4
+; CHECK-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
+; CHECK-NEXT: global_store_short_d16_hi v[2:3], v0, off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %conv = fptrunc float %num to bfloat
+ store bfloat %conv, ptr addrspace(1) %p, align 8
+ ret void
+}
+
+define void @v2(<2 x float> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_bfe_u32 v4, v0, 16, 1
+; CHECK-NEXT: s_movk_i32 s4, 0x7fff
+; CHECK-NEXT: v_add3_u32 v4, v4, v0, s4
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; CHECK-NEXT: v_bfe_u32 v4, v1, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v1, s4
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
+; CHECK-NEXT: s_mov_b32 s4, 0x7060302
+; CHECK-NEXT: v_perm_b32 v0, v1, v0, s4
+; CHECK-NEXT: global_store_dword v[2:3], v0, off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %conv = fptrunc <2 x float> %num to <2 x bfloat>
+ store <2 x bfloat> %conv, ptr addrspace(1) %p, align 8
+ ret void
+}
+
+define void @v3(<3 x float> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v3:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v5, v4
+; CHECK-NEXT: v_mov_b32_e32 v4, v3
+; CHECK-NEXT: v_bfe_u32 v3, v0, 16, 1
+; CHECK-NEXT: s_movk_i32 s4, 0x7fff
+; CHECK-NEXT: v_add3_u32 v3, v3, v0, s4
+; CHECK-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
+; CHECK-NEXT: v_bfe_u32 v3, v1, 16, 1
+; CHECK-NEXT: v_add3_u32 v3, v3, v1, s4
+; CHECK-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; CHECK-NEXT: s_mov_b32 s5, 0x7060302
+; CHECK-NEXT: v_perm_b32 v0, v1, v0, s5
+; CHECK-NEXT: v_bfe_u32 v1, v2, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v2, s4
+; CHECK-NEXT: v_or_b32_e32 v3, 0x400000, v2
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; CHECK-NEXT: global_store_short_d16_hi v[4:5], v1, off offset:4
+; CHECK-NEXT: global_store_dword v[4:5], v0, off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %conv = fptrunc <3 x float> %num to <3 x bfloat>
+ store <3 x bfloat> %conv, ptr addrspace(1) %p, align 8
+ ret void
+}
+
+define void @v4(<4 x float> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v4:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_bfe_u32 v6, v2, 16, 1
+; CHECK-NEXT: s_movk_i32 s4, 0x7fff
+; CHECK-NEXT: v_add3_u32 v6, v6, v2, s4
+; CHECK-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
+; CHECK-NEXT: v_bfe_u32 v6, v3, 16, 1
+; CHECK-NEXT: v_add3_u32 v6, v6, v3, s4
+; CHECK-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; CHECK-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
+; CHECK-NEXT: s_mov_b32 s5, 0x7060302
+; CHECK-NEXT: v_perm_b32 v3, v3, v2, s5
+; CHECK-NEXT: v_bfe_u32 v2, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v2, v2, v0, s4
+; CHECK-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc
+; CHECK-NEXT: v_bfe_u32 v2, v1, 16, 1
+; CHECK-NEXT: v_add3_u32 v2, v2, v1, s4
+; CHECK-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc
+; CHECK-NEXT: v_perm_b32 v2, v1, v0, s5
+; CHECK-NEXT: global_store_dwordx2 v[4:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %conv = fptrunc <4 x float> %num to <4 x bfloat>
+ store <4 x bfloat> %conv, ptr addrspace(1) %p, align 8
+ ret void
+}
+
+define void @v8(<8 x float> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_bfe_u32 v10, v6, 16, 1
+; CHECK-NEXT: s_movk_i32 s4, 0x7fff
+; CHECK-NEXT: v_add3_u32 v10, v10, v6, s4
+; CHECK-NEXT: v_or_b32_e32 v11, 0x400000, v6
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; CHECK-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc
+; CHECK-NEXT: v_bfe_u32 v10, v7, 16, 1
+; CHECK-NEXT: v_add3_u32 v10, v10, v7, s4
+; CHECK-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; CHECK-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
+; CHECK-NEXT: s_mov_b32 s5, 0x7060302
+; CHECK-NEXT: v_perm_b32 v7, v7, v6, s5
+; CHECK-NEXT: v_bfe_u32 v6, v4, 16, 1
+; CHECK-NEXT: v_add3_u32 v6, v6, v4, s4
+; CHECK-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v6, v10, vcc
+; CHECK-NEXT: v_bfe_u32 v6, v5, 16, 1
+; CHECK-NEXT: v_add3_u32 v6, v6, v5, s4
+; CHECK-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v5, v6, v10, vcc
+; CHECK-NEXT: v_perm_b32 v6, v5, v4, s5
+; CHECK-NEXT: v_bfe_u32 v4, v2, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v2, s4
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; CHECK-NEXT: v_bfe_u32 v4, v3, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v3, s4
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; CHECK-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; CHECK-NEXT: v_perm_b32 v5, v3, v2, s5
+; CHECK-NEXT: v_bfe_u32 v2, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v2, v2, v0, s4
+; CHECK-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; CHECK-NEXT: v_bfe_u32 v2, v1, 16, 1
+; CHECK-NEXT: v_add3_u32 v2, v2, v1, s4
+; CHECK-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; CHECK-NEXT: v_perm_b32 v4, v1, v0, s5
+; CHECK-NEXT: global_store_dwordx4 v[8:9], v[4:7], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %conv = fptrunc <8 x float> %num to <8 x bfloat>
+ store <8 x bfloat> %conv, ptr addrspace(1) %p, align 8
+ ret void
+}
+
+define void @v16(<16 x float> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_bfe_u32 v18, v6, 16, 1
+; CHECK-NEXT: s_movk_i32 s4, 0x7fff
+; CHECK-NEXT: v_add3_u32 v18, v18, v6, s4
+; CHECK-NEXT: v_or_b32_e32 v19, 0x400000, v6
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; CHECK-NEXT: v_cndmask_b32_e32 v6, v18, v19, vcc
+; CHECK-NEXT: v_bfe_u32 v18, v7, 16, 1
+; CHECK-NEXT: v_add3_u32 v18, v18, v7, s4
+; CHECK-NEXT: v_or_b32_e32 v19, 0x400000, v7
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; CHECK-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc
+; CHECK-NEXT: s_mov_b32 s5, 0x7060302
+; CHECK-NEXT: v_perm_b32 v7, v7, v6, s5
+; CHECK-NEXT: v_bfe_u32 v6, v4, 16, 1
+; CHECK-NEXT: v_add3_u32 v6, v6, v4, s4
+; CHECK-NEXT: v_or_b32_e32 v18, 0x400000, v4
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v6, v18, vcc
+; CHECK-NEXT: v_bfe_u32 v6, v5, 16, 1
+; CHECK-NEXT: v_add3_u32 v6, v6, v5, s4
+; CHECK-NEXT: v_or_b32_e32 v18, 0x400000, v5
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v5, v6, v18, vcc
+; CHECK-NEXT: v_perm_b32 v6, v5, v4, s5
+; CHECK-NEXT: v_bfe_u32 v4, v2, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v2, s4
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; CHECK-NEXT: v_bfe_u32 v4, v3, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v3, s4
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; CHECK-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; CHECK-NEXT: v_perm_b32 v5, v3, v2, s5
+; CHECK-NEXT: v_bfe_u32 v2, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v2, v2, v0, s4
+; CHECK-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; CHECK-NEXT: v_bfe_u32 v2, v1, 16, 1
+; CHECK-NEXT: v_add3_u32 v2, v2, v1, s4
+; CHECK-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; CHECK-NEXT: v_perm_b32 v4, v1, v0, s5
+; CHECK-NEXT: v_bfe_u32 v0, v14, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v14, s4
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v14
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_bfe_u32 v1, v15, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v15, s4
+; CHECK-NEXT: v_or_b32_e32 v2, 0x400000, v15
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CHECK-NEXT: v_perm_b32 v3, v1, v0, s5
+; CHECK-NEXT: v_bfe_u32 v0, v12, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v12, s4
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v12
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_bfe_u32 v1, v13, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v13, s4
+; CHECK-NEXT: v_or_b32_e32 v2, 0x400000, v13
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CHECK-NEXT: v_perm_b32 v2, v1, v0, s5
+; CHECK-NEXT: v_bfe_u32 v0, v10, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v10, s4
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v10
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_bfe_u32 v1, v11, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v11, s4
+; CHECK-NEXT: v_or_b32_e32 v10, 0x400000, v11
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
+; CHECK-NEXT: v_perm_b32 v1, v1, v0, s5
+; CHECK-NEXT: v_bfe_u32 v0, v8, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v8, s4
+; CHECK-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; CHECK-NEXT: v_bfe_u32 v8, v9, 16, 1
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
+; CHECK-NEXT: v_add3_u32 v8, v8, v9, s4
+; CHECK-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
+; CHECK-NEXT: v_perm_b32 v0, v8, v0, s5
+; CHECK-NEXT: global_store_dwordx4 v[16:17], v[0:3], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[16:17], v[4:7], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %conv = fptrunc <16 x float> %num to <16 x bfloat>
+ store <16 x bfloat> %conv, ptr addrspace(1) %p, align 8
+ ret void
+}
+
+define void @v32(<32 x float> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v32:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; CHECK-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; CHECK-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; CHECK-NEXT: v_bfe_u32 v34, v6, 16, 1
+; CHECK-NEXT: s_movk_i32 s4, 0x7fff
+; CHECK-NEXT: v_add3_u32 v34, v34, v6, s4
+; CHECK-NEXT: v_or_b32_e32 v35, 0x400000, v6
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; CHECK-NEXT: v_cndmask_b32_e32 v6, v34, v35, vcc
+; CHECK-NEXT: v_bfe_u32 v34, v7, 16, 1
+; CHECK-NEXT: v_add3_u32 v34, v34, v7, s4
+; CHECK-NEXT: v_or_b32_e32 v35, 0x400000, v7
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; CHECK-NEXT: v_cndmask_b32_e32 v7, v34, v35, vcc
+; CHECK-NEXT: s_mov_b32 s5, 0x7060302
+; CHECK-NEXT: v_perm_b32 v7, v7, v6, s5
+; CHECK-NEXT: v_bfe_u32 v6, v4, 16, 1
+; CHECK-NEXT: v_add3_u32 v6, v6, v4, s4
+; CHECK-NEXT: v_or_b32_e32 v34, 0x400000, v4
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v6, v34, vcc
+; CHECK-NEXT: v_bfe_u32 v6, v5, 16, 1
+; CHECK-NEXT: v_add3_u32 v6, v6, v5, s4
+; CHECK-NEXT: v_or_b32_e32 v34, 0x400000, v5
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v5, v6, v34, vcc
+; CHECK-NEXT: v_perm_b32 v6, v5, v4, s5
+; CHECK-NEXT: v_bfe_u32 v4, v2, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v2, s4
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; CHECK-NEXT: v_bfe_u32 v4, v3, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v3, s4
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; CHECK-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; CHECK-NEXT: v_perm_b32 v5, v3, v2, s5
+; CHECK-NEXT: v_bfe_u32 v2, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v2, v2, v0, s4
+; CHECK-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; CHECK-NEXT: v_bfe_u32 v2, v1, 16, 1
+; CHECK-NEXT: v_add3_u32 v2, v2, v1, s4
+; CHECK-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; CHECK-NEXT: v_perm_b32 v4, v1, v0, s5
+; CHECK-NEXT: v_bfe_u32 v0, v14, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v14, s4
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v14
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_bfe_u32 v1, v15, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v15, s4
+; CHECK-NEXT: v_or_b32_e32 v2, 0x400000, v15
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CHECK-NEXT: v_perm_b32 v3, v1, v0, s5
+; CHECK-NEXT: v_bfe_u32 v0, v12, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v12, s4
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v12
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_bfe_u32 v1, v13, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v13, s4
+; CHECK-NEXT: v_or_b32_e32 v2, 0x400000, v13
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CHECK-NEXT: v_perm_b32 v2, v1, v0, s5
+; CHECK-NEXT: v_bfe_u32 v0, v10, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v10, s4
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v10
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_bfe_u32 v1, v11, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v11, s4
+; CHECK-NEXT: v_or_b32_e32 v10, 0x400000, v11
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
+; CHECK-NEXT: v_perm_b32 v1, v1, v0, s5
+; CHECK-NEXT: v_bfe_u32 v0, v8, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v8, s4
+; CHECK-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; CHECK-NEXT: v_bfe_u32 v8, v9, 16, 1
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
+; CHECK-NEXT: v_add3_u32 v8, v8, v9, s4
+; CHECK-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
+; CHECK-NEXT: v_perm_b32 v0, v8, v0, s5
+; CHECK-NEXT: v_bfe_u32 v8, v22, 16, 1
+; CHECK-NEXT: v_add3_u32 v8, v8, v22, s4
+; CHECK-NEXT: v_or_b32_e32 v9, 0x400000, v22
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT: v_bfe_u32 v9, v23, 16, 1
+; CHECK-NEXT: v_add3_u32 v9, v9, v23, s4
+; CHECK-NEXT: v_or_b32_e32 v10, 0x400000, v23
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; CHECK-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc
+; CHECK-NEXT: v_perm_b32 v11, v9, v8, s5
+; CHECK-NEXT: v_bfe_u32 v8, v20, 16, 1
+; CHECK-NEXT: v_add3_u32 v8, v8, v20, s4
+; CHECK-NEXT: v_or_b32_e32 v9, 0x400000, v20
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT: v_bfe_u32 v9, v21, 16, 1
+; CHECK-NEXT: v_add3_u32 v9, v9, v21, s4
+; CHECK-NEXT: v_or_b32_e32 v10, 0x400000, v21
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; CHECK-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc
+; CHECK-NEXT: v_perm_b32 v10, v9, v8, s5
+; CHECK-NEXT: v_bfe_u32 v8, v18, 16, 1
+; CHECK-NEXT: v_add3_u32 v8, v8, v18, s4
+; CHECK-NEXT: v_or_b32_e32 v9, 0x400000, v18
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT: v_bfe_u32 v9, v19, 16, 1
+; CHECK-NEXT: v_add3_u32 v9, v9, v19, s4
+; CHECK-NEXT: v_or_b32_e32 v12, 0x400000, v19
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; CHECK-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc
+; CHECK-NEXT: v_perm_b32 v9, v9, v8, s5
+; CHECK-NEXT: v_bfe_u32 v8, v16, 16, 1
+; CHECK-NEXT: v_add3_u32 v8, v8, v16, s4
+; CHECK-NEXT: v_or_b32_e32 v12, 0x400000, v16
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc
+; CHECK-NEXT: v_bfe_u32 v12, v17, 16, 1
+; CHECK-NEXT: v_add3_u32 v12, v12, v17, s4
+; CHECK-NEXT: v_or_b32_e32 v13, 0x400000, v17
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc
+; CHECK-NEXT: v_perm_b32 v8, v12, v8, s5
+; CHECK-NEXT: v_bfe_u32 v12, v30, 16, 1
+; CHECK-NEXT: v_add3_u32 v12, v12, v30, s4
+; CHECK-NEXT: v_or_b32_e32 v13, 0x400000, v30
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_bfe_u32 v13, v31, 16, 1
+; CHECK-NEXT: v_add3_u32 v13, v13, v31, s4
+; CHECK-NEXT: v_or_b32_e32 v14, 0x400000, v31
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; CHECK-NEXT: v_cndmask_b32_e32 v13, v13, v14, vcc
+; CHECK-NEXT: v_perm_b32 v15, v13, v12, s5
+; CHECK-NEXT: v_bfe_u32 v12, v28, 16, 1
+; CHECK-NEXT: v_add3_u32 v12, v12, v28, s4
+; CHECK-NEXT: v_or_b32_e32 v13, 0x400000, v28
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc
+; CHECK-NEXT: v_bfe_u32 v13, v29, 16, 1
+; CHECK-NEXT: v_add3_u32 v13, v13, v29, s4
+; CHECK-NEXT: v_or_b32_e32 v14, 0x400000, v29
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; CHECK-NEXT: v_cndmask_b32_e32 v13, v13, v14, vcc
+; CHECK-NEXT: v_perm_b32 v14, v13, v12, s5
+; CHECK-NEXT: v_bfe_u32 v12, v26, 16, 1
+; CHECK-NEXT: v_add3_u32 v12, v12, v26, s4
+; CHECK-NEXT: v_or_b32_e32 v13, 0x400000, v26
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc
+; CHECK-NEXT: v_bfe_u32 v13, v27, 16, 1
+; CHECK-NEXT: v_add3_u32 v13, v13, v27, s4
+; CHECK-NEXT: v_or_b32_e32 v16, 0x400000, v27
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; CHECK-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc
+; CHECK-NEXT: v_perm_b32 v13, v13, v12, s5
+; CHECK-NEXT: v_bfe_u32 v12, v24, 16, 1
+; CHECK-NEXT: v_add3_u32 v12, v12, v24, s4
+; CHECK-NEXT: v_or_b32_e32 v16, 0x400000, v24
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc
+; CHECK-NEXT: v_bfe_u32 v16, v25, 16, 1
+; CHECK-NEXT: v_add3_u32 v16, v16, v25, s4
+; CHECK-NEXT: v_or_b32_e32 v17, 0x400000, v25
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; CHECK-NEXT: v_cndmask_b32_e32 v16, v16, v17, vcc
+; CHECK-NEXT: v_perm_b32 v12, v16, v12, s5
+; CHECK-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48
+; CHECK-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32
+; CHECK-NEXT: global_store_dwordx4 v[32:33], v[0:3], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[32:33], v[4:7], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %conv = fptrunc <32 x float> %num to <32 x bfloat>
+ store <32 x bfloat> %conv, ptr addrspace(1) %p, align 8
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp64_to_bf16.ll b/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp64_to_bf16.ll
new file mode 100644
index 000000000000..d824763c22e2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp64_to_bf16.ll
@@ -0,0 +1,663 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s -o - | FileCheck %s
+
+define void @scalar(double %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: scalar:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; CHECK-NEXT: v_and_b32_e32 v7, 1, v6
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v4, v6, v4
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CHECK-NEXT: s_brev_b32 s4, 1
+; CHECK-NEXT: v_and_or_b32 v5, v1, s4, v4
+; CHECK-NEXT: v_bfe_u32 v4, v4, 16, 1
+; CHECK-NEXT: s_movk_i32 s4, 0x7fff
+; CHECK-NEXT: v_add3_u32 v4, v4, v5, s4
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; CHECK-NEXT: global_store_short_d16_hi v[2:3], v0, off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %conv = fptrunc double %num to bfloat
+ store bfloat %conv, ptr addrspace(1) %p, align 8
+ ret void
+}
+
+define void @v2(<2 x double> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_cvt_f32_f64_e64 v8, |v[0:1]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[6:7], v8
+; CHECK-NEXT: v_and_b32_e32 v9, 1, v8
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[6:7]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[6:7]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v6, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v6, v8, v6
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; CHECK-NEXT: s_brev_b32 s8, 1
+; CHECK-NEXT: v_and_or_b32 v7, v1, s8, v6
+; CHECK-NEXT: v_bfe_u32 v6, v6, 16, 1
+; CHECK-NEXT: s_movk_i32 s9, 0x7fff
+; CHECK-NEXT: v_add3_u32 v6, v6, v7, s9
+; CHECK-NEXT: v_or_b32_e32 v7, 0x400000, v7
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; CHECK-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v7, |v[2:3]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v7
+; CHECK-NEXT: v_and_b32_e32 v8, 1, v7
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[2:3]|, v[0:1]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v0, v7, v0
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; CHECK-NEXT: v_and_or_b32 v1, v3, s8, v0
+; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v1, s9
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: s_mov_b32 s4, 0x7060302
+; CHECK-NEXT: v_perm_b32 v0, v0, v6, s4
+; CHECK-NEXT: global_store_dword v[4:5], v0, off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %conv = fptrunc <2 x double> %num to <2 x bfloat>
+ store <2 x bfloat> %conv, ptr addrspace(1) %p, align 8
+ ret void
+}
+
+define void @v3(<3 x double> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v3:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_cvt_f32_f64_e64 v10, |v[0:1]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[8:9], v10
+; CHECK-NEXT: v_and_b32_e32 v11, 1, v10
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[8:9]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[8:9]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
+; CHECK-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v8, v10, v8
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
+; CHECK-NEXT: s_brev_b32 s8, 1
+; CHECK-NEXT: v_and_or_b32 v9, v1, s8, v8
+; CHECK-NEXT: v_bfe_u32 v8, v8, 16, 1
+; CHECK-NEXT: s_movk_i32 s9, 0x7fff
+; CHECK-NEXT: v_add3_u32 v8, v8, v9, s9
+; CHECK-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v9, |v[2:3]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v9
+; CHECK-NEXT: v_and_b32_e32 v10, 1, v9
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[2:3]|, v[0:1]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v0, v9, v0
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
+; CHECK-NEXT: v_and_or_b32 v1, v3, s8, v0
+; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v1, s9
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: s_mov_b32 s4, 0x7060302
+; CHECK-NEXT: v_cvt_f32_f64_e64 v3, |v[4:5]|
+; CHECK-NEXT: v_perm_b32 v2, v0, v8, s4
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
+; CHECK-NEXT: v_and_b32_e32 v8, 1, v3
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[4:5]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[4:5]|, v[0:1]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v0, v3, v0
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; CHECK-NEXT: v_and_or_b32 v1, v5, s8, v0
+; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v1, s9
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[4:5]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: global_store_short_d16_hi v[6:7], v0, off offset:4
+; CHECK-NEXT: global_store_dword v[6:7], v2, off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %conv = fptrunc <3 x double> %num to <3 x bfloat>
+ store <3 x bfloat> %conv, ptr addrspace(1) %p, align 8
+ ret void
+}
+
+define void @v4(<4 x double> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v4:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_cvt_f32_f64_e64 v12, |v[4:5]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[10:11], v12
+; CHECK-NEXT: v_and_b32_e32 v13, 1, v12
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[4:5]|, v[10:11]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[4:5]|, v[10:11]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
+; CHECK-NEXT: v_cndmask_b32_e64 v10, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v10, v12, v10
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc
+; CHECK-NEXT: s_brev_b32 s8, 1
+; CHECK-NEXT: v_and_or_b32 v11, v5, s8, v10
+; CHECK-NEXT: v_bfe_u32 v10, v10, 16, 1
+; CHECK-NEXT: s_movk_i32 s9, 0x7fff
+; CHECK-NEXT: v_add3_u32 v10, v10, v11, s9
+; CHECK-NEXT: v_or_b32_e32 v11, 0x400000, v11
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[4:5]
+; CHECK-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v11, |v[6:7]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v11
+; CHECK-NEXT: v_and_b32_e32 v12, 1, v11
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[6:7]|, v[4:5]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[6:7]|, v[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
+; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v4, v11, v4
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc
+; CHECK-NEXT: v_and_or_b32 v5, v7, s8, v4
+; CHECK-NEXT: v_bfe_u32 v4, v4, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v5, s9
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[6:7]
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; CHECK-NEXT: s_mov_b32 s10, 0x7060302
+; CHECK-NEXT: v_perm_b32 v5, v4, v10, s10
+; CHECK-NEXT: v_cvt_f32_f64_e64 v4, |v[0:1]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[6:7], v4
+; CHECK-NEXT: v_and_b32_e32 v10, 1, v4
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[6:7]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[6:7]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
+; CHECK-NEXT: v_cndmask_b32_e64 v6, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v6, v4, v6
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; CHECK-NEXT: v_and_or_b32 v6, v1, s8, v4
+; CHECK-NEXT: v_bfe_u32 v4, v4, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v6, s9
+; CHECK-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v6, |v[2:3]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v6
+; CHECK-NEXT: v_and_b32_e32 v7, 1, v6
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[2:3]|, v[0:1]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v0, v6, v0
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; CHECK-NEXT: v_and_or_b32 v1, v3, s8, v0
+; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v1, s9
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_perm_b32 v4, v0, v4, s10
+; CHECK-NEXT: global_store_dwordx2 v[8:9], v[4:5], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %conv = fptrunc <4 x double> %num to <4 x bfloat>
+ store <4 x bfloat> %conv, ptr addrspace(1) %p, align 8
+ ret void
+}
+
+define void @v8(<8 x double> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_cvt_f32_f64_e64 v20, |v[12:13]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[18:19], v20
+; CHECK-NEXT: v_and_b32_e32 v21, 1, v20
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[12:13]|, v[18:19]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[12:13]|, v[18:19]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21
+; CHECK-NEXT: v_cndmask_b32_e64 v18, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v18, v20, v18
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v18, v18, v20, vcc
+; CHECK-NEXT: s_brev_b32 s8, 1
+; CHECK-NEXT: v_and_or_b32 v19, v13, s8, v18
+; CHECK-NEXT: v_bfe_u32 v18, v18, 16, 1
+; CHECK-NEXT: s_movk_i32 s9, 0x7fff
+; CHECK-NEXT: v_add3_u32 v18, v18, v19, s9
+; CHECK-NEXT: v_or_b32_e32 v19, 0x400000, v19
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[12:13]
+; CHECK-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v19, |v[14:15]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[12:13], v19
+; CHECK-NEXT: v_and_b32_e32 v20, 1, v19
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[14:15]|, v[12:13]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[14:15]|, v[12:13]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20
+; CHECK-NEXT: v_cndmask_b32_e64 v12, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v12, v19, v12
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v19, vcc
+; CHECK-NEXT: v_and_or_b32 v13, v15, s8, v12
+; CHECK-NEXT: v_bfe_u32 v12, v12, 16, 1
+; CHECK-NEXT: v_add3_u32 v12, v12, v13, s9
+; CHECK-NEXT: v_or_b32_e32 v13, 0x400000, v13
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[14:15]
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc
+; CHECK-NEXT: s_mov_b32 s10, 0x7060302
+; CHECK-NEXT: v_perm_b32 v13, v12, v18, s10
+; CHECK-NEXT: v_cvt_f32_f64_e64 v12, |v[8:9]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[14:15], v12
+; CHECK-NEXT: v_and_b32_e32 v18, 1, v12
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[8:9]|, v[14:15]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[8:9]|, v[14:15]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18
+; CHECK-NEXT: v_cndmask_b32_e64 v14, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v14, v12, v14
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc
+; CHECK-NEXT: v_and_or_b32 v14, v9, s8, v12
+; CHECK-NEXT: v_bfe_u32 v12, v12, 16, 1
+; CHECK-NEXT: v_add3_u32 v12, v12, v14, s9
+; CHECK-NEXT: v_or_b32_e32 v14, 0x400000, v14
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[8:9]
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v14, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v14, |v[10:11]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[8:9], v14
+; CHECK-NEXT: v_and_b32_e32 v15, 1, v14
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[10:11]|, v[8:9]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[10:11]|, v[8:9]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
+; CHECK-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v8, v14, v8
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v14, vcc
+; CHECK-NEXT: v_and_or_b32 v9, v11, s8, v8
+; CHECK-NEXT: v_bfe_u32 v8, v8, 16, 1
+; CHECK-NEXT: v_add3_u32 v8, v8, v9, s9
+; CHECK-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[10:11]
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v10, |v[4:5]|
+; CHECK-NEXT: v_perm_b32 v12, v8, v12, s10
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[8:9], v10
+; CHECK-NEXT: v_and_b32_e32 v11, 1, v10
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[4:5]|, v[8:9]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[4:5]|, v[8:9]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
+; CHECK-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v8, v10, v8
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
+; CHECK-NEXT: v_and_or_b32 v9, v5, s8, v8
+; CHECK-NEXT: v_bfe_u32 v8, v8, 16, 1
+; CHECK-NEXT: v_add3_u32 v8, v8, v9, s9
+; CHECK-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[4:5]
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v9, |v[6:7]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v9
+; CHECK-NEXT: v_and_b32_e32 v10, 1, v9
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[6:7]|, v[4:5]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[6:7]|, v[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
+; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v4, v9, v4
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
+; CHECK-NEXT: v_and_or_b32 v5, v7, s8, v4
+; CHECK-NEXT: v_bfe_u32 v4, v4, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v5, s9
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[6:7]
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
+; CHECK-NEXT: v_perm_b32 v11, v4, v8, s10
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; CHECK-NEXT: v_and_b32_e32 v7, 1, v6
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v4, v6, v4
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CHECK-NEXT: v_and_or_b32 v5, v1, s8, v4
+; CHECK-NEXT: v_bfe_u32 v4, v4, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v5, s9
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v5, |v[2:3]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v5
+; CHECK-NEXT: v_and_b32_e32 v6, 1, v5
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[2:3]|, v[0:1]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v0, v5, v0
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; CHECK-NEXT: v_and_or_b32 v1, v3, s8, v0
+; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v1, s9
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_perm_b32 v10, v0, v4, s10
+; CHECK-NEXT: global_store_dwordx4 v[16:17], v[10:13], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %conv = fptrunc <8 x double> %num to <8 x bfloat>
+ store <8 x bfloat> %conv, ptr addrspace(1) %p, align 8
+ ret void
+}
+
+define void @v16(<16 x double> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; CHECK-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; CHECK-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; CHECK-NEXT: v_cvt_f32_f64_e64 v36, |v[12:13]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[34:35], v36
+; CHECK-NEXT: v_and_b32_e32 v37, 1, v36
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[12:13]|, v[34:35]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[12:13]|, v[34:35]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v37
+; CHECK-NEXT: v_cndmask_b32_e64 v34, -1, 1, s[6:7]
+; CHECK-NEXT: v_add_u32_e32 v34, v36, v34
+; CHECK-NEXT: s_or_b64 vcc, s[4:5], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v34, v34, v36, vcc
+; CHECK-NEXT: s_brev_b32 s4, 1
+; CHECK-NEXT: v_and_or_b32 v35, v13, s4, v34
+; CHECK-NEXT: v_bfe_u32 v34, v34, 16, 1
+; CHECK-NEXT: s_movk_i32 s5, 0x7fff
+; CHECK-NEXT: v_add3_u32 v34, v34, v35, s5
+; CHECK-NEXT: v_or_b32_e32 v35, 0x400000, v35
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[12:13]
+; CHECK-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v35, |v[14:15]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[12:13], v35
+; CHECK-NEXT: v_and_b32_e32 v36, 1, v35
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[14:15]|, v[12:13]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[14:15]|, v[12:13]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v36
+; CHECK-NEXT: v_cndmask_b32_e64 v12, -1, 1, s[8:9]
+; CHECK-NEXT: v_add_u32_e32 v12, v35, v12
+; CHECK-NEXT: s_or_b64 vcc, s[6:7], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v35, vcc
+; CHECK-NEXT: v_and_or_b32 v13, v15, s4, v12
+; CHECK-NEXT: v_bfe_u32 v12, v12, 16, 1
+; CHECK-NEXT: v_add3_u32 v12, v12, v13, s5
+; CHECK-NEXT: v_or_b32_e32 v13, 0x400000, v13
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[14:15]
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc
+; CHECK-NEXT: s_mov_b32 s6, 0x7060302
+; CHECK-NEXT: v_perm_b32 v13, v12, v34, s6
+; CHECK-NEXT: v_cvt_f32_f64_e64 v12, |v[8:9]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[14:15], v12
+; CHECK-NEXT: v_and_b32_e32 v34, 1, v12
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[8:9]|, v[14:15]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[8:9]|, v[14:15]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v34
+; CHECK-NEXT: v_cndmask_b32_e64 v14, -1, 1, s[10:11]
+; CHECK-NEXT: v_add_u32_e32 v14, v12, v14
+; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc
+; CHECK-NEXT: v_and_or_b32 v14, v9, s4, v12
+; CHECK-NEXT: v_bfe_u32 v12, v12, 16, 1
+; CHECK-NEXT: v_add3_u32 v12, v12, v14, s5
+; CHECK-NEXT: v_or_b32_e32 v14, 0x400000, v14
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[8:9]
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v14, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v14, |v[10:11]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[8:9], v14
+; CHECK-NEXT: v_and_b32_e32 v15, 1, v14
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[10:11]|, v[8:9]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[10:11]|, v[8:9]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
+; CHECK-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[10:11]
+; CHECK-NEXT: v_add_u32_e32 v8, v14, v8
+; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v14, vcc
+; CHECK-NEXT: v_and_or_b32 v9, v11, s4, v8
+; CHECK-NEXT: v_bfe_u32 v8, v8, 16, 1
+; CHECK-NEXT: v_add3_u32 v8, v8, v9, s5
+; CHECK-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[10:11]
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v10, |v[4:5]|
+; CHECK-NEXT: v_perm_b32 v12, v8, v12, s6
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[8:9], v10
+; CHECK-NEXT: v_and_b32_e32 v11, 1, v10
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[4:5]|, v[8:9]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[4:5]|, v[8:9]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
+; CHECK-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[10:11]
+; CHECK-NEXT: v_add_u32_e32 v8, v10, v8
+; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
+; CHECK-NEXT: v_and_or_b32 v9, v5, s4, v8
+; CHECK-NEXT: v_bfe_u32 v8, v8, 16, 1
+; CHECK-NEXT: v_add3_u32 v8, v8, v9, s5
+; CHECK-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[4:5]
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v9, |v[6:7]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v9
+; CHECK-NEXT: v_and_b32_e32 v10, 1, v9
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[6:7]|, v[4:5]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[6:7]|, v[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
+; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[10:11]
+; CHECK-NEXT: v_add_u32_e32 v4, v9, v4
+; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
+; CHECK-NEXT: v_and_or_b32 v5, v7, s4, v4
+; CHECK-NEXT: v_bfe_u32 v4, v4, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v5, s5
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[6:7]
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
+; CHECK-NEXT: v_perm_b32 v11, v4, v8, s6
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; CHECK-NEXT: v_and_b32_e32 v7, 1, v6
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[0:1]|, v[4:5]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[0:1]|, v[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[10:11]
+; CHECK-NEXT: v_add_u32_e32 v4, v6, v4
+; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CHECK-NEXT: v_and_or_b32 v5, v1, s4, v4
+; CHECK-NEXT: v_bfe_u32 v4, v4, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v5, s5
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v5, |v[2:3]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v5
+; CHECK-NEXT: v_and_b32_e32 v6, 1, v5
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[2:3]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[2:3]|, v[0:1]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[10:11]
+; CHECK-NEXT: v_add_u32_e32 v0, v5, v0
+; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; CHECK-NEXT: v_and_or_b32 v1, v3, s4, v0
+; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v1, s5
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v2, |v[28:29]|
+; CHECK-NEXT: v_perm_b32 v10, v0, v4, s6
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
+; CHECK-NEXT: v_and_b32_e32 v3, 1, v2
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[28:29]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[28:29]|, v[0:1]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[10:11]
+; CHECK-NEXT: v_add_u32_e32 v0, v2, v0
+; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; CHECK-NEXT: v_and_or_b32 v1, v29, s4, v0
+; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v1, s5
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[28:29], v[28:29]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cvt_f32_f64_e64 v3, |v[30:31]|
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
+; CHECK-NEXT: v_and_b32_e32 v4, 1, v3
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[30:31]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[30:31]|, v[0:1]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[10:11]
+; CHECK-NEXT: v_add_u32_e32 v0, v3, v0
+; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; CHECK-NEXT: v_and_or_b32 v1, v31, s4, v0
+; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v1, s5
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[30:31], v[30:31]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_perm_b32 v3, v0, v2, s6
+; CHECK-NEXT: v_cvt_f32_f64_e64 v2, |v[24:25]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
+; CHECK-NEXT: v_and_b32_e32 v4, 1, v2
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[24:25]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[24:25]|, v[0:1]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[10:11]
+; CHECK-NEXT: v_add_u32_e32 v0, v2, v0
+; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; CHECK-NEXT: v_and_or_b32 v1, v25, s4, v0
+; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v1, s5
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[24:25], v[24:25]
+; CHECK-NEXT: v_cvt_f32_f64_e64 v4, |v[26:27]|
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
+; CHECK-NEXT: v_and_b32_e32 v5, 1, v4
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[26:27]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[26:27]|, v[0:1]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[10:11]
+; CHECK-NEXT: v_add_u32_e32 v0, v4, v0
+; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; CHECK-NEXT: v_and_or_b32 v1, v27, s4, v0
+; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v1, s5
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[26:27], v[26:27]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_cvt_f32_f64_e64 v4, |v[20:21]|
+; CHECK-NEXT: v_perm_b32 v2, v0, v2, s6
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
+; CHECK-NEXT: v_and_b32_e32 v5, 1, v4
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[20:21]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[20:21]|, v[0:1]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[10:11]
+; CHECK-NEXT: v_add_u32_e32 v0, v4, v0
+; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; CHECK-NEXT: v_and_or_b32 v1, v21, s4, v0
+; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v1, s5
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[20:21], v[20:21]
+; CHECK-NEXT: v_cvt_f32_f64_e64 v5, |v[22:23]|
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v5
+; CHECK-NEXT: v_and_b32_e32 v6, 1, v5
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[22:23]|, v[0:1]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[22:23]|, v[0:1]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[10:11]
+; CHECK-NEXT: v_add_u32_e32 v0, v5, v0
+; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; CHECK-NEXT: v_and_or_b32 v1, v23, s4, v0
+; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v1, s5
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[22:23], v[22:23]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_perm_b32 v1, v0, v4, s6
+; CHECK-NEXT: v_cvt_f32_f64_e64 v0, |v[16:17]|
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v0
+; CHECK-NEXT: v_and_b32_e32 v6, 1, v0
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[16:17]|, v[4:5]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[16:17]|, v[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[10:11]
+; CHECK-NEXT: v_add_u32_e32 v4, v0, v4
+; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT: v_and_or_b32 v4, v17, s4, v0
+; CHECK-NEXT: v_bfe_u32 v0, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v4, s5
+; CHECK-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[16:17], v[16:17]
+; CHECK-NEXT: v_cvt_f32_f64_e64 v6, |v[18:19]|
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; CHECK-NEXT: v_and_b32_e32 v7, 1, v6
+; CHECK-NEXT: v_cmp_gt_f64_e64 s[10:11], |v[18:19]|, v[4:5]
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], |v[18:19]|, v[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[10:11]
+; CHECK-NEXT: v_add_u32_e32 v4, v6, v4
+; CHECK-NEXT: s_or_b64 vcc, s[8:9], vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CHECK-NEXT: v_and_or_b32 v5, v19, s4, v4
+; CHECK-NEXT: v_bfe_u32 v4, v4, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v5, s5
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[18:19], v[18:19]
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; CHECK-NEXT: v_perm_b32 v0, v4, v0, s6
+; CHECK-NEXT: global_store_dwordx4 v[32:33], v[0:3], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[32:33], v[10:13], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %conv = fptrunc <16 x double> %num to <16 x bfloat>
+ store <16 x bfloat> %conv, ptr addrspace(1) %p, align 8
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx10.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx10.mir
new file mode 100644
index 000000000000..50eea4aebd5e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx10.mir
@@ -0,0 +1,34 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s
+
+---
+name: mimg_nsa
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-LABEL: name: mimg_nsa
+ ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+...
+
+---
+name: mimg_nsa_mixed
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-LABEL: name: mimg_nsa_mixed
+ ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ ; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+ ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+ $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+...
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir
new file mode 100644
index 000000000000..b22de06e68a7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir
@@ -0,0 +1,40 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s
+
+---
+name: mimg_nsa
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-LABEL: name: mimg_nsa
+ ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr10_vgpr11, implicit-def $vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr20_vgpr21, implicit-def $vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec {
+ ; CHECK-NEXT: S_CLAUSE 1
+ ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ ; CHECK-NEXT: }
+ $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+...
+
+---
+name: mimg_nsa_mixed
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-LABEL: name: mimg_nsa_mixed
+ ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr20_vgpr21_vgpr22, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr20_vgpr21, implicit-def $vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 {
+ ; CHECK-NEXT: S_CLAUSE 2
+ ; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ ; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx11 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+ ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ ; CHECK-NEXT: }
+ $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx11 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+ $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+...
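
The gfx10 and gfx11 files above pin down a target split: on gfx10 the pass leaves back-to-back NSA image samples as-is (no BUNDLE or S_CLAUSE appears in the checks), while on gfx11 it wraps them in a BUNDLE headed by S_CLAUSE. A minimal sketch of that split, assuming a hypothetical predicate name and a generation threshold read off the FileCheck output rather than the pass source:

    #include <cassert>

    // Hypothetical predicate; the threshold is inferred from the checks
    // above, not quoted from SIInsertHardClauses.
    static bool clausesNsaImageSamples(unsigned GfxGeneration) {
      return GfxGeneration >= 11;
    }

    int main() {
      assert(!clausesNsaImageSamples(10)); // gfx10 file: no BUNDLE/S_CLAUSE
      assert(clausesNsaImageSamples(11));  // gfx11 file: BUNDLE + S_CLAUSE
      return 0;
    }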
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir
new file mode 100644
index 000000000000..243a84562ab3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir
@@ -0,0 +1,40 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s
+
+---
+name: mimg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-LABEL: name: mimg
+ ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr10_vgpr11, implicit-def $vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr20_vgpr21, implicit-def $vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec {
+ ; CHECK-NEXT: S_CLAUSE 1
+ ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ ; CHECK-NEXT: }
+ $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+...
+
+---
+name: mimg_mixed
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-LABEL: name: mimg_mixed
+ ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr20_vgpr21_vgpr22, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr20_vgpr21, implicit-def $vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 {
+ ; CHECK-NEXT: S_CLAUSE 2
+ ; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ ; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+ ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ ; CHECK-NEXT: }
+ $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+ $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+...
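
Across all three new files, the S_CLAUSE immediate is the clause length minus one: two clustered samples yield S_CLAUSE 1, and the three-instruction mixed clauses yield S_CLAUSE 2. A minimal sketch of that encoding, with an assumed helper name:

    #include <cassert>
    #include <cstdint>

    // Assumed helper: simm16 operand for an S_CLAUSE covering NumInsts
    // clustered memory instructions.
    static uint16_t sClauseOperand(unsigned NumInsts) {
      assert(NumInsts >= 2 && "a clause needs at least two instructions");
      return static_cast<uint16_t>(NumInsts - 1);
    }

    int main() {
      assert(sClauseOperand(2) == 1); // mimg_nsa: two samples -> S_CLAUSE 1
      assert(sClauseOperand(3) == 2); // mimg_nsa_mixed: three -> S_CLAUSE 2
      return 0;
    }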
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
index 1c6bdff51015..44b988a7121c 100644
--- a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
@@ -1,6 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefix=GFX11
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefix=GFX12
---
name: nop1
@@ -19,6 +20,12 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GFX11-NEXT: S_NOP 2
+ ;
+ ; GFX12-LABEL: name: nop1
+ ; GFX12: liveins: $sgpr0_sgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GFX12-NEXT: S_NOP 2
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
S_NOP 2
...
@@ -48,6 +55,16 @@ body: |
; GFX11-NEXT: S_NOP 2
; GFX11-NEXT: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
; GFX11-NEXT: }
+ ;
+ ; GFX12-LABEL: name: nop2
+ ; GFX12: liveins: $sgpr0_sgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1 {
+ ; GFX12-NEXT: S_CLAUSE 2
+ ; GFX12-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GFX12-NEXT: S_NOP 2
+ ; GFX12-NEXT: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
+ ; GFX12-NEXT: }
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
S_NOP 2
$sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
@@ -80,6 +97,17 @@ body: |
; GFX11-NEXT: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
; GFX11-NEXT: }
; GFX11-NEXT: S_NOP 2
+ ;
+ ; GFX12-LABEL: name: nop3
+ ; GFX12: liveins: $sgpr0_sgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1 {
+ ; GFX12-NEXT: S_CLAUSE 2
+ ; GFX12-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GFX12-NEXT: S_NOP 2
+ ; GFX12-NEXT: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
+ ; GFX12-NEXT: }
+ ; GFX12-NEXT: S_NOP 2
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
S_NOP 2
$sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
@@ -274,6 +302,99 @@ body: |
; GFX11-NEXT: $vgpr79 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 316, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr80 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 320, 0, 0, implicit $exec
; GFX11-NEXT: }
+ ;
+ ; GFX12-LABEL: name: long_clause
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit-def $vgpr4, implicit-def $vgpr4_lo16, implicit-def $vgpr4_hi16, implicit-def $vgpr5, implicit-def $vgpr5_lo16, implicit-def $vgpr5_hi16, implicit-def $vgpr6, implicit-def $vgpr6_lo16, implicit-def $vgpr6_hi16, implicit-def $vgpr7, implicit-def $vgpr7_lo16, implicit-def $vgpr7_hi16, implicit-def $vgpr8, implicit-def $vgpr8_lo16, implicit-def $vgpr8_hi16, implicit-def $vgpr9, implicit-def $vgpr9_lo16, implicit-def $vgpr9_hi16, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit-def $vgpr17, implicit-def $vgpr17_lo16, implicit-def $vgpr17_hi16, implicit-def $vgpr18, implicit-def $vgpr18_lo16, implicit-def $vgpr18_hi16, implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr23, implicit-def $vgpr23_lo16, implicit-def $vgpr23_hi16, implicit-def $vgpr24, implicit-def $vgpr24_lo16, implicit-def $vgpr24_hi16, implicit-def $vgpr25, implicit-def $vgpr25_lo16, implicit-def $vgpr25_hi16, implicit-def $vgpr26, implicit-def $vgpr26_lo16, implicit-def $vgpr26_hi16, implicit-def $vgpr27, implicit-def $vgpr27_lo16, implicit-def $vgpr27_hi16, implicit-def $vgpr28, implicit-def $vgpr28_lo16, implicit-def $vgpr28_hi16, implicit-def $vgpr29, implicit-def $vgpr29_lo16, implicit-def $vgpr29_hi16, implicit-def $vgpr30, implicit-def $vgpr30_lo16, implicit-def $vgpr30_hi16, implicit-def $vgpr31, implicit-def $vgpr31_lo16, implicit-def $vgpr31_hi16, implicit-def $vgpr32, implicit-def $vgpr32_lo16, implicit-def $vgpr32_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX12-NEXT: S_CLAUSE 31
+ ; GFX12-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr8 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr9 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr10 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr11 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr12 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr13 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr16 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr17 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr18 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr19 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr20 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr21 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr22 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr23 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr24 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr25 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr26 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr27 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr28 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr29 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr32 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, implicit $exec
+ ; GFX12-NEXT: }
+ ; GFX12-NEXT: BUNDLE implicit-def $vgpr33, implicit-def $vgpr33_lo16, implicit-def $vgpr33_hi16, implicit-def $vgpr34, implicit-def $vgpr34_lo16, implicit-def $vgpr34_hi16, implicit-def $vgpr35, implicit-def $vgpr35_lo16, implicit-def $vgpr35_hi16, implicit-def $vgpr36, implicit-def $vgpr36_lo16, implicit-def $vgpr36_hi16, implicit-def $vgpr37, implicit-def $vgpr37_lo16, implicit-def $vgpr37_hi16, implicit-def $vgpr38, implicit-def $vgpr38_lo16, implicit-def $vgpr38_hi16, implicit-def $vgpr39, implicit-def $vgpr39_lo16, implicit-def $vgpr39_hi16, implicit-def $vgpr40, implicit-def $vgpr40_lo16, implicit-def $vgpr40_hi16, implicit-def $vgpr41, implicit-def $vgpr41_lo16, implicit-def $vgpr41_hi16, implicit-def $vgpr42, implicit-def $vgpr42_lo16, implicit-def $vgpr42_hi16, implicit-def $vgpr43, implicit-def $vgpr43_lo16, implicit-def $vgpr43_hi16, implicit-def $vgpr44, implicit-def $vgpr44_lo16, implicit-def $vgpr44_hi16, implicit-def $vgpr45, implicit-def $vgpr45_lo16, implicit-def $vgpr45_hi16, implicit-def $vgpr46, implicit-def $vgpr46_lo16, implicit-def $vgpr46_hi16, implicit-def $vgpr47, implicit-def $vgpr47_lo16, implicit-def $vgpr47_hi16, implicit-def $vgpr48, implicit-def $vgpr48_lo16, implicit-def $vgpr48_hi16, implicit-def $vgpr49, implicit-def $vgpr49_lo16, implicit-def $vgpr49_hi16, implicit-def $vgpr50, implicit-def $vgpr50_lo16, implicit-def $vgpr50_hi16, implicit-def $vgpr51, implicit-def $vgpr51_lo16, implicit-def $vgpr51_hi16, implicit-def $vgpr52, implicit-def $vgpr52_lo16, implicit-def $vgpr52_hi16, implicit-def $vgpr53, implicit-def $vgpr53_lo16, implicit-def $vgpr53_hi16, implicit-def $vgpr54, implicit-def $vgpr54_lo16, implicit-def $vgpr54_hi16, implicit-def $vgpr55, implicit-def $vgpr55_lo16, implicit-def $vgpr55_hi16, implicit-def $vgpr56, implicit-def $vgpr56_lo16, implicit-def $vgpr56_hi16, implicit-def $vgpr57, implicit-def $vgpr57_lo16, implicit-def $vgpr57_hi16, implicit-def $vgpr58, implicit-def $vgpr58_lo16, implicit-def $vgpr58_hi16, implicit-def $vgpr59, implicit-def $vgpr59_lo16, implicit-def $vgpr59_hi16, implicit-def $vgpr60, implicit-def $vgpr60_lo16, implicit-def $vgpr60_hi16, implicit-def $vgpr61, implicit-def $vgpr61_lo16, implicit-def $vgpr61_hi16, implicit-def $vgpr62, implicit-def $vgpr62_lo16, implicit-def $vgpr62_hi16, implicit-def $vgpr63, implicit-def $vgpr63_lo16, implicit-def $vgpr63_hi16, implicit-def $vgpr64, implicit-def $vgpr64_lo16, implicit-def $vgpr64_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX12-NEXT: S_CLAUSE 31
+ ; GFX12-NEXT: $vgpr33 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr34 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr35 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 140, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr36 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 144, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr37 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 148, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr38 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 152, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr39 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 156, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 160, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr41 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 164, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr42 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 168, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr43 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 172, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr44 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 176, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr45 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 180, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr46 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 184, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr47 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 188, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr48 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 192, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr49 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 196, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr50 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 200, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr51 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 204, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr52 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 208, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr53 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 212, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr54 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 216, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr55 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 220, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr56 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 224, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr57 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 228, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr58 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 232, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr59 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 236, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr60 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 240, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr61 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 244, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 248, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 252, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr64 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec
+ ; GFX12-NEXT: }
+ ; GFX12-NEXT: BUNDLE implicit-def $vgpr65, implicit-def $vgpr65_lo16, implicit-def $vgpr65_hi16, implicit-def $vgpr66, implicit-def $vgpr66_lo16, implicit-def $vgpr66_hi16, implicit-def $vgpr67, implicit-def $vgpr67_lo16, implicit-def $vgpr67_hi16, implicit-def $vgpr68, implicit-def $vgpr68_lo16, implicit-def $vgpr68_hi16, implicit-def $vgpr69, implicit-def $vgpr69_lo16, implicit-def $vgpr69_hi16, implicit-def $vgpr70, implicit-def $vgpr70_lo16, implicit-def $vgpr70_hi16, implicit-def $vgpr71, implicit-def $vgpr71_lo16, implicit-def $vgpr71_hi16, implicit-def $vgpr72, implicit-def $vgpr72_lo16, implicit-def $vgpr72_hi16, implicit-def $vgpr73, implicit-def $vgpr73_lo16, implicit-def $vgpr73_hi16, implicit-def $vgpr74, implicit-def $vgpr74_lo16, implicit-def $vgpr74_hi16, implicit-def $vgpr75, implicit-def $vgpr75_lo16, implicit-def $vgpr75_hi16, implicit-def $vgpr76, implicit-def $vgpr76_lo16, implicit-def $vgpr76_hi16, implicit-def $vgpr77, implicit-def $vgpr77_lo16, implicit-def $vgpr77_hi16, implicit-def $vgpr78, implicit-def $vgpr78_lo16, implicit-def $vgpr78_hi16, implicit-def $vgpr79, implicit-def $vgpr79_lo16, implicit-def $vgpr79_hi16, implicit-def $vgpr80, implicit-def $vgpr80_lo16, implicit-def $vgpr80_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX12-NEXT: S_CLAUSE 15
+ ; GFX12-NEXT: $vgpr65 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 260, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr66 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 264, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr67 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 268, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr68 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 272, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr69 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 276, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr70 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 280, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr71 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 284, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr72 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 288, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr73 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 292, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr74 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 296, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr75 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 300, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr76 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 304, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr77 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 308, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr78 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 312, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr79 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 316, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr80 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 320, 0, 0, implicit $exec
+ ; GFX12-NEXT: }
$vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec
$vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec
$vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec
@@ -357,57 +478,6 @@ body: |
...
---
-name: mimg_nsa
-tracksRegLiveness: true
-body: |
- bb.0:
- liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; CHECK-LABEL: name: mimg_nsa
- ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
- ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
- ;
- ; GFX11-LABEL: name: mimg_nsa
- ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr10_vgpr11, implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr11_vgpr12, implicit-def $vgpr11_vgpr12_vgpr13, implicit-def $vgpr12_vgpr13, implicit-def $vgpr20_vgpr21_vgpr22_vgpr23, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr23, implicit-def $vgpr23_lo16, implicit-def $vgpr23_hi16, implicit-def $vgpr20_vgpr21, implicit-def $vgpr20_vgpr21_vgpr22, implicit-def $vgpr21_vgpr22, implicit-def $vgpr21_vgpr22_vgpr23, implicit-def $vgpr22_vgpr23, implicit $vgpr3, implicit $vgpr8, implicit $vgpr7, implicit $vgpr5, implicit $vgpr4, implicit $vgpr6, implicit $vgpr0, implicit $vgpr2, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec {
- ; GFX11-NEXT: S_CLAUSE 1
- ; GFX11-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
- ; GFX11-NEXT: $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
- ; GFX11-NEXT: }
- $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
- $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
-...
-
----
-name: mimg_nsa_mixed
-tracksRegLiveness: true
-body: |
- bb.0:
- liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; CHECK-LABEL: name: mimg_nsa_mixed
- ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
- ; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
- ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
- ;
- ; GFX11-LABEL: name: mimg_nsa_mixed
- ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr10_vgpr11, implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr11_vgpr12, implicit-def $vgpr11_vgpr12_vgpr13, implicit-def $vgpr12_vgpr13, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr20_vgpr21_vgpr22_vgpr23, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr23, implicit-def $vgpr23_lo16, implicit-def $vgpr23_hi16, implicit-def $vgpr20_vgpr21, implicit-def $vgpr20_vgpr21_vgpr22, implicit-def $vgpr21_vgpr22, implicit-def $vgpr21_vgpr22_vgpr23, implicit-def $vgpr22_vgpr23, implicit $vgpr3, implicit $vgpr8, implicit $vgpr7, implicit $vgpr5, implicit $vgpr4, implicit $vgpr6, implicit $vgpr0, implicit $vgpr2, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 {
- ; GFX11-NEXT: S_CLAUSE 2
- ; GFX11-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
- ; GFX11-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
- ; GFX11-NEXT: $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
- ; GFX11-NEXT: }
- $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
- $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
- $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
-...
-
----
name: kill
tracksRegLiveness: true
body: |
@@ -432,6 +502,16 @@ body: |
; GFX11-NEXT: KILL undef renamable $sgpr4
; GFX11-NEXT: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
; GFX11-NEXT: }
+ ;
+ ; GFX12-LABEL: name: kill
+ ; GFX12: liveins: $sgpr0_sgpr1, $sgpr4
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1, implicit undef $sgpr4 {
+ ; GFX12-NEXT: S_CLAUSE 1
+ ; GFX12-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GFX12-NEXT: KILL undef renamable $sgpr4
+ ; GFX12-NEXT: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
+ ; GFX12-NEXT: }
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
KILL undef renamable $sgpr4
$sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
@@ -464,6 +544,17 @@ body: |
; GFX11-NEXT: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
; GFX11-NEXT: }
; GFX11-NEXT: KILL undef renamable $sgpr5
+ ;
+ ; GFX12-LABEL: name: kill2
+ ; GFX12: liveins: $sgpr0_sgpr1, $sgpr4, $sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1, implicit undef $sgpr4 {
+ ; GFX12-NEXT: S_CLAUSE 1
+ ; GFX12-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GFX12-NEXT: KILL undef renamable $sgpr4
+ ; GFX12-NEXT: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
+ ; GFX12-NEXT: }
+ ; GFX12-NEXT: KILL undef renamable $sgpr5
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
KILL undef renamable $sgpr4
$sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
@@ -490,6 +581,12 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX11-NEXT: $vgpr4 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
+ ;
+ ; GFX12-LABEL: name: flat_load_atomic
+ ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX12-NEXT: $vgpr4 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
$vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
$vgpr4 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
...
@@ -514,6 +611,12 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr4 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
+ ;
+ ; GFX12-LABEL: name: global_load_atomic
+ ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr4 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
$vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
$vgpr4 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
...
@@ -535,6 +638,12 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX11-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec, implicit $flat_scr
+ ;
+ ; GFX12-LABEL: name: flat_global_load
+ ; GFX12: liveins: $vgpr0_vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX12-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec, implicit $flat_scr
$vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
$vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec, implicit $flat_scr
...
@@ -559,6 +668,12 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr0 = BUFFER_ATOMIC_ADD_OFFSET_RTN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 4, 0, 0, implicit $exec
+ ;
+ ; GFX12-LABEL: name: buffer_load_atomic
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr0 = BUFFER_ATOMIC_ADD_OFFSET_RTN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 4, 0, 0, implicit $exec
$vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
$vgpr0 = BUFFER_ATOMIC_ADD_OFFSET_RTN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 4, 0, 0, implicit $exec
...
@@ -580,6 +695,12 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX11-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
+ ;
+ ; GFX12-LABEL: name: flat_load_store
+ ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX12-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
$vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
...
@@ -601,6 +722,12 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
; GFX11-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
+ ;
+ ; GFX12-LABEL: name: global_load_store
+ ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GFX12-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
$vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
...
@@ -622,6 +749,12 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
; GFX11-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec
+ ;
+ ; GFX12-LABEL: name: buffer_load_store
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+ ; GFX12-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec
$vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec
...
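
The long_clause checks above also show the cap on clause size: 80 consecutive buffer loads come out as three bundles of 32, 32, and 16 instructions (S_CLAUSE 31, 31, 15). A sketch of that splitting, assuming a 32-instruction limit read off the output rather than the pass's actual constant:

    #include <algorithm>
    #include <cstdio>

    int main() {
      const unsigned MaxClauseLen = 32; // assumption inferred from the checks
      unsigned Remaining = 80;          // loads in long_clause
      while (Remaining > 1) {           // a lone trailing load needs no clause
        unsigned Len = std::min(Remaining, MaxClauseLen);
        std::printf("S_CLAUSE %u  ; bundle of %u loads\n", Len - 1, Len);
        Remaining -= Len;
      }
      return 0; // prints S_CLAUSE 31, S_CLAUSE 31, S_CLAUSE 15
    }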
diff --git a/llvm/test/CodeGen/AMDGPU/pal-userdata-regs.ll b/llvm/test/CodeGen/AMDGPU/pal-userdata-regs.ll
index 6d043e2b6b0a..591deda611b2 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-userdata-regs.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-userdata-regs.ll
@@ -4,12 +4,12 @@
; full tessellation-and-geometry pipeline, compiled on gfx8 so it uses all six
; hardware shader types.
-; CHECK-DAG: 0x2c0c (SPI_SHADER_USER_DATA_PS_0): 0x10000000
-; CHECK-DAG: 0x2c4c (SPI_SHADER_USER_DATA_VS_0): 0x10000000
-; CHECK-DAG: 0x2c8c (SPI_SHADER_USER_DATA_GS_0): 0x10000000
-; CHECK-DAG: 0x2ccc (SPI_SHADER_USER_DATA_ES_0): 0x10000000
-; CHECK-DAG: 0x2d0c (SPI_SHADER_USER_DATA_HS_0): 0x10000000
-; CHECK-DAG: 0x2d4c (SPI_SHADER_USER_DATA_LS_0): 0x10000000
+; CHECK-DAG: '0x2c0c (SPI_SHADER_USER_DATA_PS_0)': 0x10000000
+; CHECK-DAG: '0x2c4c (SPI_SHADER_USER_DATA_VS_0)': 0x10000000
+; CHECK-DAG: '0x2c8c (SPI_SHADER_USER_DATA_GS_0)': 0x10000000
+; CHECK-DAG: '0x2ccc (SPI_SHADER_USER_DATA_ES_0)': 0x10000000
+; CHECK-DAG: '0x2d0c (SPI_SHADER_USER_DATA_HS_0)': 0x10000000
+; CHECK-DAG: '0x2d4c (SPI_SHADER_USER_DATA_LS_0)': 0x10000000
!amdgpu.pal.metadata.msgpack = !{!0}
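
The pal-userdata-regs.ll update only tracks a formatting change in the disassembled PAL metadata: register keys such as 0x2c0c (SPI_SHADER_USER_DATA_PS_0) contain spaces and parentheses and are now printed as single-quoted YAML scalars. A purely illustrative sketch of such a quoting rule, an assumption rather than the actual llvm::yaml logic:

    #include <cstdio>
    #include <string>

    // Illustrative only: quote any key containing spaces or parentheses.
    static bool needsSingleQuotes(const std::string &Key) {
      return Key.find_first_of(" ()") != std::string::npos;
    }

    int main() {
      const std::string Key = "0x2c0c (SPI_SHADER_USER_DATA_PS_0)";
      std::printf("%s: 0x10000000\n",
                  (needsSingleQuotes(Key) ? "'" + Key + "'" : Key).c_str());
      return 0;
    }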
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index 6ba66ccf7186..b068d87c4d6f 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -242,130 +242,137 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0: ; %bb.0: ; %_udiv-special-cases
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v1
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: s_mov_b32 s4, 63
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15
-; GFX9-O0-NEXT: v_ashrrev_i64 v[12:13], s4, v[6:7]
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
+; GFX9-O0-NEXT: v_ashrrev_i64 v[11:12], s4, v[10:11]
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9
-; GFX9-O0-NEXT: v_ashrrev_i64 v[6:7], s4, v[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v15
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13
-; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v10
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14
-; GFX9-O0-NEXT: v_xor_b32_e64 v13, v11, v12
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6
+; GFX9-O0-NEXT: v_ashrrev_i64 v[15:16], s4, v[13:14]
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12
+; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v10
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v11
+; GFX9-O0-NEXT: v_xor_b32_e64 v13, v8, v12
; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v10
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_xor_b32_e64 v15, v4, v12
-; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7
-; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_xor_b32_e64 v7, v5, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
+; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v10
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v12
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16
+; GFX9-O0-NEXT: v_xor_b32_e64 v9, v8, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v15
+; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, v6
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v4
-; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT: v_xor_b32_e64 v2, v2, v6
-; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v16
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-O0-NEXT: v_xor_b32_e64 v9, v9, v3
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, v6
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v14
; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v12
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v10, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v11, v12, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v10, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5
+; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
-; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec
+; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8
; GFX9-O0-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v6
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v3, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v5, v6, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v2, v3, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -438,7 +445,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[6:7]
+; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
@@ -474,7 +482,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[6:7]
+; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
@@ -589,27 +598,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-O0-NEXT: ; %bb.2: ; %Flow
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(6)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_5
; GFX9-O0-NEXT: .LBB0_3: ; %Flow2
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
@@ -624,22 +633,22 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_9
; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b32 s4, 1
; GFX9-O0-NEXT: s_waitcnt vmcnt(2)
; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1]
@@ -679,27 +688,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6
; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_4
; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while
; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -709,30 +718,30 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8
; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b32 s4, 63
; GFX9-O0-NEXT: s_waitcnt vmcnt(16)
; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3]
@@ -872,24 +881,24 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v13
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4
; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5
@@ -899,42 +908,42 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6
; GFX9-O0-NEXT: s_branch .LBB0_1
; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
@@ -1018,12 +1027,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17
; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v15, s9
; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8
@@ -1034,30 +1043,30 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_6
; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
@@ -1099,14 +1108,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f
; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4
; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12]
@@ -1152,12 +1161,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10
; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v4
@@ -1172,18 +1181,18 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec
; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7]
@@ -1203,18 +1212,18 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b32 s4, 32
; GFX9-O0-NEXT: s_waitcnt vmcnt(2)
; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[5:6]
@@ -1486,11 +1495,11 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5
; GFX9-O0-NEXT: ; kill: killed $vgpr4
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll b/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
index e73235857728..29520cb7468c 100644
--- a/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
@@ -21,10 +21,10 @@
; VI-NEXT: .vgpr_count: 0x5
; GFX9-NEXT: .vgpr_count: 0x5
; GCN-NEXT: .registers:
-; SI-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0x{{[0-9a-f]*}}81
-; VI-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0x{{[0-9a-f]*}}c1
-; GFX9-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0x{{[0-9a-f]*}}81
-; GCN-NEXT: 0x2e13 (COMPUTE_PGM_RSRC2): 0
+; SI-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x{{[0-9a-f]*}}81
+; VI-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x{{[0-9a-f]*}}c1
+; GFX9-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x{{[0-9a-f]*}}81
+; GCN-NEXT: '0x2e13 (COMPUTE_PGM_RSRC2)': 0
; GCN-NEXT: ...
; GCN-NEXT: .end_amdgpu_pal_metadata
diff --git a/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll b/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll
index be741f536ac7..528bfe041173 100644
--- a/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll
+++ b/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll
@@ -46,12 +46,10 @@ define float @fminnum32_intrinsic(float %x, float %y) {
define float @fminnum32_nsz_intrinsic(float %x, float %y) {
; ARMV7-LABEL: fminnum32_nsz_intrinsic:
; ARMV7: @ %bb.0:
-; ARMV7-NEXT: vmov s0, r0
-; ARMV7-NEXT: vmov s2, r1
-; ARMV7-NEXT: vcmp.f32 s0, s2
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vmovlt.f32 s2, s0
-; ARMV7-NEXT: vmov r0, s2
+; ARMV7-NEXT: vmov s0, r1
+; ARMV7-NEXT: vmov s2, r0
+; ARMV7-NEXT: vmin.f32 d0, d1, d0
+; ARMV7-NEXT: vmov r0, s0
; ARMV7-NEXT: bx lr
;
; ARMV8-LABEL: fminnum32_nsz_intrinsic:
@@ -78,9 +76,7 @@ define float @fminnum32_non_zero_intrinsic(float %x) {
; ARMV7: @ %bb.0:
; ARMV7-NEXT: vmov.f32 s0, #-1.000000e+00
; ARMV7-NEXT: vmov s2, r0
-; ARMV7-NEXT: vcmp.f32 s2, s0
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vmovlt.f32 s0, s2
+; ARMV7-NEXT: vmin.f32 d0, d1, d0
; ARMV7-NEXT: vmov r0, s0
; ARMV7-NEXT: bx lr
;
@@ -136,12 +132,10 @@ define float @fmaxnum32_intrinsic(float %x, float %y) {
define float @fmaxnum32_nsz_intrinsic(float %x, float %y) {
; ARMV7-LABEL: fmaxnum32_nsz_intrinsic:
; ARMV7: @ %bb.0:
-; ARMV7-NEXT: vmov s0, r0
-; ARMV7-NEXT: vmov s2, r1
-; ARMV7-NEXT: vcmp.f32 s0, s2
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vmovgt.f32 s2, s0
-; ARMV7-NEXT: vmov r0, s2
+; ARMV7-NEXT: vmov s0, r1
+; ARMV7-NEXT: vmov s2, r0
+; ARMV7-NEXT: vmax.f32 d0, d1, d0
+; ARMV7-NEXT: vmov r0, s0
; ARMV7-NEXT: bx lr
;
; ARMV8-LABEL: fmaxnum32_nsz_intrinsic:
@@ -210,9 +204,7 @@ define float @fmaxnum32_non_zero_intrinsic(float %x) {
; ARMV7: @ %bb.0:
; ARMV7-NEXT: vmov.f32 s0, #1.000000e+00
; ARMV7-NEXT: vmov s2, r0
-; ARMV7-NEXT: vcmp.f32 s2, s0
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vmovgt.f32 s0, s2
+; ARMV7-NEXT: vmax.f32 d0, d1, d0
; ARMV7-NEXT: vmov r0, s0
; ARMV7-NEXT: bx lr
;
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll
index 8ca8a6602737..024ed04f6e5e 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll
@@ -108,8 +108,8 @@ define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long 20
; CHECK-NEXT: .long 20
-; CHECK-NEXT: .long 124
-; CHECK-NEXT: .long 144
+; CHECK-NEXT: .long 108
+; CHECK-NEXT: .long 128
; CHECK-NEXT: .long 28
; CHECK-NEXT: .long 8 # FuncInfo
diff --git a/llvm/test/CodeGen/LoongArch/sextw-removal.ll b/llvm/test/CodeGen/LoongArch/sextw-removal.ll
new file mode 100644
index 000000000000..6db9c1608b3c
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/sextw-removal.ll
@@ -0,0 +1,921 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mtriple=loongarch64 | FileCheck %s --check-prefixes=CHECK
+
+define void @test1(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test1:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: addi.d $sp, $sp, -32
+; CHECK-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $s0, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT: move $fp, $a1
+; CHECK-NEXT: sra.w $s0, $a0, $a1
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB0_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a0, $s0, 0
+; CHECK-NEXT: bl %plt(bar)
+; CHECK-NEXT: sll.w $s0, $s0, $fp
+; CHECK-NEXT: bnez $a0, .LBB0_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 32
+; CHECK-NEXT: ret
+bb:
+ %i = ashr i32 %arg, %arg1
+ br label %bb2
+
+bb2: ; preds = %bb2, %bb
+ %i3 = phi i32 [ %i, %bb ], [ %i5, %bb2 ]
+ %i4 = tail call signext i32 @bar(i32 signext %i3)
+ %i5 = shl i32 %i3, %arg1
+ %i6 = icmp eq i32 %i4, 0
+ br i1 %i6, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ ret void
+}
+
+declare signext i32 @bar(i32 signext)
+
+define signext i32 @test2(ptr %p, i32 signext %b) nounwind {
+; CHECK-LABEL: test2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.w $a0, $a0, 0
+; CHECK-NEXT: ori $a2, $zero, 1
+; CHECK-NEXT: sll.w $a1, $a2, $a1
+; CHECK-NEXT: andn $a0, $a0, $a1
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: ret
+ %a = load i32, ptr %p
+ %shl = shl i32 1, %b
+ %neg = xor i32 %shl, -1
+ %and1 = and i32 %neg, %a
+ ret i32 %and1
+}
+
+define signext i32 @test3(ptr %p, i32 signext %b) nounwind {
+; CHECK-LABEL: test3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.w $a0, $a0, 0
+; CHECK-NEXT: ori $a2, $zero, 1
+; CHECK-NEXT: sll.w $a1, $a2, $a1
+; CHECK-NEXT: orn $a0, $a0, $a1
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: ret
+ %a = load i32, ptr %p
+ %shl = shl i32 1, %b
+ %neg = xor i32 %shl, -1
+ %and1 = or i32 %neg, %a
+ ret i32 %and1
+}
+
+define signext i32 @test4(ptr %p, i32 signext %b) nounwind {
+; CHECK-LABEL: test4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.w $a0, $a0, 0
+; CHECK-NEXT: ori $a2, $zero, 1
+; CHECK-NEXT: sll.w $a1, $a2, $a1
+; CHECK-NEXT: xor $a0, $a1, $a0
+; CHECK-NEXT: nor $a0, $a0, $zero
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: ret
+ %a = load i32, ptr %p
+ %shl = shl i32 1, %b
+ %neg = xor i32 %shl, -1
+ %and1 = xor i32 %neg, %a
+ ret i32 %and1
+}
+
+define void @test5(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test5:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: addi.d $sp, $sp, -48
+; CHECK-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $s2, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT: sra.w $a1, $a0, $a1
+; CHECK-NEXT: lu12i.w $a0, 349525
+; CHECK-NEXT: ori $fp, $a0, 1365
+; CHECK-NEXT: lu12i.w $a0, 209715
+; CHECK-NEXT: ori $s0, $a0, 819
+; CHECK-NEXT: lu12i.w $a0, 61680
+; CHECK-NEXT: ori $s1, $a0, 3855
+; CHECK-NEXT: lu12i.w $a0, 4112
+; CHECK-NEXT: ori $s2, $a0, 257
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB4_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a0, $a1, 0
+; CHECK-NEXT: bl %plt(bar)
+; CHECK-NEXT: srli.d $a1, $a0, 1
+; CHECK-NEXT: and $a1, $a1, $fp
+; CHECK-NEXT: sub.d $a1, $a0, $a1
+; CHECK-NEXT: and $a2, $a1, $s0
+; CHECK-NEXT: srli.d $a1, $a1, 2
+; CHECK-NEXT: and $a1, $a1, $s0
+; CHECK-NEXT: add.d $a1, $a2, $a1
+; CHECK-NEXT: srli.d $a2, $a1, 4
+; CHECK-NEXT: add.d $a1, $a1, $a2
+; CHECK-NEXT: and $a1, $a1, $s1
+; CHECK-NEXT: mul.d $a1, $a1, $s2
+; CHECK-NEXT: bstrpick.d $a1, $a1, 31, 24
+; CHECK-NEXT: bnez $a0, .LBB4_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: ld.d $s2, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 48
+; CHECK-NEXT: ret
+bb:
+ %i = ashr i32 %arg, %arg1
+ br label %bb2
+
+bb2: ; preds = %bb2, %bb
+ %i3 = phi i32 [ %i, %bb ], [ %i5, %bb2 ]
+ %i4 = tail call signext i32 @bar(i32 signext %i3)
+ %i5 = tail call i32 @llvm.ctpop.i32(i32 %i4)
+ %i6 = icmp eq i32 %i4, 0
+ br i1 %i6, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ ret void
+}
+
+declare i32 @llvm.ctpop.i32(i32)
+
+define void @test6(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test6:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: addi.d $sp, $sp, -32
+; CHECK-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $s0, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT: sra.w $fp, $a0, $a1
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB5_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a0, $fp, 0
+; CHECK-NEXT: bl %plt(baz)
+; CHECK-NEXT: bstrpick.d $s0, $a0, 31, 0
+; CHECK-NEXT: move $a0, $s0
+; CHECK-NEXT: bl %plt(__fixsfsi)
+; CHECK-NEXT: move $fp, $a0
+; CHECK-NEXT: move $a0, $s0
+; CHECK-NEXT: move $a1, $zero
+; CHECK-NEXT: bl %plt(__nesf2)
+; CHECK-NEXT: bnez $a0, .LBB5_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 32
+; CHECK-NEXT: ret
+bb:
+ %i = ashr i32 %arg, %arg1
+ br label %bb2
+
+bb2: ; preds = %bb2, %bb
+ %i3 = phi i32 [ %i, %bb ], [ %i5, %bb2 ]
+ %i4 = tail call float @baz(i32 signext %i3)
+ %i5 = fptosi float %i4 to i32
+ %i6 = fcmp oeq float %i4, zeroinitializer
+ br i1 %i6, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ ret void
+}
+declare float @baz(i32 signext %i3)
+
+define void @test7(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test7:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: addi.d $sp, $sp, -48
+; CHECK-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $s2, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT: sra.w $a0, $a0, $a1
+; CHECK-NEXT: lu12i.w $a1, 349525
+; CHECK-NEXT: ori $a1, $a1, 1365
+; CHECK-NEXT: lu32i.d $a1, 349525
+; CHECK-NEXT: lu52i.d $fp, $a1, 1365
+; CHECK-NEXT: lu12i.w $a1, 209715
+; CHECK-NEXT: ori $a1, $a1, 819
+; CHECK-NEXT: lu32i.d $a1, 209715
+; CHECK-NEXT: lu52i.d $s0, $a1, 819
+; CHECK-NEXT: lu12i.w $a1, 61680
+; CHECK-NEXT: ori $a1, $a1, 3855
+; CHECK-NEXT: lu32i.d $a1, -61681
+; CHECK-NEXT: lu52i.d $s1, $a1, 240
+; CHECK-NEXT: lu12i.w $a1, 4112
+; CHECK-NEXT: ori $a1, $a1, 257
+; CHECK-NEXT: lu32i.d $a1, 65793
+; CHECK-NEXT: lu52i.d $s2, $a1, 16
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB6_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: bl %plt(foo)
+; CHECK-NEXT: srli.d $a1, $a0, 1
+; CHECK-NEXT: and $a1, $a1, $fp
+; CHECK-NEXT: sub.d $a0, $a0, $a1
+; CHECK-NEXT: and $a1, $a0, $s0
+; CHECK-NEXT: srli.d $a0, $a0, 2
+; CHECK-NEXT: and $a0, $a0, $s0
+; CHECK-NEXT: add.d $a0, $a1, $a0
+; CHECK-NEXT: srli.d $a1, $a0, 4
+; CHECK-NEXT: add.d $a0, $a0, $a1
+; CHECK-NEXT: and $a0, $a0, $s1
+; CHECK-NEXT: mul.d $a0, $a0, $s2
+; CHECK-NEXT: srli.d $a0, $a0, 56
+; CHECK-NEXT: bnez $a0, .LBB6_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: ld.d $s2, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 48
+; CHECK-NEXT: ret
+bb:
+ %i = ashr i32 %arg, %arg1
+ br label %bb2
+
+bb2: ; preds = %bb2, %bb
+ %i3 = phi i32 [ %i, %bb ], [ %i6, %bb2 ]
+ %i4 = tail call signext i64 @foo(i32 signext %i3)
+ %i5 = tail call i64 @llvm.ctpop.i64(i64 %i4)
+ %i6 = trunc i64 %i5 to i32
+ %i7 = icmp eq i32 %i6, 0
+ br i1 %i7, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ ret void
+}
+
+declare i64 @llvm.ctpop.i64(i64)
+
+define void @test8(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test8:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: addi.d $sp, $sp, -16
+; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; CHECK-NEXT: sra.w $a0, $a0, $a1
+; CHECK-NEXT: addi.w $fp, $zero, -256
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB7_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: bl %plt(foo)
+; CHECK-NEXT: or $a0, $a0, $fp
+; CHECK-NEXT: bnez $a0, .LBB7_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 16
+; CHECK-NEXT: ret
+bb:
+ %i = ashr i32 %arg, %arg1
+ br label %bb2
+
+bb2: ; preds = %bb2, %bb
+ %i3 = phi i32 [ %i, %bb ], [ %i6, %bb2 ]
+ %i4 = tail call signext i64 @foo(i32 signext %i3)
+ %i5 = or i64 %i4, -256
+ %i6 = trunc i64 %i5 to i32
+ %i7 = icmp eq i32 %i6, 0
+ br i1 %i7, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ ret void
+}
+
+declare i64 @foo(i32 signext)
+
+define void @test9(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test9:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: addi.d $sp, $sp, -16
+; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; CHECK-NEXT: sra.w $a1, $a0, $a1
+; CHECK-NEXT: ori $fp, $zero, 254
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB8_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a0, $a1, 0
+; CHECK-NEXT: bl %plt(bar)
+; CHECK-NEXT: slti $a1, $a0, 255
+; CHECK-NEXT: blt $fp, $a0, .LBB8_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 16
+; CHECK-NEXT: ret
+bb:
+ %i = ashr i32 %arg, %arg1
+ br label %bb2
+
+bb2: ; preds = %bb2, %bb
+ %i3 = phi i32 [ %i, %bb ], [ %i7, %bb2 ]
+ %i4 = tail call signext i32 @bar(i32 signext %i3)
+ %i5 = icmp slt i32 %i4, 255
+ %i6 = sext i1 %i5 to i32
+ %i7 = sub i32 0, %i6
+ br i1 %i5, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ ret void
+}
+
+define void @test10(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test10:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: addi.d $sp, $sp, -16
+; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; CHECK-NEXT: sra.w $fp, $a0, $a1
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB9_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a0, $fp, 0
+; CHECK-NEXT: bl %plt(baz)
+; CHECK-NEXT: move $fp, $a0
+; CHECK-NEXT: bstrpick.d $a0, $a0, 31, 0
+; CHECK-NEXT: move $a1, $zero
+; CHECK-NEXT: bl %plt(__nesf2)
+; CHECK-NEXT: bnez $a0, .LBB9_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 16
+; CHECK-NEXT: ret
+bb:
+ %i = ashr i32 %arg, %arg1
+ br label %bb2
+
+bb2: ; preds = %bb2, %bb
+ %i3 = phi i32 [ %i, %bb ], [ %i5, %bb2 ]
+ %i4 = tail call float @baz(i32 signext %i3)
+ %i5 = bitcast float %i4 to i32
+ %i6 = fcmp oeq float %i4, zeroinitializer
+ br i1 %i6, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ ret void
+}
+
+define signext i32 @test11(i64 %arg1, i64 %arg2, i64 %arg3) {
+; CHECK-LABEL: test11:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi.d $a2, $a2, -1
+; CHECK-NEXT: ori $a3, $zero, 256
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB10_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: andi $a0, $a0, 1234
+; CHECK-NEXT: addi.d $a2, $a2, 1
+; CHECK-NEXT: add.d $a0, $a0, $a1
+; CHECK-NEXT: bltu $a2, $a3, .LBB10_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ br label %bb2
+
+bb2: ; preds = %bb2, %entry
+ %i1 = phi i64 [ %arg1, %entry ], [ %i5, %bb2 ]
+ %i2 = phi i64 [ %arg3, %entry ], [ %i3, %bb2 ]
+ %i3 = add i64 %i2, 1
+ %i4 = and i64 %i1, 1234
+ %i5 = add i64 %i4, %arg2
+ %i6 = icmp ugt i64 %i2, 255
+ br i1 %i6, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ %i7 = trunc i64 %i5 to i32
+ ret i32 %i7
+}
+
+define signext i32 @test12(i64 %arg1, i64 %arg2, i64 %arg3) {
+; CHECK-LABEL: test12:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi.d $a2, $a2, -1
+; CHECK-NEXT: ori $a3, $zero, 256
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB11_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: xor $a0, $a0, $a1
+; CHECK-NEXT: mul.d $a4, $a0, $a1
+; CHECK-NEXT: add.d $a0, $a0, $a4
+; CHECK-NEXT: and $a4, $a4, $a0
+; CHECK-NEXT: addi.d $a2, $a2, 1
+; CHECK-NEXT: add.d $a0, $a4, $a1
+; CHECK-NEXT: bltu $a2, $a3, .LBB11_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: addi.w $a0, $a4, 0
+; CHECK-NEXT: ret
+entry:
+ br label %bb2
+
+bb2: ; preds = %bb2, %entry
+ %i1 = phi i64 [ %arg1, %entry ], [ %i6, %bb2 ]
+ %i2 = phi i64 [ %arg3, %entry ], [ %i3, %bb2 ]
+ %i3 = add i64 %i2, 1
+ %i4 = xor i64 %i1, %arg2
+ %i5 = mul i64 %i4, %arg2
+ %i9 = add i64 %i4, %i5
+ %i8 = and i64 %i5, %i9
+ %i6 = add i64 %i8, %arg2
+ %i7 = icmp ugt i64 %i2, 255
+ br i1 %i7, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ %r = trunc i64 %i8 to i32
+ ret i32 %r
+}
+
+define signext i32 @test13(i64 %arg1, i64 %arg2, i64 %arg3) {
+; CHECK-LABEL: test13:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi.d $a2, $a2, -1
+; CHECK-NEXT: ori $a3, $zero, 256
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB12_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: div.d $a0, $a0, $a1
+; CHECK-NEXT: addi.d $a2, $a2, 1
+; CHECK-NEXT: add.d $a0, $a0, $a1
+; CHECK-NEXT: bltu $a2, $a3, .LBB12_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ br label %bb2
+
+bb2: ; preds = %bb2, %entry
+ %i1 = phi i64 [ %arg1, %entry ], [ %i5, %bb2 ]
+ %i2 = phi i64 [ %arg3, %entry ], [ %i3, %bb2 ]
+ %i3 = add i64 %i2, 1
+ %i4 = sdiv i64 %i1, %arg2
+ %i5 = add i64 %i4, %arg2
+ %i6 = icmp ugt i64 %i2, 255
+ br i1 %i6, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ %i8 = trunc i64 %i5 to i32
+ ret i32 %i8
+}
+
+
+define signext i32 @test14(i32 signext %0, i32 signext %1) {
+; CHECK-LABEL: test14:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori $a2, $zero, 2
+; CHECK-NEXT: blt $a1, $a2, .LBB13_4
+; CHECK-NEXT: # %bb.1: # %.preheader
+; CHECK-NEXT: ori $a3, $zero, 1
+; CHECK-NEXT: addi.w $a2, $zero, -1
+; CHECK-NEXT: lu32i.d $a2, 0
+; CHECK-NEXT: ori $a4, $zero, 1000
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB13_2: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a5, $a0, 0
+; CHECK-NEXT: blt $a4, $a5, .LBB13_5
+; CHECK-NEXT: # %bb.3: # in Loop: Header=BB13_2 Depth=1
+; CHECK-NEXT: add.d $a0, $a3, $a0
+; CHECK-NEXT: addi.w $a3, $a3, 1
+; CHECK-NEXT: blt $a3, $a1, .LBB13_2
+; CHECK-NEXT: .LBB13_4:
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB13_5:
+; CHECK-NEXT: addi.w $a0, $a2, 0
+; CHECK-NEXT: ret
+ %3 = icmp sgt i32 %1, 1
+ br i1 %3, label %4, label %12
+
+4: ; preds = %2, %8
+ %5 = phi i32 [ %10, %8 ], [ 1, %2 ]
+ %6 = phi i32 [ %9, %8 ], [ %0, %2 ]
+ %7 = icmp sgt i32 %6, 1000
+ br i1 %7, label %12, label %8
+
+8: ; preds = %4
+ %9 = add nsw i32 %5, %6
+ %10 = add nuw nsw i32 %5, 1
+ %11 = icmp slt i32 %10, %1
+ br i1 %11, label %4, label %12
+
+12: ; preds = %8, %4, %2
+ %13 = phi i32 [ %0, %2 ], [ -1, %4 ], [ %9, %8 ]
+ ret i32 %13
+}
+
+define signext i32 @test14b(i32 %0, i32 signext %1) {
+; CHECK-LABEL: test14b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori $a2, $zero, 2
+; CHECK-NEXT: blt $a1, $a2, .LBB14_4
+; CHECK-NEXT: # %bb.1: # %.preheader
+; CHECK-NEXT: ori $a3, $zero, 1
+; CHECK-NEXT: addi.w $a2, $zero, -1
+; CHECK-NEXT: lu32i.d $a2, 0
+; CHECK-NEXT: ori $a4, $zero, 1000
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB14_2: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a5, $a0, 0
+; CHECK-NEXT: blt $a4, $a5, .LBB14_5
+; CHECK-NEXT: # %bb.3: # in Loop: Header=BB14_2 Depth=1
+; CHECK-NEXT: add.d $a0, $a3, $a0
+; CHECK-NEXT: addi.w $a3, $a3, 1
+; CHECK-NEXT: blt $a3, $a1, .LBB14_2
+; CHECK-NEXT: .LBB14_4:
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB14_5:
+; CHECK-NEXT: addi.w $a0, $a2, 0
+; CHECK-NEXT: ret
+ %3 = icmp sgt i32 %1, 1
+ br i1 %3, label %4, label %12
+
+4: ; preds = %2, %8
+ %5 = phi i32 [ %10, %8 ], [ 1, %2 ]
+ %6 = phi i32 [ %9, %8 ], [ %0, %2 ]
+ %7 = icmp sgt i32 %6, 1000
+ br i1 %7, label %12, label %8
+
+8: ; preds = %4
+ %9 = add nsw i32 %5, %6
+ %10 = add nuw nsw i32 %5, 1
+ %11 = icmp slt i32 %10, %1
+ br i1 %11, label %4, label %12
+
+12: ; preds = %8, %4, %2
+ %13 = phi i32 [ %0, %2 ], [ -1, %4 ], [ %9, %8 ]
+ ret i32 %13
+}
+
+define signext i32 @test14c(i32 zeroext %0, i32 signext %1) {
+; CHECK-LABEL: test14c:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori $a2, $zero, 2
+; CHECK-NEXT: blt $a1, $a2, .LBB15_4
+; CHECK-NEXT: # %bb.1: # %.preheader
+; CHECK-NEXT: ori $a3, $zero, 1
+; CHECK-NEXT: addi.w $a2, $zero, -1
+; CHECK-NEXT: lu32i.d $a2, 0
+; CHECK-NEXT: ori $a4, $zero, 1000
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB15_2: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a5, $a0, 0
+; CHECK-NEXT: blt $a4, $a5, .LBB15_5
+; CHECK-NEXT: # %bb.3: # in Loop: Header=BB15_2 Depth=1
+; CHECK-NEXT: add.d $a0, $a3, $a0
+; CHECK-NEXT: addi.w $a3, $a3, 1
+; CHECK-NEXT: blt $a3, $a1, .LBB15_2
+; CHECK-NEXT: .LBB15_4:
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB15_5:
+; CHECK-NEXT: addi.w $a0, $a2, 0
+; CHECK-NEXT: ret
+ %3 = icmp sgt i32 %1, 1
+ br i1 %3, label %4, label %12
+
+4: ; preds = %2, %8
+ %5 = phi i32 [ %10, %8 ], [ 1, %2 ]
+ %6 = phi i32 [ %9, %8 ], [ %0, %2 ]
+ %7 = icmp sgt i32 %6, 1000
+ br i1 %7, label %12, label %8
+
+8: ; preds = %4
+ %9 = add nsw i32 %5, %6
+ %10 = add nuw nsw i32 %5, 1
+ %11 = icmp slt i32 %10, %1
+ br i1 %11, label %4, label %12
+
+12: ; preds = %8, %4, %2
+ %13 = phi i32 [ %0, %2 ], [ -1, %4 ], [ %9, %8 ]
+ ret i32 %13
+}
+
+define signext i32 @test14d(i31 zeroext %0, i32 signext %1) {
+; CHECK-LABEL: test14d:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori $a2, $zero, 2
+; CHECK-NEXT: blt $a1, $a2, .LBB16_4
+; CHECK-NEXT: # %bb.1: # %.preheader
+; CHECK-NEXT: ori $a3, $zero, 1
+; CHECK-NEXT: addi.w $a2, $zero, -1
+; CHECK-NEXT: lu32i.d $a2, 0
+; CHECK-NEXT: ori $a4, $zero, 1000
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB16_2: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a5, $a0, 0
+; CHECK-NEXT: blt $a4, $a5, .LBB16_5
+; CHECK-NEXT: # %bb.3: # in Loop: Header=BB16_2 Depth=1
+; CHECK-NEXT: add.d $a0, $a3, $a0
+; CHECK-NEXT: addi.w $a3, $a3, 1
+; CHECK-NEXT: blt $a3, $a1, .LBB16_2
+; CHECK-NEXT: .LBB16_4:
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB16_5:
+; CHECK-NEXT: addi.w $a0, $a2, 0
+; CHECK-NEXT: ret
+ %zext = zext i31 %0 to i32
+ %3 = icmp sgt i32 %1, 1
+ br i1 %3, label %4, label %12
+
+4: ; preds = %2, %8
+ %5 = phi i32 [ %10, %8 ], [ 1, %2 ]
+ %6 = phi i32 [ %9, %8 ], [ %zext, %2 ]
+ %7 = icmp sgt i32 %6, 1000
+ br i1 %7, label %12, label %8
+
+8: ; preds = %4
+ %9 = add nsw i32 %5, %6
+ %10 = add nuw nsw i32 %5, 1
+ %11 = icmp slt i32 %10, %1
+ br i1 %11, label %4, label %12
+
+12: ; preds = %8, %4, %2
+ %13 = phi i32 [ %zext, %2 ], [ -1, %4 ], [ %9, %8 ]
+ ret i32 %13
+}
+
+define signext i32 @test15(i64 %arg1, i64 %arg2, i64 %arg3, ptr %arg4) {
+; CHECK-LABEL: test15:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi.d $a2, $a2, -1
+; CHECK-NEXT: ori $a4, $zero, 256
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB17_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: andi $a0, $a0, 1234
+; CHECK-NEXT: add.d $a0, $a0, $a1
+; CHECK-NEXT: addi.d $a2, $a2, 1
+; CHECK-NEXT: st.w $a0, $a3, 0
+; CHECK-NEXT: bltu $a2, $a4, .LBB17_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ br label %bb2
+
+bb2: ; preds = %bb2, %entry
+ %i1 = phi i64 [ %arg1, %entry ], [ %i5, %bb2 ]
+ %i2 = phi i64 [ %arg3, %entry ], [ %i3, %bb2 ]
+ %i3 = add i64 %i2, 1
+ %i4 = and i64 %i1, 1234
+ %i5 = add i64 %i4, %arg2
+ %i8 = trunc i64 %i5 to i32
+ store i32 %i8, ptr %arg4
+ %i6 = icmp ugt i64 %i2, 255
+ br i1 %i6, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ %i7 = trunc i64 %i5 to i32
+ ret i32 %i7
+}
+
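+; The body of @bug is a branch-free count-leading-zeros sequence (successive
+; shift-and-test narrowing). Note the addi.w sign extension of the shifted
+; value that is kept immediately before the final srai.d.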
+define signext i32 @bug(i32 signext %x) {
+; CHECK-LABEL: bug:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: beqz $a0, .LBB18_2
+; CHECK-NEXT: # %bb.1: # %if.end
+; CHECK-NEXT: bstrpick.d $a1, $a0, 31, 16
+; CHECK-NEXT: sltui $a1, $a1, 1
+; CHECK-NEXT: slli.d $a2, $a0, 16
+; CHECK-NEXT: masknez $a0, $a0, $a1
+; CHECK-NEXT: maskeqz $a2, $a2, $a1
+; CHECK-NEXT: or $a0, $a2, $a0
+; CHECK-NEXT: ori $a2, $zero, 32
+; CHECK-NEXT: masknez $a2, $a2, $a1
+; CHECK-NEXT: ori $a3, $zero, 16
+; CHECK-NEXT: maskeqz $a1, $a3, $a1
+; CHECK-NEXT: or $a1, $a1, $a2
+; CHECK-NEXT: bstrpick.d $a2, $a0, 31, 24
+; CHECK-NEXT: sltui $a2, $a2, 1
+; CHECK-NEXT: slli.d $a3, $a0, 8
+; CHECK-NEXT: addi.d $a4, $a1, -8
+; CHECK-NEXT: masknez $a0, $a0, $a2
+; CHECK-NEXT: maskeqz $a3, $a3, $a2
+; CHECK-NEXT: or $a0, $a3, $a0
+; CHECK-NEXT: masknez $a1, $a1, $a2
+; CHECK-NEXT: maskeqz $a2, $a4, $a2
+; CHECK-NEXT: or $a1, $a2, $a1
+; CHECK-NEXT: bstrpick.d $a2, $a0, 31, 28
+; CHECK-NEXT: sltui $a2, $a2, 1
+; CHECK-NEXT: slli.d $a3, $a0, 4
+; CHECK-NEXT: addi.d $a4, $a1, -4
+; CHECK-NEXT: masknez $a0, $a0, $a2
+; CHECK-NEXT: maskeqz $a3, $a3, $a2
+; CHECK-NEXT: or $a0, $a3, $a0
+; CHECK-NEXT: masknez $a1, $a1, $a2
+; CHECK-NEXT: maskeqz $a2, $a4, $a2
+; CHECK-NEXT: or $a1, $a2, $a1
+; CHECK-NEXT: bstrpick.d $a2, $a0, 31, 30
+; CHECK-NEXT: sltui $a2, $a2, 1
+; CHECK-NEXT: slli.d $a3, $a0, 2
+; CHECK-NEXT: addi.d $a4, $a1, -2
+; CHECK-NEXT: masknez $a0, $a0, $a2
+; CHECK-NEXT: maskeqz $a3, $a3, $a2
+; CHECK-NEXT: or $a0, $a3, $a0
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: masknez $a1, $a1, $a2
+; CHECK-NEXT: maskeqz $a2, $a4, $a2
+; CHECK-NEXT: or $a1, $a2, $a1
+; CHECK-NEXT: srai.d $a0, $a0, 31
+; CHECK-NEXT: nor $a0, $a0, $zero
+; CHECK-NEXT: add.d $a0, $a1, $a0
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB18_2:
+; CHECK-NEXT: addi.w $a0, $zero, 0
+; CHECK-NEXT: ret
+entry:
+ %tobool.not = icmp eq i32 %x, 0
+ br i1 %tobool.not, label %cleanup, label %if.end
+
+if.end: ; preds = %entry
+ %tobool1.not = icmp ult i32 %x, 65536
+ %shl = shl i32 %x, 16
+ %spec.select = select i1 %tobool1.not, i32 %shl, i32 %x
+ %spec.select43 = select i1 %tobool1.not, i32 16, i32 32
+ %tobool5.not = icmp ult i32 %spec.select, 16777216
+ %shl7 = shl i32 %spec.select, 8
+ %sub8 = add nsw i32 %spec.select43, -8
+ %x.addr.1 = select i1 %tobool5.not, i32 %shl7, i32 %spec.select
+ %r.1 = select i1 %tobool5.not, i32 %sub8, i32 %spec.select43
+ %tobool11.not = icmp ult i32 %x.addr.1, 268435456
+ %shl13 = shl i32 %x.addr.1, 4
+ %sub14 = add nsw i32 %r.1, -4
+ %x.addr.2 = select i1 %tobool11.not, i32 %shl13, i32 %x.addr.1
+ %r.2 = select i1 %tobool11.not, i32 %sub14, i32 %r.1
+ %tobool17.not = icmp ult i32 %x.addr.2, 1073741824
+ %shl19 = shl i32 %x.addr.2, 2
+ %sub20 = add nsw i32 %r.2, -2
+ %x.addr.3 = select i1 %tobool17.not, i32 %shl19, i32 %x.addr.2
+ %r.3 = select i1 %tobool17.not, i32 %sub20, i32 %r.2
+ %x.addr.3.lobit = ashr i32 %x.addr.3, 31
+ %x.addr.3.lobit.not = xor i32 %x.addr.3.lobit, -1
+ %r.4 = add nsw i32 %r.3, %x.addr.3.lobit.not
+ br label %cleanup
+
+cleanup: ; preds = %entry, %if.end
+ %retval.0 = phi i32 [ %r.4, %if.end ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+define void @test16(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test16:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: addi.d $sp, $sp, -32
+; CHECK-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $s0, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT: move $fp, $a1
+; CHECK-NEXT: bl %plt(bar)
+; CHECK-NEXT: move $s0, $a0
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB19_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a0, $s0, 0
+; CHECK-NEXT: bl %plt(bar)
+; CHECK-NEXT: sll.w $s0, $s0, $fp
+; CHECK-NEXT: bnez $a0, .LBB19_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 32
+; CHECK-NEXT: ret
+bb:
+ %i = call signext i32 @bar(i32 signext %arg)
+ br label %bb2
+
+bb2: ; preds = %bb2, %bb
+ %i3 = phi i32 [ %i, %bb ], [ %i5, %bb2 ]
+ %i4 = tail call signext i32 @bar(i32 signext %i3)
+ %i5 = shl i32 %i3, %arg1
+ %i6 = icmp eq i32 %i4, 0
+ br i1 %i6, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ ret void
+}
+
+define void @test17(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test17:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: addi.d $sp, $sp, -32
+; CHECK-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $s0, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT: move $fp, $a1
+; CHECK-NEXT: bl %plt(bat)
+; CHECK-NEXT: move $s0, $a0
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB20_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a0, $s0, 0
+; CHECK-NEXT: bl %plt(bar)
+; CHECK-NEXT: sll.w $s0, $s0, $fp
+; CHECK-NEXT: bnez $a0, .LBB20_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 32
+; CHECK-NEXT: ret
+bb:
+ %i = call zeroext i16 @bat(i32 signext %arg)
+ %zext = zext i16 %i to i32
+ br label %bb2
+
+bb2: ; preds = %bb2, %bb
+ %i3 = phi i32 [ %zext, %bb ], [ %i5, %bb2 ]
+ %i4 = tail call signext i32 @bar(i32 signext %i3)
+ %i5 = shl i32 %i3, %arg1
+ %i6 = icmp eq i32 %i4, 0
+ br i1 %i6, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ ret void
+}
+
+declare zeroext i16 @bat(i32 signext)
+
+define signext i32 @sextw_sh2add(i1 zeroext %0, ptr %1, i32 signext %2, i32 signext %3, i32 signext %4) {
+; CHECK-LABEL: sextw_sh2add:
+; CHECK: # %bb.0:
+; CHECK-NEXT: alsl.d $a2, $a2, $a3, 2
+; CHECK-NEXT: beqz $a0, .LBB21_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: st.w $a2, $a1, 0
+; CHECK-NEXT: .LBB21_2:
+; CHECK-NEXT: add.w $a0, $a2, $a4
+; CHECK-NEXT: ret
+ %6 = shl i32 %2, 2
+ %7 = add i32 %6, %3
+ br i1 %0, label %8, label %9
+
+8: ; preds = %5
+ store i32 %7, ptr %1, align 4
+ br label %9
+
+9: ; preds = %5, %8
+ %10 = add i32 %7, %4
+ ret i32 %10
+}
+
+define signext i32 @test19(i64 %arg, i1 zeroext %c1, i1 zeroext %c2, ptr %p) nounwind {
+; CHECK-LABEL: test19:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: addi.d $sp, $sp, -16
+; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; CHECK-NEXT: ori $a0, $zero, 35
+; CHECK-NEXT: lu32i.d $a0, 1
+; CHECK-NEXT: maskeqz $fp, $a0, $a1
+; CHECK-NEXT: st.d $fp, $a3, 0
+; CHECK-NEXT: beqz $a2, .LBB22_2
+; CHECK-NEXT: # %bb.1: # %bb2
+; CHECK-NEXT: move $a0, $zero
+; CHECK-NEXT: bl %plt(bar)
+; CHECK-NEXT: move $fp, $a0
+; CHECK-NEXT: .LBB22_2: # %bb7
+; CHECK-NEXT: bl %plt(side_effect)
+; CHECK-NEXT: addi.w $a0, $fp, 0
+; CHECK-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 16
+; CHECK-NEXT: ret
+bb:
+ %sel = select i1 %c1, i64 4294967331, i64 0
+ store i64 %sel, ptr %p, align 8
+ br i1 %c2, label %bb2, label %bb7
+
+bb2: ; preds = %bb
+ %i4 = call signext i32 @bar(i32 0)
+ %i4.sext = sext i32 %i4 to i64
+ br label %bb7
+
+bb7: ; preds = %bb, %bb2
+ %phi = phi i64 [ %sel, %bb ], [ %i4.sext, %bb2 ]
+ %trunc = trunc i64 %phi to i32
+ call void @side_effect()
+ ret i32 %trunc
+}
+
+declare void @side_effect()
diff --git a/llvm/test/CodeGen/PowerPC/fminimum-fmaximum-f128.ll b/llvm/test/CodeGen/PowerPC/fminimum-fmaximum-f128.ll
new file mode 100644
index 000000000000..6d9eb1337682
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/fminimum-fmaximum-f128.ll
@@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 < %s | FileCheck %s
+
+define fp128 @f128_minimum(fp128 %a, fp128 %b) {
+; CHECK-LABEL: f128_minimum:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xscmpuqp 0, 2, 3
+; CHECK-NEXT: vmr 4, 2
+; CHECK-NEXT: bge 0, .LBB0_8
+; CHECK-NEXT: # %bb.1: # %entry
+; CHECK-NEXT: bun 0, .LBB0_9
+; CHECK-NEXT: .LBB0_2: # %entry
+; CHECK-NEXT: xststdcqp 0, 2, 4
+; CHECK-NEXT: bc 4, 2, .LBB0_10
+; CHECK-NEXT: .LBB0_3: # %entry
+; CHECK-NEXT: xststdcqp 0, 3, 4
+; CHECK-NEXT: bc 12, 2, .LBB0_5
+; CHECK-NEXT: .LBB0_4: # %entry
+; CHECK-NEXT: vmr 3, 2
+; CHECK-NEXT: .LBB0_5: # %entry
+; CHECK-NEXT: addis 3, 2, .LCPI0_1@toc@ha
+; CHECK-NEXT: addi 3, 3, .LCPI0_1@toc@l
+; CHECK-NEXT: lxv 34, 0(3)
+; CHECK-NEXT: xscmpuqp 0, 4, 2
+; CHECK-NEXT: beq 0, .LBB0_7
+; CHECK-NEXT: # %bb.6: # %entry
+; CHECK-NEXT: vmr 3, 4
+; CHECK-NEXT: .LBB0_7: # %entry
+; CHECK-NEXT: vmr 2, 3
+; CHECK-NEXT: blr
+; CHECK-NEXT: .LBB0_8: # %entry
+; CHECK-NEXT: vmr 4, 3
+; CHECK-NEXT: bnu 0, .LBB0_2
+; CHECK-NEXT: .LBB0_9:
+; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha
+; CHECK-NEXT: addi 3, 3, .LCPI0_0@toc@l
+; CHECK-NEXT: lxv 36, 0(3)
+; CHECK-NEXT: xststdcqp 0, 2, 4
+; CHECK-NEXT: bc 12, 2, .LBB0_3
+; CHECK-NEXT: .LBB0_10: # %entry
+; CHECK-NEXT: vmr 2, 4
+; CHECK-NEXT: xststdcqp 0, 3, 4
+; CHECK-NEXT: bc 4, 2, .LBB0_4
+; CHECK-NEXT: b .LBB0_5
+entry:
+ %m = call fp128 @llvm.minimum.f128(fp128 %a, fp128 %b)
+ ret fp128 %m
+}
+
+define fp128 @f128_maximum(fp128 %a, fp128 %b) {
+; CHECK-LABEL: f128_maximum:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xscmpuqp 0, 2, 3
+; CHECK-NEXT: vmr 4, 2
+; CHECK-NEXT: ble 0, .LBB1_8
+; CHECK-NEXT: # %bb.1: # %entry
+; CHECK-NEXT: bun 0, .LBB1_9
+; CHECK-NEXT: .LBB1_2: # %entry
+; CHECK-NEXT: xststdcqp 0, 2, 8
+; CHECK-NEXT: bc 4, 2, .LBB1_10
+; CHECK-NEXT: .LBB1_3: # %entry
+; CHECK-NEXT: xststdcqp 0, 3, 8
+; CHECK-NEXT: bc 12, 2, .LBB1_5
+; CHECK-NEXT: .LBB1_4: # %entry
+; CHECK-NEXT: vmr 3, 2
+; CHECK-NEXT: .LBB1_5: # %entry
+; CHECK-NEXT: addis 3, 2, .LCPI1_1@toc@ha
+; CHECK-NEXT: addi 3, 3, .LCPI1_1@toc@l
+; CHECK-NEXT: lxv 34, 0(3)
+; CHECK-NEXT: xscmpuqp 0, 4, 2
+; CHECK-NEXT: beq 0, .LBB1_7
+; CHECK-NEXT: # %bb.6: # %entry
+; CHECK-NEXT: vmr 3, 4
+; CHECK-NEXT: .LBB1_7: # %entry
+; CHECK-NEXT: vmr 2, 3
+; CHECK-NEXT: blr
+; CHECK-NEXT: .LBB1_8: # %entry
+; CHECK-NEXT: vmr 4, 3
+; CHECK-NEXT: bnu 0, .LBB1_2
+; CHECK-NEXT: .LBB1_9:
+; CHECK-NEXT: addis 3, 2, .LCPI1_0@toc@ha
+; CHECK-NEXT: addi 3, 3, .LCPI1_0@toc@l
+; CHECK-NEXT: lxv 36, 0(3)
+; CHECK-NEXT: xststdcqp 0, 2, 8
+; CHECK-NEXT: bc 12, 2, .LBB1_3
+; CHECK-NEXT: .LBB1_10: # %entry
+; CHECK-NEXT: vmr 2, 4
+; CHECK-NEXT: xststdcqp 0, 3, 8
+; CHECK-NEXT: bc 4, 2, .LBB1_4
+; CHECK-NEXT: b .LBB1_5
+entry:
+ %m = call fp128 @llvm.maximum.f128(fp128 %a, fp128 %b)
+ ret fp128 %m
+}
+
+declare fp128 @llvm.minimum.f128(fp128, fp128)
+declare fp128 @llvm.maximum.f128(fp128, fp128)
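+
+; llvm.minimum and llvm.maximum follow IEEE-754-2019 minimum/maximum: NaN
+; operands propagate and -0.0 orders below +0.0, which is why the expansions
+; above need the explicit unordered branches (bun) and test-data-class probes
+; (xststdcqp) for signed zeros.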
diff --git a/llvm/test/CodeGen/PowerPC/fminimum-fmaximum.ll b/llvm/test/CodeGen/PowerPC/fminimum-fmaximum.ll
new file mode 100644
index 000000000000..c33875dbfee4
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/fminimum-fmaximum.ll
@@ -0,0 +1,851 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s --check-prefix=NOVSX
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s --check-prefix=VSX
+; RUN: llc -mtriple=powerpc64-ibm-aix -mcpu=pwr8 < %s | FileCheck %s --check-prefix=AIX
+
+define float @f32_minimum(float %a, float %b) {
+; NOVSX-LABEL: f32_minimum:
+; NOVSX: # %bb.0: # %entry
+; NOVSX-NEXT: fcmpu 0, 1, 2
+; NOVSX-NEXT: fmr 0, 1
+; NOVSX-NEXT: stfs 2, -8(1)
+; NOVSX-NEXT: stfs 1, -4(1)
+; NOVSX-NEXT: bc 12, 0, .LBB0_2
+; NOVSX-NEXT: # %bb.1: # %entry
+; NOVSX-NEXT: fmr 0, 2
+; NOVSX-NEXT: .LBB0_2: # %entry
+; NOVSX-NEXT: lwz 3, -4(1)
+; NOVSX-NEXT: bc 4, 3, .LBB0_4
+; NOVSX-NEXT: # %bb.3:
+; NOVSX-NEXT: addis 4, 2, .LCPI0_0@toc@ha
+; NOVSX-NEXT: lfs 0, .LCPI0_0@toc@l(4)
+; NOVSX-NEXT: .LBB0_4: # %entry
+; NOVSX-NEXT: xoris 3, 3, 32768
+; NOVSX-NEXT: lwz 4, -8(1)
+; NOVSX-NEXT: cmplwi 3, 0
+; NOVSX-NEXT: bc 12, 2, .LBB0_6
+; NOVSX-NEXT: # %bb.5: # %entry
+; NOVSX-NEXT: fmr 1, 0
+; NOVSX-NEXT: .LBB0_6: # %entry
+; NOVSX-NEXT: xoris 3, 4, 32768
+; NOVSX-NEXT: cmplwi 3, 0
+; NOVSX-NEXT: bc 12, 2, .LBB0_8
+; NOVSX-NEXT: # %bb.7: # %entry
+; NOVSX-NEXT: fmr 2, 1
+; NOVSX-NEXT: .LBB0_8: # %entry
+; NOVSX-NEXT: addis 3, 2, .LCPI0_1@toc@ha
+; NOVSX-NEXT: lfs 1, .LCPI0_1@toc@l(3)
+; NOVSX-NEXT: fcmpu 0, 0, 1
+; NOVSX-NEXT: bc 12, 2, .LBB0_10
+; NOVSX-NEXT: # %bb.9: # %entry
+; NOVSX-NEXT: fmr 2, 0
+; NOVSX-NEXT: .LBB0_10: # %entry
+; NOVSX-NEXT: fmr 1, 2
+; NOVSX-NEXT: blr
+;
+; VSX-LABEL: f32_minimum:
+; VSX: # %bb.0: # %entry
+; VSX-NEXT: xscvdpspn 0, 1
+; VSX-NEXT: fcmpu 0, 1, 2
+; VSX-NEXT: xscvdpspn 3, 2
+; VSX-NEXT: mffprwz 3, 0
+; VSX-NEXT: bc 12, 3, .LBB0_2
+; VSX-NEXT: # %bb.1: # %entry
+; VSX-NEXT: xsmindp 0, 1, 2
+; VSX-NEXT: b .LBB0_3
+; VSX-NEXT: .LBB0_2:
+; VSX-NEXT: addis 4, 2, .LCPI0_0@toc@ha
+; VSX-NEXT: lfs 0, .LCPI0_0@toc@l(4)
+; VSX-NEXT: .LBB0_3: # %entry
+; VSX-NEXT: xoris 3, 3, 32768
+; VSX-NEXT: mffprwz 4, 3
+; VSX-NEXT: cmplwi 3, 0
+; VSX-NEXT: bc 12, 2, .LBB0_5
+; VSX-NEXT: # %bb.4: # %entry
+; VSX-NEXT: fmr 1, 0
+; VSX-NEXT: .LBB0_5: # %entry
+; VSX-NEXT: xoris 3, 4, 32768
+; VSX-NEXT: cmplwi 3, 0
+; VSX-NEXT: bc 12, 2, .LBB0_7
+; VSX-NEXT: # %bb.6: # %entry
+; VSX-NEXT: fmr 2, 1
+; VSX-NEXT: .LBB0_7: # %entry
+; VSX-NEXT: xxlxor 1, 1, 1
+; VSX-NEXT: fcmpu 0, 0, 1
+; VSX-NEXT: bc 12, 2, .LBB0_9
+; VSX-NEXT: # %bb.8: # %entry
+; VSX-NEXT: fmr 2, 0
+; VSX-NEXT: .LBB0_9: # %entry
+; VSX-NEXT: fmr 1, 2
+; VSX-NEXT: blr
+;
+; AIX-LABEL: f32_minimum:
+; AIX: # %bb.0: # %entry
+; AIX-NEXT: xscvdpspn 0, 1
+; AIX-NEXT: fcmpu 0, 1, 2
+; AIX-NEXT: xscvdpspn 3, 2
+; AIX-NEXT: mffprwz 3, 0
+; AIX-NEXT: bc 12, 3, L..BB0_2
+; AIX-NEXT: # %bb.1: # %entry
+; AIX-NEXT: xsmindp 0, 1, 2
+; AIX-NEXT: b L..BB0_3
+; AIX-NEXT: L..BB0_2:
+; AIX-NEXT: ld 4, L..C0(2) # %const.0
+; AIX-NEXT: lfs 0, 0(4)
+; AIX-NEXT: L..BB0_3: # %entry
+; AIX-NEXT: xoris 3, 3, 32768
+; AIX-NEXT: mffprwz 4, 3
+; AIX-NEXT: cmplwi 3, 0
+; AIX-NEXT: bc 12, 2, L..BB0_5
+; AIX-NEXT: # %bb.4: # %entry
+; AIX-NEXT: fmr 1, 0
+; AIX-NEXT: L..BB0_5: # %entry
+; AIX-NEXT: xoris 3, 4, 32768
+; AIX-NEXT: cmplwi 3, 0
+; AIX-NEXT: bc 12, 2, L..BB0_7
+; AIX-NEXT: # %bb.6: # %entry
+; AIX-NEXT: fmr 2, 1
+; AIX-NEXT: L..BB0_7: # %entry
+; AIX-NEXT: xxlxor 1, 1, 1
+; AIX-NEXT: fcmpu 0, 0, 1
+; AIX-NEXT: bc 12, 2, L..BB0_9
+; AIX-NEXT: # %bb.8: # %entry
+; AIX-NEXT: fmr 2, 0
+; AIX-NEXT: L..BB0_9: # %entry
+; AIX-NEXT: fmr 1, 2
+; AIX-NEXT: blr
+entry:
+ %m = call float @llvm.minimum.f32(float %a, float %b)
+ ret float %m
+}
+
+define float @f32_maximum(float %a, float %b) {
+; NOVSX-LABEL: f32_maximum:
+; NOVSX: # %bb.0: # %entry
+; NOVSX-NEXT: fcmpu 0, 1, 2
+; NOVSX-NEXT: fmr 0, 1
+; NOVSX-NEXT: stfs 2, -8(1)
+; NOVSX-NEXT: stfs 1, -4(1)
+; NOVSX-NEXT: bc 12, 1, .LBB1_2
+; NOVSX-NEXT: # %bb.1: # %entry
+; NOVSX-NEXT: fmr 0, 2
+; NOVSX-NEXT: .LBB1_2: # %entry
+; NOVSX-NEXT: lwz 3, -4(1)
+; NOVSX-NEXT: bc 4, 3, .LBB1_4
+; NOVSX-NEXT: # %bb.3:
+; NOVSX-NEXT: addis 4, 2, .LCPI1_0@toc@ha
+; NOVSX-NEXT: lfs 0, .LCPI1_0@toc@l(4)
+; NOVSX-NEXT: .LBB1_4: # %entry
+; NOVSX-NEXT: cmpwi 3, 0
+; NOVSX-NEXT: lwz 4, -8(1)
+; NOVSX-NEXT: bc 12, 2, .LBB1_6
+; NOVSX-NEXT: # %bb.5: # %entry
+; NOVSX-NEXT: fmr 1, 0
+; NOVSX-NEXT: .LBB1_6: # %entry
+; NOVSX-NEXT: cmpwi 4, 0
+; NOVSX-NEXT: bc 12, 2, .LBB1_8
+; NOVSX-NEXT: # %bb.7: # %entry
+; NOVSX-NEXT: fmr 2, 1
+; NOVSX-NEXT: .LBB1_8: # %entry
+; NOVSX-NEXT: addis 3, 2, .LCPI1_1@toc@ha
+; NOVSX-NEXT: lfs 1, .LCPI1_1@toc@l(3)
+; NOVSX-NEXT: fcmpu 0, 0, 1
+; NOVSX-NEXT: bc 12, 2, .LBB1_10
+; NOVSX-NEXT: # %bb.9: # %entry
+; NOVSX-NEXT: fmr 2, 0
+; NOVSX-NEXT: .LBB1_10: # %entry
+; NOVSX-NEXT: fmr 1, 2
+; NOVSX-NEXT: blr
+;
+; VSX-LABEL: f32_maximum:
+; VSX: # %bb.0: # %entry
+; VSX-NEXT: xscvdpspn 0, 1
+; VSX-NEXT: fcmpu 0, 1, 2
+; VSX-NEXT: xscvdpspn 3, 2
+; VSX-NEXT: mffprwz 3, 0
+; VSX-NEXT: bc 12, 3, .LBB1_2
+; VSX-NEXT: # %bb.1: # %entry
+; VSX-NEXT: xsmaxdp 0, 1, 2
+; VSX-NEXT: b .LBB1_3
+; VSX-NEXT: .LBB1_2:
+; VSX-NEXT: addis 4, 2, .LCPI1_0@toc@ha
+; VSX-NEXT: lfs 0, .LCPI1_0@toc@l(4)
+; VSX-NEXT: .LBB1_3: # %entry
+; VSX-NEXT: mffprwz 4, 3
+; VSX-NEXT: cmpwi 3, 0
+; VSX-NEXT: bc 12, 2, .LBB1_5
+; VSX-NEXT: # %bb.4: # %entry
+; VSX-NEXT: fmr 1, 0
+; VSX-NEXT: .LBB1_5: # %entry
+; VSX-NEXT: cmpwi 4, 0
+; VSX-NEXT: bc 12, 2, .LBB1_7
+; VSX-NEXT: # %bb.6: # %entry
+; VSX-NEXT: fmr 2, 1
+; VSX-NEXT: .LBB1_7: # %entry
+; VSX-NEXT: xxlxor 1, 1, 1
+; VSX-NEXT: fcmpu 0, 0, 1
+; VSX-NEXT: bc 12, 2, .LBB1_9
+; VSX-NEXT: # %bb.8: # %entry
+; VSX-NEXT: fmr 2, 0
+; VSX-NEXT: .LBB1_9: # %entry
+; VSX-NEXT: fmr 1, 2
+; VSX-NEXT: blr
+;
+; AIX-LABEL: f32_maximum:
+; AIX: # %bb.0: # %entry
+; AIX-NEXT: xscvdpspn 0, 1
+; AIX-NEXT: fcmpu 0, 1, 2
+; AIX-NEXT: xscvdpspn 3, 2
+; AIX-NEXT: mffprwz 3, 0
+; AIX-NEXT: bc 12, 3, L..BB1_2
+; AIX-NEXT: # %bb.1: # %entry
+; AIX-NEXT: xsmaxdp 0, 1, 2
+; AIX-NEXT: b L..BB1_3
+; AIX-NEXT: L..BB1_2:
+; AIX-NEXT: ld 4, L..C1(2) # %const.0
+; AIX-NEXT: lfs 0, 0(4)
+; AIX-NEXT: L..BB1_3: # %entry
+; AIX-NEXT: mffprwz 4, 3
+; AIX-NEXT: cmpwi 3, 0
+; AIX-NEXT: bc 12, 2, L..BB1_5
+; AIX-NEXT: # %bb.4: # %entry
+; AIX-NEXT: fmr 1, 0
+; AIX-NEXT: L..BB1_5: # %entry
+; AIX-NEXT: cmpwi 4, 0
+; AIX-NEXT: bc 12, 2, L..BB1_7
+; AIX-NEXT: # %bb.6: # %entry
+; AIX-NEXT: fmr 2, 1
+; AIX-NEXT: L..BB1_7: # %entry
+; AIX-NEXT: xxlxor 1, 1, 1
+; AIX-NEXT: fcmpu 0, 0, 1
+; AIX-NEXT: bc 12, 2, L..BB1_9
+; AIX-NEXT: # %bb.8: # %entry
+; AIX-NEXT: fmr 2, 0
+; AIX-NEXT: L..BB1_9: # %entry
+; AIX-NEXT: fmr 1, 2
+; AIX-NEXT: blr
+entry:
+ %m = call float @llvm.maximum.f32(float %a, float %b)
+ ret float %m
+}
+
+define double @f64_minimum(double %a, double %b) {
+; NOVSX-LABEL: f64_minimum:
+; NOVSX: # %bb.0: # %entry
+; NOVSX-NEXT: fcmpu 0, 1, 2
+; NOVSX-NEXT: fmr 0, 1
+; NOVSX-NEXT: stfd 2, -16(1)
+; NOVSX-NEXT: stfd 1, -8(1)
+; NOVSX-NEXT: bc 12, 0, .LBB2_2
+; NOVSX-NEXT: # %bb.1: # %entry
+; NOVSX-NEXT: fmr 0, 2
+; NOVSX-NEXT: .LBB2_2: # %entry
+; NOVSX-NEXT: ld 3, -8(1)
+; NOVSX-NEXT: bc 4, 3, .LBB2_4
+; NOVSX-NEXT: # %bb.3:
+; NOVSX-NEXT: addis 4, 2, .LCPI2_0@toc@ha
+; NOVSX-NEXT: lfs 0, .LCPI2_0@toc@l(4)
+; NOVSX-NEXT: .LBB2_4: # %entry
+; NOVSX-NEXT: li 5, 1
+; NOVSX-NEXT: ld 4, -16(1)
+; NOVSX-NEXT: rldic 5, 5, 63, 0
+; NOVSX-NEXT: cmpd 3, 5
+; NOVSX-NEXT: bc 12, 2, .LBB2_6
+; NOVSX-NEXT: # %bb.5: # %entry
+; NOVSX-NEXT: fmr 1, 0
+; NOVSX-NEXT: .LBB2_6: # %entry
+; NOVSX-NEXT: cmpd 4, 5
+; NOVSX-NEXT: bc 12, 2, .LBB2_8
+; NOVSX-NEXT: # %bb.7: # %entry
+; NOVSX-NEXT: fmr 2, 1
+; NOVSX-NEXT: .LBB2_8: # %entry
+; NOVSX-NEXT: addis 3, 2, .LCPI2_1@toc@ha
+; NOVSX-NEXT: lfs 1, .LCPI2_1@toc@l(3)
+; NOVSX-NEXT: fcmpu 0, 0, 1
+; NOVSX-NEXT: bc 12, 2, .LBB2_10
+; NOVSX-NEXT: # %bb.9: # %entry
+; NOVSX-NEXT: fmr 2, 0
+; NOVSX-NEXT: .LBB2_10: # %entry
+; NOVSX-NEXT: fmr 1, 2
+; NOVSX-NEXT: blr
+;
+; VSX-LABEL: f64_minimum:
+; VSX: # %bb.0: # %entry
+; VSX-NEXT: fcmpu 0, 1, 2
+; VSX-NEXT: mffprd 3, 1
+; VSX-NEXT: bc 12, 3, .LBB2_2
+; VSX-NEXT: # %bb.1: # %entry
+; VSX-NEXT: xsmindp 0, 1, 2
+; VSX-NEXT: b .LBB2_3
+; VSX-NEXT: .LBB2_2:
+; VSX-NEXT: addis 4, 2, .LCPI2_0@toc@ha
+; VSX-NEXT: lfs 0, .LCPI2_0@toc@l(4)
+; VSX-NEXT: .LBB2_3: # %entry
+; VSX-NEXT: li 5, 1
+; VSX-NEXT: mffprd 4, 2
+; VSX-NEXT: rldic 5, 5, 63, 0
+; VSX-NEXT: cmpd 3, 5
+; VSX-NEXT: bc 12, 2, .LBB2_5
+; VSX-NEXT: # %bb.4: # %entry
+; VSX-NEXT: fmr 1, 0
+; VSX-NEXT: .LBB2_5: # %entry
+; VSX-NEXT: cmpd 4, 5
+; VSX-NEXT: bc 12, 2, .LBB2_7
+; VSX-NEXT: # %bb.6: # %entry
+; VSX-NEXT: fmr 2, 1
+; VSX-NEXT: .LBB2_7: # %entry
+; VSX-NEXT: xxlxor 1, 1, 1
+; VSX-NEXT: fcmpu 0, 0, 1
+; VSX-NEXT: bc 12, 2, .LBB2_9
+; VSX-NEXT: # %bb.8: # %entry
+; VSX-NEXT: fmr 2, 0
+; VSX-NEXT: .LBB2_9: # %entry
+; VSX-NEXT: fmr 1, 2
+; VSX-NEXT: blr
+;
+; AIX-LABEL: f64_minimum:
+; AIX: # %bb.0: # %entry
+; AIX-NEXT: fcmpu 0, 1, 2
+; AIX-NEXT: mffprd 3, 1
+; AIX-NEXT: bc 12, 3, L..BB2_2
+; AIX-NEXT: # %bb.1: # %entry
+; AIX-NEXT: xsmindp 0, 1, 2
+; AIX-NEXT: b L..BB2_3
+; AIX-NEXT: L..BB2_2:
+; AIX-NEXT: ld 4, L..C2(2) # %const.0
+; AIX-NEXT: lfs 0, 0(4)
+; AIX-NEXT: L..BB2_3: # %entry
+; AIX-NEXT: li 5, 1
+; AIX-NEXT: mffprd 4, 2
+; AIX-NEXT: rldic 5, 5, 63, 0
+; AIX-NEXT: cmpd 3, 5
+; AIX-NEXT: bc 12, 2, L..BB2_5
+; AIX-NEXT: # %bb.4: # %entry
+; AIX-NEXT: fmr 1, 0
+; AIX-NEXT: L..BB2_5: # %entry
+; AIX-NEXT: cmpd 4, 5
+; AIX-NEXT: bc 12, 2, L..BB2_7
+; AIX-NEXT: # %bb.6: # %entry
+; AIX-NEXT: fmr 2, 1
+; AIX-NEXT: L..BB2_7: # %entry
+; AIX-NEXT: xxlxor 1, 1, 1
+; AIX-NEXT: fcmpu 0, 0, 1
+; AIX-NEXT: bc 12, 2, L..BB2_9
+; AIX-NEXT: # %bb.8: # %entry
+; AIX-NEXT: fmr 2, 0
+; AIX-NEXT: L..BB2_9: # %entry
+; AIX-NEXT: fmr 1, 2
+; AIX-NEXT: blr
+entry:
+ %m = call double @llvm.minimum.f64(double %a, double %b)
+ ret double %m
+}
+
+define double @f64_maximum(double %a, double %b) {
+; NOVSX-LABEL: f64_maximum:
+; NOVSX: # %bb.0: # %entry
+; NOVSX-NEXT: fcmpu 0, 1, 2
+; NOVSX-NEXT: fmr 0, 1
+; NOVSX-NEXT: stfd 2, -16(1)
+; NOVSX-NEXT: stfd 1, -8(1)
+; NOVSX-NEXT: bc 12, 1, .LBB3_2
+; NOVSX-NEXT: # %bb.1: # %entry
+; NOVSX-NEXT: fmr 0, 2
+; NOVSX-NEXT: .LBB3_2: # %entry
+; NOVSX-NEXT: ld 3, -8(1)
+; NOVSX-NEXT: bc 4, 3, .LBB3_4
+; NOVSX-NEXT: # %bb.3:
+; NOVSX-NEXT: addis 4, 2, .LCPI3_0@toc@ha
+; NOVSX-NEXT: lfs 0, .LCPI3_0@toc@l(4)
+; NOVSX-NEXT: .LBB3_4: # %entry
+; NOVSX-NEXT: cmpdi 3, 0
+; NOVSX-NEXT: ld 4, -16(1)
+; NOVSX-NEXT: bc 12, 2, .LBB3_6
+; NOVSX-NEXT: # %bb.5: # %entry
+; NOVSX-NEXT: fmr 1, 0
+; NOVSX-NEXT: .LBB3_6: # %entry
+; NOVSX-NEXT: cmpdi 4, 0
+; NOVSX-NEXT: bc 12, 2, .LBB3_8
+; NOVSX-NEXT: # %bb.7: # %entry
+; NOVSX-NEXT: fmr 2, 1
+; NOVSX-NEXT: .LBB3_8: # %entry
+; NOVSX-NEXT: addis 3, 2, .LCPI3_1@toc@ha
+; NOVSX-NEXT: lfs 1, .LCPI3_1@toc@l(3)
+; NOVSX-NEXT: fcmpu 0, 0, 1
+; NOVSX-NEXT: bc 12, 2, .LBB3_10
+; NOVSX-NEXT: # %bb.9: # %entry
+; NOVSX-NEXT: fmr 2, 0
+; NOVSX-NEXT: .LBB3_10: # %entry
+; NOVSX-NEXT: fmr 1, 2
+; NOVSX-NEXT: blr
+;
+; VSX-LABEL: f64_maximum:
+; VSX: # %bb.0: # %entry
+; VSX-NEXT: fcmpu 0, 1, 2
+; VSX-NEXT: mffprd 3, 1
+; VSX-NEXT: bc 12, 3, .LBB3_2
+; VSX-NEXT: # %bb.1: # %entry
+; VSX-NEXT: xsmaxdp 0, 1, 2
+; VSX-NEXT: b .LBB3_3
+; VSX-NEXT: .LBB3_2:
+; VSX-NEXT: addis 4, 2, .LCPI3_0@toc@ha
+; VSX-NEXT: lfs 0, .LCPI3_0@toc@l(4)
+; VSX-NEXT: .LBB3_3: # %entry
+; VSX-NEXT: mffprd 4, 2
+; VSX-NEXT: cmpdi 3, 0
+; VSX-NEXT: bc 12, 2, .LBB3_5
+; VSX-NEXT: # %bb.4: # %entry
+; VSX-NEXT: fmr 1, 0
+; VSX-NEXT: .LBB3_5: # %entry
+; VSX-NEXT: cmpdi 4, 0
+; VSX-NEXT: bc 12, 2, .LBB3_7
+; VSX-NEXT: # %bb.6: # %entry
+; VSX-NEXT: fmr 2, 1
+; VSX-NEXT: .LBB3_7: # %entry
+; VSX-NEXT: xxlxor 1, 1, 1
+; VSX-NEXT: fcmpu 0, 0, 1
+; VSX-NEXT: bc 12, 2, .LBB3_9
+; VSX-NEXT: # %bb.8: # %entry
+; VSX-NEXT: fmr 2, 0
+; VSX-NEXT: .LBB3_9: # %entry
+; VSX-NEXT: fmr 1, 2
+; VSX-NEXT: blr
+;
+; AIX-LABEL: f64_maximum:
+; AIX: # %bb.0: # %entry
+; AIX-NEXT: fcmpu 0, 1, 2
+; AIX-NEXT: mffprd 3, 1
+; AIX-NEXT: bc 12, 3, L..BB3_2
+; AIX-NEXT: # %bb.1: # %entry
+; AIX-NEXT: xsmaxdp 0, 1, 2
+; AIX-NEXT: b L..BB3_3
+; AIX-NEXT: L..BB3_2:
+; AIX-NEXT: ld 4, L..C3(2) # %const.0
+; AIX-NEXT: lfs 0, 0(4)
+; AIX-NEXT: L..BB3_3: # %entry
+; AIX-NEXT: mffprd 4, 2
+; AIX-NEXT: cmpdi 3, 0
+; AIX-NEXT: bc 12, 2, L..BB3_5
+; AIX-NEXT: # %bb.4: # %entry
+; AIX-NEXT: fmr 1, 0
+; AIX-NEXT: L..BB3_5: # %entry
+; AIX-NEXT: cmpdi 4, 0
+; AIX-NEXT: bc 12, 2, L..BB3_7
+; AIX-NEXT: # %bb.6: # %entry
+; AIX-NEXT: fmr 2, 1
+; AIX-NEXT: L..BB3_7: # %entry
+; AIX-NEXT: xxlxor 1, 1, 1
+; AIX-NEXT: fcmpu 0, 0, 1
+; AIX-NEXT: bc 12, 2, L..BB3_9
+; AIX-NEXT: # %bb.8: # %entry
+; AIX-NEXT: fmr 2, 0
+; AIX-NEXT: L..BB3_9: # %entry
+; AIX-NEXT: fmr 1, 2
+; AIX-NEXT: blr
+entry:
+ %m = call double @llvm.maximum.f64(double %a, double %b)
+ ret double %m
+}
+
+define <4 x float> @v4f32_minimum(<4 x float> %a, <4 x float> %b) {
+; NOVSX-LABEL: v4f32_minimum:
+; NOVSX: # %bb.0: # %entry
+; NOVSX-NEXT: vcmpeqfp 0, 3, 3
+; NOVSX-NEXT: vcmpeqfp 1, 2, 2
+; NOVSX-NEXT: addis 3, 2, .LCPI4_0@toc@ha
+; NOVSX-NEXT: addi 3, 3, .LCPI4_0@toc@l
+; NOVSX-NEXT: vnot 0, 0
+; NOVSX-NEXT: vnot 1, 1
+; NOVSX-NEXT: vspltisb 4, -1
+; NOVSX-NEXT: vcmpgtfp 5, 3, 2
+; NOVSX-NEXT: vslw 4, 4, 4
+; NOVSX-NEXT: vor 0, 1, 0
+; NOVSX-NEXT: lvx 1, 0, 3
+; NOVSX-NEXT: vsel 5, 3, 2, 5
+; NOVSX-NEXT: vsel 5, 5, 1, 0
+; NOVSX-NEXT: vcmpequw 0, 2, 4
+; NOVSX-NEXT: vcmpequw 4, 3, 4
+; NOVSX-NEXT: vsel 2, 5, 2, 0
+; NOVSX-NEXT: vsel 2, 2, 3, 4
+; NOVSX-NEXT: vxor 3, 3, 3
+; NOVSX-NEXT: vcmpeqfp 3, 5, 3
+; NOVSX-NEXT: vsel 2, 5, 2, 3
+; NOVSX-NEXT: blr
+;
+; VSX-LABEL: v4f32_minimum:
+; VSX: # %bb.0: # %entry
+; VSX-NEXT: xvcmpeqsp 1, 35, 35
+; VSX-NEXT: xvcmpeqsp 2, 34, 34
+; VSX-NEXT: addis 3, 2, .LCPI4_0@toc@ha
+; VSX-NEXT: xxleqv 36, 36, 36
+; VSX-NEXT: xvminsp 0, 34, 35
+; VSX-NEXT: vslw 4, 4, 4
+; VSX-NEXT: addi 3, 3, .LCPI4_0@toc@l
+; VSX-NEXT: xxlnor 1, 1, 1
+; VSX-NEXT: xxlnor 2, 2, 2
+; VSX-NEXT: vcmpequw 5, 2, 4
+; VSX-NEXT: xxlor 1, 2, 1
+; VSX-NEXT: lxvd2x 2, 0, 3
+; VSX-NEXT: xxsel 0, 0, 2, 1
+; VSX-NEXT: xxlxor 2, 2, 2
+; VSX-NEXT: xvcmpeqsp 2, 0, 2
+; VSX-NEXT: xxsel 1, 0, 34, 37
+; VSX-NEXT: vcmpequw 2, 3, 4
+; VSX-NEXT: xxsel 1, 1, 35, 34
+; VSX-NEXT: xxsel 34, 0, 1, 2
+; VSX-NEXT: blr
+;
+; AIX-LABEL: v4f32_minimum:
+; AIX: # %bb.0: # %entry
+; AIX-NEXT: xvcmpeqsp 1, 35, 35
+; AIX-NEXT: xvcmpeqsp 2, 34, 34
+; AIX-NEXT: ld 3, L..C4(2) # %const.0
+; AIX-NEXT: xxleqv 36, 36, 36
+; AIX-NEXT: xvminsp 0, 34, 35
+; AIX-NEXT: vslw 4, 4, 4
+; AIX-NEXT: xxlnor 1, 1, 1
+; AIX-NEXT: xxlnor 2, 2, 2
+; AIX-NEXT: vcmpequw 5, 2, 4
+; AIX-NEXT: xxlor 1, 2, 1
+; AIX-NEXT: lxvw4x 2, 0, 3
+; AIX-NEXT: xxsel 0, 0, 2, 1
+; AIX-NEXT: xxlxor 2, 2, 2
+; AIX-NEXT: xvcmpeqsp 2, 0, 2
+; AIX-NEXT: xxsel 1, 0, 34, 37
+; AIX-NEXT: vcmpequw 2, 3, 4
+; AIX-NEXT: xxsel 1, 1, 35, 34
+; AIX-NEXT: xxsel 34, 0, 1, 2
+; AIX-NEXT: blr
+entry:
+ %m = call <4 x float> @llvm.minimum.v4f32(<4 x float> %a, <4 x float> %b)
+ ret <4 x float> %m
+}
+
+define <4 x float> @v4f32_maximum(<4 x float> %a, <4 x float> %b) {
+; NOVSX-LABEL: v4f32_maximum:
+; NOVSX: # %bb.0: # %entry
+; NOVSX-NEXT: vcmpeqfp 5, 3, 3
+; NOVSX-NEXT: vcmpeqfp 0, 2, 2
+; NOVSX-NEXT: addis 3, 2, .LCPI5_0@toc@ha
+; NOVSX-NEXT: addi 3, 3, .LCPI5_0@toc@l
+; NOVSX-NEXT: vnot 5, 5
+; NOVSX-NEXT: vnot 0, 0
+; NOVSX-NEXT: vcmpgtfp 4, 2, 3
+; NOVSX-NEXT: vor 5, 0, 5
+; NOVSX-NEXT: lvx 0, 0, 3
+; NOVSX-NEXT: vsel 4, 3, 2, 4
+; NOVSX-NEXT: vsel 4, 4, 0, 5
+; NOVSX-NEXT: vxor 5, 5, 5
+; NOVSX-NEXT: vcmpequw 0, 2, 5
+; NOVSX-NEXT: vsel 2, 4, 2, 0
+; NOVSX-NEXT: vcmpequw 0, 3, 5
+; NOVSX-NEXT: vsel 2, 2, 3, 0
+; NOVSX-NEXT: vcmpeqfp 3, 4, 5
+; NOVSX-NEXT: vsel 2, 4, 2, 3
+; NOVSX-NEXT: blr
+;
+; VSX-LABEL: v4f32_maximum:
+; VSX: # %bb.0: # %entry
+; VSX-NEXT: xvcmpeqsp 1, 35, 35
+; VSX-NEXT: xvcmpeqsp 2, 34, 34
+; VSX-NEXT: addis 3, 2, .LCPI5_0@toc@ha
+; VSX-NEXT: addi 3, 3, .LCPI5_0@toc@l
+; VSX-NEXT: xxlnor 1, 1, 1
+; VSX-NEXT: xxlnor 2, 2, 2
+; VSX-NEXT: xvmaxsp 0, 34, 35
+; VSX-NEXT: xxlxor 36, 36, 36
+; VSX-NEXT: vcmpequw 5, 2, 4
+; VSX-NEXT: xxlor 1, 2, 1
+; VSX-NEXT: lxvd2x 2, 0, 3
+; VSX-NEXT: xxsel 0, 0, 2, 1
+; VSX-NEXT: xvcmpeqsp 2, 0, 36
+; VSX-NEXT: xxsel 1, 0, 34, 37
+; VSX-NEXT: vcmpequw 2, 3, 4
+; VSX-NEXT: xxsel 1, 1, 35, 34
+; VSX-NEXT: xxsel 34, 0, 1, 2
+; VSX-NEXT: blr
+;
+; AIX-LABEL: v4f32_maximum:
+; AIX: # %bb.0: # %entry
+; AIX-NEXT: xvcmpeqsp 1, 35, 35
+; AIX-NEXT: xvcmpeqsp 2, 34, 34
+; AIX-NEXT: ld 3, L..C5(2) # %const.0
+; AIX-NEXT: xvmaxsp 0, 34, 35
+; AIX-NEXT: xxlxor 36, 36, 36
+; AIX-NEXT: xxlnor 1, 1, 1
+; AIX-NEXT: xxlnor 2, 2, 2
+; AIX-NEXT: vcmpequw 5, 2, 4
+; AIX-NEXT: xxlor 1, 2, 1
+; AIX-NEXT: lxvw4x 2, 0, 3
+; AIX-NEXT: xxsel 0, 0, 2, 1
+; AIX-NEXT: xvcmpeqsp 2, 0, 36
+; AIX-NEXT: xxsel 1, 0, 34, 37
+; AIX-NEXT: vcmpequw 2, 3, 4
+; AIX-NEXT: xxsel 1, 1, 35, 34
+; AIX-NEXT: xxsel 34, 0, 1, 2
+; AIX-NEXT: blr
+entry:
+ %m = call <4 x float> @llvm.maximum.v4f32(<4 x float> %a, <4 x float> %b)
+ ret <4 x float> %m
+}
+
+define <2 x double> @v2f64_minimum(<2 x double> %a, <2 x double> %b) {
+; NOVSX-LABEL: v2f64_minimum:
+; NOVSX: # %bb.0: # %entry
+; NOVSX-NEXT: fcmpu 0, 1, 3
+; NOVSX-NEXT: fmr 6, 1
+; NOVSX-NEXT: stfd 4, -16(1)
+; NOVSX-NEXT: stfd 2, -8(1)
+; NOVSX-NEXT: stfd 3, -32(1)
+; NOVSX-NEXT: stfd 1, -24(1)
+; NOVSX-NEXT: bc 12, 0, .LBB6_2
+; NOVSX-NEXT: # %bb.1: # %entry
+; NOVSX-NEXT: fmr 6, 3
+; NOVSX-NEXT: .LBB6_2: # %entry
+; NOVSX-NEXT: addis 3, 2, .LCPI6_0@toc@ha
+; NOVSX-NEXT: ld 4, -24(1)
+; NOVSX-NEXT: lfs 0, .LCPI6_0@toc@l(3)
+; NOVSX-NEXT: fmr 5, 0
+; NOVSX-NEXT: bc 12, 3, .LBB6_4
+; NOVSX-NEXT: # %bb.3: # %entry
+; NOVSX-NEXT: fmr 5, 6
+; NOVSX-NEXT: .LBB6_4: # %entry
+; NOVSX-NEXT: li 3, 1
+; NOVSX-NEXT: ld 5, -32(1)
+; NOVSX-NEXT: rldic 3, 3, 63, 0
+; NOVSX-NEXT: cmpd 4, 3
+; NOVSX-NEXT: bc 12, 2, .LBB6_6
+; NOVSX-NEXT: # %bb.5: # %entry
+; NOVSX-NEXT: fmr 1, 5
+; NOVSX-NEXT: .LBB6_6: # %entry
+; NOVSX-NEXT: cmpd 5, 3
+; NOVSX-NEXT: bc 12, 2, .LBB6_8
+; NOVSX-NEXT: # %bb.7: # %entry
+; NOVSX-NEXT: fmr 3, 1
+; NOVSX-NEXT: .LBB6_8: # %entry
+; NOVSX-NEXT: addis 4, 2, .LCPI6_1@toc@ha
+; NOVSX-NEXT: lfs 1, .LCPI6_1@toc@l(4)
+; NOVSX-NEXT: fcmpu 0, 5, 1
+; NOVSX-NEXT: bc 12, 2, .LBB6_10
+; NOVSX-NEXT: # %bb.9: # %entry
+; NOVSX-NEXT: fmr 3, 5
+; NOVSX-NEXT: .LBB6_10: # %entry
+; NOVSX-NEXT: fcmpu 0, 2, 4
+; NOVSX-NEXT: fmr 5, 2
+; NOVSX-NEXT: bc 12, 0, .LBB6_12
+; NOVSX-NEXT: # %bb.11: # %entry
+; NOVSX-NEXT: fmr 5, 4
+; NOVSX-NEXT: .LBB6_12: # %entry
+; NOVSX-NEXT: ld 5, -8(1)
+; NOVSX-NEXT: bc 12, 3, .LBB6_14
+; NOVSX-NEXT: # %bb.13: # %entry
+; NOVSX-NEXT: fmr 0, 5
+; NOVSX-NEXT: .LBB6_14: # %entry
+; NOVSX-NEXT: cmpd 5, 3
+; NOVSX-NEXT: ld 4, -16(1)
+; NOVSX-NEXT: bc 4, 2, .LBB6_19
+; NOVSX-NEXT: # %bb.15: # %entry
+; NOVSX-NEXT: cmpd 4, 3
+; NOVSX-NEXT: bc 4, 2, .LBB6_20
+; NOVSX-NEXT: .LBB6_16: # %entry
+; NOVSX-NEXT: fcmpu 0, 0, 1
+; NOVSX-NEXT: bc 12, 2, .LBB6_18
+; NOVSX-NEXT: .LBB6_17: # %entry
+; NOVSX-NEXT: fmr 4, 0
+; NOVSX-NEXT: .LBB6_18: # %entry
+; NOVSX-NEXT: fmr 1, 3
+; NOVSX-NEXT: fmr 2, 4
+; NOVSX-NEXT: blr
+; NOVSX-NEXT: .LBB6_19: # %entry
+; NOVSX-NEXT: fmr 2, 0
+; NOVSX-NEXT: cmpd 4, 3
+; NOVSX-NEXT: bc 12, 2, .LBB6_16
+; NOVSX-NEXT: .LBB6_20: # %entry
+; NOVSX-NEXT: fmr 4, 2
+; NOVSX-NEXT: fcmpu 0, 0, 1
+; NOVSX-NEXT: bc 4, 2, .LBB6_17
+; NOVSX-NEXT: b .LBB6_18
+;
+; VSX-LABEL: v2f64_minimum:
+; VSX: # %bb.0: # %entry
+; VSX-NEXT: addis 3, 2, .LCPI6_0@toc@ha
+; VSX-NEXT: xvcmpeqdp 36, 35, 35
+; VSX-NEXT: xvcmpeqdp 37, 34, 34
+; VSX-NEXT: addi 3, 3, .LCPI6_0@toc@l
+; VSX-NEXT: xxlnor 36, 36, 36
+; VSX-NEXT: xxlnor 37, 37, 37
+; VSX-NEXT: xvmindp 0, 34, 35
+; VSX-NEXT: lxvd2x 2, 0, 3
+; VSX-NEXT: addis 3, 2, .LCPI6_1@toc@ha
+; VSX-NEXT: xxlor 1, 37, 36
+; VSX-NEXT: addi 3, 3, .LCPI6_1@toc@l
+; VSX-NEXT: lxvd2x 36, 0, 3
+; VSX-NEXT: vcmpequd 5, 2, 4
+; VSX-NEXT: xxsel 0, 0, 2, 1
+; VSX-NEXT: xxlxor 2, 2, 2
+; VSX-NEXT: xxsel 1, 0, 34, 37
+; VSX-NEXT: vcmpequd 2, 3, 4
+; VSX-NEXT: xxsel 1, 1, 35, 34
+; VSX-NEXT: xvcmpeqdp 34, 0, 2
+; VSX-NEXT: xxsel 34, 0, 1, 34
+; VSX-NEXT: blr
+;
+; AIX-LABEL: v2f64_minimum:
+; AIX: # %bb.0: # %entry
+; AIX-NEXT: ld 3, L..C6(2) # %const.0
+; AIX-NEXT: xvcmpeqdp 36, 35, 35
+; AIX-NEXT: xvcmpeqdp 37, 34, 34
+; AIX-NEXT: lxvd2x 2, 0, 3
+; AIX-NEXT: ld 3, L..C7(2) # %const.1
+; AIX-NEXT: xxlnor 36, 36, 36
+; AIX-NEXT: xxlnor 37, 37, 37
+; AIX-NEXT: xvmindp 0, 34, 35
+; AIX-NEXT: xxlor 1, 37, 36
+; AIX-NEXT: lxvd2x 36, 0, 3
+; AIX-NEXT: vcmpequd 5, 2, 4
+; AIX-NEXT: xxsel 0, 0, 2, 1
+; AIX-NEXT: xxlxor 2, 2, 2
+; AIX-NEXT: xxsel 1, 0, 34, 37
+; AIX-NEXT: vcmpequd 2, 3, 4
+; AIX-NEXT: xxsel 1, 1, 35, 34
+; AIX-NEXT: xvcmpeqdp 34, 0, 2
+; AIX-NEXT: xxsel 34, 0, 1, 34
+; AIX-NEXT: blr
+entry:
+ %m = call <2 x double> @llvm.minimum.v2f64(<2 x double> %a, <2 x double> %b)
+ ret <2 x double> %m
+}
+
+define <2 x double> @v2f64_maximum(<2 x double> %a, <2 x double> %b) {
+; NOVSX-LABEL: v2f64_maximum:
+; NOVSX: # %bb.0: # %entry
+; NOVSX-NEXT: fcmpu 0, 1, 3
+; NOVSX-NEXT: fmr 6, 1
+; NOVSX-NEXT: stfd 4, -16(1)
+; NOVSX-NEXT: stfd 2, -8(1)
+; NOVSX-NEXT: stfd 3, -32(1)
+; NOVSX-NEXT: stfd 1, -24(1)
+; NOVSX-NEXT: bc 12, 1, .LBB7_2
+; NOVSX-NEXT: # %bb.1: # %entry
+; NOVSX-NEXT: fmr 6, 3
+; NOVSX-NEXT: .LBB7_2: # %entry
+; NOVSX-NEXT: addis 4, 2, .LCPI7_0@toc@ha
+; NOVSX-NEXT: ld 3, -24(1)
+; NOVSX-NEXT: lfs 0, .LCPI7_0@toc@l(4)
+; NOVSX-NEXT: fmr 5, 0
+; NOVSX-NEXT: bc 12, 3, .LBB7_4
+; NOVSX-NEXT: # %bb.3: # %entry
+; NOVSX-NEXT: fmr 5, 6
+; NOVSX-NEXT: .LBB7_4: # %entry
+; NOVSX-NEXT: cmpdi 3, 0
+; NOVSX-NEXT: ld 4, -32(1)
+; NOVSX-NEXT: bc 12, 2, .LBB7_6
+; NOVSX-NEXT: # %bb.5: # %entry
+; NOVSX-NEXT: fmr 1, 5
+; NOVSX-NEXT: .LBB7_6: # %entry
+; NOVSX-NEXT: cmpdi 4, 0
+; NOVSX-NEXT: bc 12, 2, .LBB7_8
+; NOVSX-NEXT: # %bb.7: # %entry
+; NOVSX-NEXT: fmr 3, 1
+; NOVSX-NEXT: .LBB7_8: # %entry
+; NOVSX-NEXT: addis 3, 2, .LCPI7_1@toc@ha
+; NOVSX-NEXT: lfs 1, .LCPI7_1@toc@l(3)
+; NOVSX-NEXT: fcmpu 0, 5, 1
+; NOVSX-NEXT: bc 12, 2, .LBB7_10
+; NOVSX-NEXT: # %bb.9: # %entry
+; NOVSX-NEXT: fmr 3, 5
+; NOVSX-NEXT: .LBB7_10: # %entry
+; NOVSX-NEXT: fcmpu 0, 2, 4
+; NOVSX-NEXT: fmr 5, 2
+; NOVSX-NEXT: bc 12, 1, .LBB7_12
+; NOVSX-NEXT: # %bb.11: # %entry
+; NOVSX-NEXT: fmr 5, 4
+; NOVSX-NEXT: .LBB7_12: # %entry
+; NOVSX-NEXT: ld 4, -8(1)
+; NOVSX-NEXT: bc 12, 3, .LBB7_14
+; NOVSX-NEXT: # %bb.13: # %entry
+; NOVSX-NEXT: fmr 0, 5
+; NOVSX-NEXT: .LBB7_14: # %entry
+; NOVSX-NEXT: cmpdi 4, 0
+; NOVSX-NEXT: ld 3, -16(1)
+; NOVSX-NEXT: bc 4, 2, .LBB7_19
+; NOVSX-NEXT: # %bb.15: # %entry
+; NOVSX-NEXT: cmpdi 3, 0
+; NOVSX-NEXT: bc 4, 2, .LBB7_20
+; NOVSX-NEXT: .LBB7_16: # %entry
+; NOVSX-NEXT: fcmpu 0, 0, 1
+; NOVSX-NEXT: bc 12, 2, .LBB7_18
+; NOVSX-NEXT: .LBB7_17: # %entry
+; NOVSX-NEXT: fmr 4, 0
+; NOVSX-NEXT: .LBB7_18: # %entry
+; NOVSX-NEXT: fmr 1, 3
+; NOVSX-NEXT: fmr 2, 4
+; NOVSX-NEXT: blr
+; NOVSX-NEXT: .LBB7_19: # %entry
+; NOVSX-NEXT: fmr 2, 0
+; NOVSX-NEXT: cmpdi 3, 0
+; NOVSX-NEXT: bc 12, 2, .LBB7_16
+; NOVSX-NEXT: .LBB7_20: # %entry
+; NOVSX-NEXT: fmr 4, 2
+; NOVSX-NEXT: fcmpu 0, 0, 1
+; NOVSX-NEXT: bc 4, 2, .LBB7_17
+; NOVSX-NEXT: b .LBB7_18
+;
+; VSX-LABEL: v2f64_maximum:
+; VSX: # %bb.0: # %entry
+; VSX-NEXT: addis 3, 2, .LCPI7_0@toc@ha
+; VSX-NEXT: xvcmpeqdp 36, 35, 35
+; VSX-NEXT: xvcmpeqdp 37, 34, 34
+; VSX-NEXT: addi 3, 3, .LCPI7_0@toc@l
+; VSX-NEXT: xxlnor 36, 36, 36
+; VSX-NEXT: xxlnor 37, 37, 37
+; VSX-NEXT: xvmaxdp 0, 34, 35
+; VSX-NEXT: lxvd2x 2, 0, 3
+; VSX-NEXT: xxlor 1, 37, 36
+; VSX-NEXT: xxlxor 36, 36, 36
+; VSX-NEXT: vcmpequd 5, 2, 4
+; VSX-NEXT: xxsel 0, 0, 2, 1
+; VSX-NEXT: xxsel 1, 0, 34, 37
+; VSX-NEXT: vcmpequd 2, 3, 4
+; VSX-NEXT: xxsel 1, 1, 35, 34
+; VSX-NEXT: xvcmpeqdp 34, 0, 36
+; VSX-NEXT: xxsel 34, 0, 1, 34
+; VSX-NEXT: blr
+;
+; AIX-LABEL: v2f64_maximum:
+; AIX: # %bb.0: # %entry
+; AIX-NEXT: ld 3, L..C8(2) # %const.0
+; AIX-NEXT: xvcmpeqdp 36, 35, 35
+; AIX-NEXT: xvcmpeqdp 37, 34, 34
+; AIX-NEXT: lxvd2x 2, 0, 3
+; AIX-NEXT: xxlnor 36, 36, 36
+; AIX-NEXT: xxlnor 37, 37, 37
+; AIX-NEXT: xvmaxdp 0, 34, 35
+; AIX-NEXT: xxlor 1, 37, 36
+; AIX-NEXT: xxlxor 36, 36, 36
+; AIX-NEXT: vcmpequd 5, 2, 4
+; AIX-NEXT: xxsel 0, 0, 2, 1
+; AIX-NEXT: xxsel 1, 0, 34, 37
+; AIX-NEXT: vcmpequd 2, 3, 4
+; AIX-NEXT: xxsel 1, 1, 35, 34
+; AIX-NEXT: xvcmpeqdp 34, 0, 36
+; AIX-NEXT: xxsel 34, 0, 1, 34
+; AIX-NEXT: blr
+entry:
+ %m = call <2 x double> @llvm.maximum.v2f64(<2 x double> %a, <2 x double> %b)
+ ret <2 x double> %m
+}
+
+declare float @llvm.maximum.f32(float, float)
+declare double @llvm.maximum.f64(double, double)
+declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
+
+declare float @llvm.minimum.f32(float, float)
+declare double @llvm.minimum.f64(double, double)
+declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
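+
+; Without VSX there is no direct FPR-to-GPR move, so the NOVSX expansions spill
+; the operands (stfs/stfd) and reload them as integers to classify signed zeros
+; by bit pattern (0x80000000 for floats, 1 << 63 for doubles).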
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
index 83edd49bc963..1587f770f87c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
@@ -35,7 +35,7 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_load_v16i1_v32i1(ptr %p) {
; CHECK-NEXT: vmv.v.v v0, v9
; CHECK-NEXT: ret
%vec = load <32 x i1>, ptr %p
- %retval = call {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1> %vec)
+ %retval = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec)
ret {<16 x i1>, <16 x i1>} %retval
}
@@ -46,7 +46,7 @@ define {<16 x i8>, <16 x i8>} @vector_deinterleave_load_v16i8_v32i8(ptr %p) {
; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <32 x i8>, ptr %p
- %retval = call {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %vec)
+ %retval = call {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8> %vec)
ret {<16 x i8>, <16 x i8>} %retval
}
@@ -62,7 +62,7 @@ define {<8 x i16>, <8 x i16>} @vector_deinterleave_load_v8i16_v16i16_align1(ptr
; CHECK-NEXT: vnsrl.wi v9, v10, 16
; CHECK-NEXT: ret
%vec = load <16 x i16>, ptr %p, align 1
- %retval = call {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %vec)
+ %retval = call {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16> %vec)
ret {<8 x i16>, <8 x i16>} %retval
}
@@ -73,7 +73,7 @@ define {<8 x i16>, <8 x i16>} @vector_deinterleave_load_v8i16_v16i16(ptr %p) {
; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <16 x i16>, ptr %p
- %retval = call {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %vec)
+ %retval = call {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16> %vec)
ret {<8 x i16>, <8 x i16>} %retval
}
@@ -84,7 +84,7 @@ define {<4 x i32>, <4 x i32>} @vector_deinterleave_load_v4i32_vv8i32(ptr %p) {
; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <8 x i32>, ptr %p
- %retval = call {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %vec)
+ %retval = call {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32> %vec)
ret {<4 x i32>, <4 x i32>} %retval
}
@@ -95,15 +95,15 @@ define {<2 x i64>, <2 x i64>} @vector_deinterleave_load_v2i64_v4i64(ptr %p) {
; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <4 x i64>, ptr %p
- %retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec)
+ %retval = call {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64> %vec)
ret {<2 x i64>, <2 x i64>} %retval
}
-declare {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1>)
-declare {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>)
-declare {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>)
-declare {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>)
-declare {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>)
+declare {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1>)
+declare {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8>)
+declare {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16>)
+declare {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32>)
+declare {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64>)
; Floats
@@ -114,7 +114,7 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_load_v2f16_v4f16(ptr %p) {
; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <4 x half>, ptr %p
- %retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec)
+ %retval = call {<2 x half>, <2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half> %vec)
ret {<2 x half>, <2 x half>} %retval
}
@@ -125,7 +125,7 @@ define {<4 x half>, <4 x half>} @vector_deinterleave_load_v4f16_v8f16(ptr %p) {
; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <8 x half>, ptr %p
- %retval = call {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %vec)
+ %retval = call {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half> %vec)
ret {<4 x half>, <4 x half>} %retval
}
@@ -136,7 +136,7 @@ define {<2 x float>, <2 x float>} @vector_deinterleave_load_v2f32_v4f32(ptr %p)
; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <4 x float>, ptr %p
- %retval = call {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float> %vec)
+ %retval = call {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float> %vec)
ret {<2 x float>, <2 x float>} %retval
}
@@ -147,7 +147,7 @@ define {<8 x half>, <8 x half>} @vector_deinterleave_load_v8f16_v16f16(ptr %p) {
; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <16 x half>, ptr %p
- %retval = call {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %vec)
+ %retval = call {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half> %vec)
ret {<8 x half>, <8 x half>} %retval
}
@@ -158,7 +158,7 @@ define {<4 x float>, <4 x float>} @vector_deinterleave_load_v4f32_v8f32(ptr %p)
; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <8 x float>, ptr %p
- %retval = call {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %vec)
+ %retval = call {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float> %vec)
ret {<4 x float>, <4 x float>} %retval
}
@@ -169,13 +169,13 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_load_v2f64_v4f64(ptr %p
; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <4 x double>, ptr %p
- %retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec)
+ %retval = call {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double> %vec)
ret {<2 x double>, <2 x double>} %retval
}
-declare {<2 x half>,<2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>)
-declare {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half>)
-declare {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float>)
-declare {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half>)
-declare {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>)
-declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>)
+declare {<2 x half>,<2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half>)
+declare {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half>)
+declare {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float>)
+declare {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half>)
+declare {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float>)
+declare {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
index 9161cedd58e3..8de9cc25ae09 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
@@ -23,7 +23,7 @@ define void @vector_interleave_store_v32i1_v16i1(<16 x i1> %a, <16 x i1> %b, ptr
; CHECK-NEXT: vmsne.vi v8, v12, 0
; CHECK-NEXT: vsm.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1> %a, <16 x i1> %b)
+ %res = call <32 x i1> @llvm.vector.interleave2.v32i1(<16 x i1> %a, <16 x i1> %b)
store <32 x i1> %res, ptr %p
ret void
}
@@ -40,7 +40,7 @@ define void @vector_interleave_store_v16i16_v8i16_align1(<8 x i16> %a, <8 x i16>
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
; CHECK-NEXT: vse8.v v10, (a0)
; CHECK-NEXT: ret
- %res = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b)
+ %res = call <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b)
store <16 x i16> %res, ptr %p, align 1
ret void
}
@@ -51,7 +51,7 @@ define void @vector_interleave_store_v16i16_v8i16(<8 x i16> %a, <8 x i16> %b, pt
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vsseg2e16.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b)
+ %res = call <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b)
store <16 x i16> %res, ptr %p
ret void
}
@@ -62,7 +62,7 @@ define void @vector_interleave_store_v8i32_v4i32(<4 x i32> %a, <4 x i32> %b, ptr
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vsseg2e32.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b)
+ %res = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b)
store <8 x i32> %res, ptr %p
ret void
}
@@ -73,15 +73,15 @@ define void @vector_interleave_store_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b, ptr
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vsseg2e64.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %a, <2 x i64> %b)
+ %res = call <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64> %a, <2 x i64> %b)
store <4 x i64> %res, ptr %p
ret void
}
-declare <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1>, <16 x i1>)
-declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
-declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
-declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
+declare <32 x i1> @llvm.vector.interleave2.v32i1(<16 x i1>, <16 x i1>)
+declare <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
+declare <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
; Floats
@@ -91,7 +91,7 @@ define void @vector_interleave_store_v4f16_v2f16(<2 x half> %a, <2 x half> %b, p
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT: vsseg2e16.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %a, <2 x half> %b)
+ %res = call <4 x half> @llvm.vector.interleave2.v4f16(<2 x half> %a, <2 x half> %b)
store <4 x half> %res, ptr %p
ret void
}
@@ -102,7 +102,7 @@ define void @vector_interleave_store_v8f16_v4f16(<4 x half> %a, <4 x half> %b, p
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vsseg2e16.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %a, <4 x half> %b)
+ %res = call <8 x half> @llvm.vector.interleave2.v8f16(<4 x half> %a, <4 x half> %b)
store <8 x half> %res, ptr %p
ret void
}
@@ -113,7 +113,7 @@ define void @vector_interleave_store_v4f32_v2f32(<2 x float> %a, <2 x float> %b,
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vsseg2e32.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %a, <2 x float> %b)
+ %res = call <4 x float> @llvm.vector.interleave2.v4f32(<2 x float> %a, <2 x float> %b)
store <4 x float> %res, ptr %p
ret void
}
@@ -124,7 +124,7 @@ define void @vector_interleave_store_v16f16_v8f16(<8 x half> %a, <8 x half> %b,
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vsseg2e16.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %a, <8 x half> %b)
+ %res = call <16 x half> @llvm.vector.interleave2.v16f16(<8 x half> %a, <8 x half> %b)
store <16 x half> %res, ptr %p
ret void
}
@@ -135,7 +135,7 @@ define void @vector_interleave_store_v8f32_v4f32(<4 x float> %a, <4 x float> %b,
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vsseg2e32.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b)
+ %res = call <8 x float> @llvm.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b)
store <8 x float> %res, ptr %p
ret void
}
@@ -146,15 +146,15 @@ define void @vector_interleave_store_v4f64_v2f64(<2 x double> %a, <2 x double> %
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vsseg2e64.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %a, <2 x double> %b)
+ %res = call <4 x double> @llvm.vector.interleave2.v4f64(<2 x double> %a, <2 x double> %b)
store <4 x double> %res, ptr %p
ret void
}
-declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>)
-declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>)
-declare <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float>, <2 x float>)
-declare <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>)
-declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>)
-declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>)
+declare <4 x half> @llvm.vector.interleave2.v4f16(<2 x half>, <2 x half>)
+declare <8 x half> @llvm.vector.interleave2.v8f16(<4 x half>, <4 x half>)
+declare <4 x float> @llvm.vector.interleave2.v4f32(<2 x float>, <2 x float>)
+declare <16 x half> @llvm.vector.interleave2.v16f16(<8 x half>, <8 x half>)
+declare <8 x float> @llvm.vector.interleave2.v8f32(<4 x float>, <4 x float>)
+declare <4 x double> @llvm.vector.interleave2.v4f64(<2 x double>, <2 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse-bitrotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse-bitrotate.ll
index d4c0477408fd..a81f740f1739 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse-bitrotate.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse-bitrotate.ll
@@ -16,8 +16,8 @@ define <256 x i1> @reverse_v256i1(<256 x i1> %a) {
; CHECK-NEXT: vrgather.vv v12, v8, v10
; CHECK-NEXT: vmsne.vi v0, v12, 0
; CHECK-NEXT: ret
- %res = call <256 x i1> @llvm.experimental.vector.reverse.v256i1(<256 x i1> %a)
+ %res = call <256 x i1> @llvm.vector.reverse.v256i1(<256 x i1> %a)
ret <256 x i1> %res
}
-declare <256 x i1> @llvm.experimental.vector.reverse.v256i1(<256 x i1>)
+declare <256 x i1> @llvm.vector.reverse.v256i1(<256 x i1>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
index 8f9f1c2729fc..47d7baade8b4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
@@ -29,7 +29,7 @@ define <2 x i1> @reverse_v2i1(<2 x i1> %a) {
; ZVBB-NEXT: vbrev.v v8, v0
; ZVBB-NEXT: vsrl.vi v0, v8, 6
; ZVBB-NEXT: ret
- %res = call <2 x i1> @llvm.experimental.vector.reverse.v2i1(<2 x i1> %a)
+ %res = call <2 x i1> @llvm.vector.reverse.v2i1(<2 x i1> %a)
ret <2 x i1> %res
}
@@ -51,7 +51,7 @@ define <4 x i1> @reverse_v4i1(<4 x i1> %a) {
; ZVBB-NEXT: vbrev.v v8, v0
; ZVBB-NEXT: vsrl.vi v0, v8, 4
; ZVBB-NEXT: ret
- %res = call <4 x i1> @llvm.experimental.vector.reverse.v4i1(<4 x i1> %a)
+ %res = call <4 x i1> @llvm.vector.reverse.v4i1(<4 x i1> %a)
ret <4 x i1> %res
}
@@ -72,7 +72,7 @@ define <8 x i1> @reverse_v8i1(<8 x i1> %a) {
; ZVBB-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; ZVBB-NEXT: vbrev.v v0, v0
; ZVBB-NEXT: ret
- %res = call <8 x i1> @llvm.experimental.vector.reverse.v8i1(<8 x i1> %a)
+ %res = call <8 x i1> @llvm.vector.reverse.v8i1(<8 x i1> %a)
ret <8 x i1> %res
}
@@ -93,7 +93,7 @@ define <16 x i1> @reverse_v16i1(<16 x i1> %a) {
; ZVBB-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; ZVBB-NEXT: vbrev.v v0, v0
; ZVBB-NEXT: ret
- %res = call <16 x i1> @llvm.experimental.vector.reverse.v16i1(<16 x i1> %a)
+ %res = call <16 x i1> @llvm.vector.reverse.v16i1(<16 x i1> %a)
ret <16 x i1> %res
}
@@ -116,7 +116,7 @@ define <32 x i1> @reverse_v32i1(<32 x i1> %a) {
; ZVBB-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; ZVBB-NEXT: vbrev.v v0, v0
; ZVBB-NEXT: ret
- %res = call <32 x i1> @llvm.experimental.vector.reverse.v32i1(<32 x i1> %a)
+ %res = call <32 x i1> @llvm.vector.reverse.v32i1(<32 x i1> %a)
ret <32 x i1> %res
}
@@ -139,7 +139,7 @@ define <64 x i1> @reverse_v64i1(<64 x i1> %a) {
; ZVBB-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; ZVBB-NEXT: vbrev.v v0, v0
; ZVBB-NEXT: ret
- %res = call <64 x i1> @llvm.experimental.vector.reverse.v64i1(<64 x i1> %a)
+ %res = call <64 x i1> @llvm.vector.reverse.v64i1(<64 x i1> %a)
ret <64 x i1> %res
}
@@ -156,7 +156,7 @@ define <128 x i1> @reverse_v128i1(<128 x i1> %a) {
; CHECK-NEXT: vrgather.vv v24, v16, v8
; CHECK-NEXT: vmsne.vi v0, v24, 0
; CHECK-NEXT: ret
- %res = call <128 x i1> @llvm.experimental.vector.reverse.v128i1(<128 x i1> %a)
+ %res = call <128 x i1> @llvm.vector.reverse.v128i1(<128 x i1> %a)
ret <128 x i1> %res
}
@@ -164,7 +164,7 @@ define <1 x i8> @reverse_v1i8(<1 x i8> %a) {
; CHECK-LABEL: reverse_v1i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <1 x i8> @llvm.experimental.vector.reverse.v1i8(<1 x i8> %a)
+ %res = call <1 x i8> @llvm.vector.reverse.v1i8(<1 x i8> %a)
ret <1 x i8> %res
}
@@ -182,7 +182,7 @@ define <2 x i8> @reverse_v2i8(<2 x i8> %a) {
; ZVBB-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; ZVBB-NEXT: vrev8.v v8, v8
; ZVBB-NEXT: ret
- %res = call <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8> %a)
+ %res = call <2 x i8> @llvm.vector.reverse.v2i8(<2 x i8> %a)
ret <2 x i8> %res
}
@@ -195,7 +195,7 @@ define <4 x i8> @reverse_v4i8(<4 x i8> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %res = call <4 x i8> @llvm.experimental.vector.reverse.v4i8(<4 x i8> %a)
+ %res = call <4 x i8> @llvm.vector.reverse.v4i8(<4 x i8> %a)
ret <4 x i8> %res
}
@@ -208,7 +208,7 @@ define <8 x i8> @reverse_v8i8(<8 x i8> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %res = call <8 x i8> @llvm.experimental.vector.reverse.v8i8(<8 x i8> %a)
+ %res = call <8 x i8> @llvm.vector.reverse.v8i8(<8 x i8> %a)
ret <8 x i8> %res
}
@@ -221,7 +221,7 @@ define <16 x i8> @reverse_v16i8(<16 x i8> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8> %a)
+ %res = call <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8> %a)
ret <16 x i8> %res
}
@@ -236,7 +236,7 @@ define <32 x i8> @reverse_v32i8(<32 x i8> %a) {
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <32 x i8> @llvm.experimental.vector.reverse.v32i8(<32 x i8> %a)
+ %res = call <32 x i8> @llvm.vector.reverse.v32i8(<32 x i8> %a)
ret <32 x i8> %res
}
@@ -251,7 +251,7 @@ define <64 x i8> @reverse_v64i8(<64 x i8> %a) {
; CHECK-NEXT: vrgather.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <64 x i8> @llvm.experimental.vector.reverse.v64i8(<64 x i8> %a)
+ %res = call <64 x i8> @llvm.vector.reverse.v64i8(<64 x i8> %a)
ret <64 x i8> %res
}
@@ -259,7 +259,7 @@ define <1 x i16> @reverse_v1i16(<1 x i16> %a) {
; CHECK-LABEL: reverse_v1i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <1 x i16> @llvm.experimental.vector.reverse.v1i16(<1 x i16> %a)
+ %res = call <1 x i16> @llvm.vector.reverse.v1i16(<1 x i16> %a)
ret <1 x i16> %res
}
@@ -277,7 +277,7 @@ define <2 x i16> @reverse_v2i16(<2 x i16> %a) {
; ZVBB-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; ZVBB-NEXT: vror.vi v8, v8, 16
; ZVBB-NEXT: ret
- %res = call <2 x i16> @llvm.experimental.vector.reverse.v2i16(<2 x i16> %a)
+ %res = call <2 x i16> @llvm.vector.reverse.v2i16(<2 x i16> %a)
ret <2 x i16> %res
}
@@ -290,7 +290,7 @@ define <4 x i16> @reverse_v4i16(<4 x i16> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %res = call <4 x i16> @llvm.experimental.vector.reverse.v4i16(<4 x i16> %a)
+ %res = call <4 x i16> @llvm.vector.reverse.v4i16(<4 x i16> %a)
ret <4 x i16> %res
}
@@ -303,7 +303,7 @@ define <8 x i16> @reverse_v8i16(<8 x i16> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16> %a)
+ %res = call <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16> %a)
ret <8 x i16> %res
}
@@ -316,7 +316,7 @@ define <16 x i16> @reverse_v16i16(<16 x i16> %a) {
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <16 x i16> @llvm.experimental.vector.reverse.v16i16(<16 x i16> %a)
+ %res = call <16 x i16> @llvm.vector.reverse.v16i16(<16 x i16> %a)
ret <16 x i16> %res
}
@@ -332,7 +332,7 @@ define <32 x i16> @reverse_v32i16(<32 x i16> %a) {
; CHECK-NEXT: vrgather.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <32 x i16> @llvm.experimental.vector.reverse.v32i16(<32 x i16> %a)
+ %res = call <32 x i16> @llvm.vector.reverse.v32i16(<32 x i16> %a)
ret <32 x i16> %res
}
@@ -340,7 +340,7 @@ define <1 x i32> @reverse_v1i32(<1 x i32> %a) {
; CHECK-LABEL: reverse_v1i32:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <1 x i32> @llvm.experimental.vector.reverse.v1i32(<1 x i32> %a)
+ %res = call <1 x i32> @llvm.vector.reverse.v1i32(<1 x i32> %a)
ret <1 x i32> %res
}
@@ -358,7 +358,7 @@ define <2 x i32> @reverse_v2i32(<2 x i32> %a) {
; ZVBB-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; ZVBB-NEXT: vror.vi v8, v8, 32
; ZVBB-NEXT: ret
- %res = call <2 x i32> @llvm.experimental.vector.reverse.v2i32(<2 x i32> %a)
+ %res = call <2 x i32> @llvm.vector.reverse.v2i32(<2 x i32> %a)
ret <2 x i32> %res
}
@@ -371,7 +371,7 @@ define <4 x i32> @reverse_v4i32(<4 x i32> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32> %a)
+ %res = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> %a)
ret <4 x i32> %res
}
@@ -385,7 +385,7 @@ define <8 x i32> @reverse_v8i32(<8 x i32> %a) {
; CHECK-NEXT: vrgatherei16.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32> %a)
+ %res = call <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32> %a)
ret <8 x i32> %res
}
@@ -399,7 +399,7 @@ define <16 x i32> @reverse_v16i32(<16 x i32> %a) {
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <16 x i32> @llvm.experimental.vector.reverse.v16i32(<16 x i32> %a)
+ %res = call <16 x i32> @llvm.vector.reverse.v16i32(<16 x i32> %a)
ret <16 x i32> %res
}
@@ -407,7 +407,7 @@ define <1 x i64> @reverse_v1i64(<1 x i64> %a) {
; CHECK-LABEL: reverse_v1i64:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <1 x i64> @llvm.experimental.vector.reverse.v1i64(<1 x i64> %a)
+ %res = call <1 x i64> @llvm.vector.reverse.v1i64(<1 x i64> %a)
ret <1 x i64> %res
}
@@ -419,7 +419,7 @@ define <2 x i64> @reverse_v2i64(<2 x i64> %a) {
; CHECK-NEXT: vslideup.vi v9, v8, 1
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64> %a)
+ %res = call <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64> %a)
ret <2 x i64> %res
}
@@ -433,7 +433,7 @@ define <4 x i64> @reverse_v4i64(<4 x i64> %a) {
; CHECK-NEXT: vrgatherei16.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <4 x i64> @llvm.experimental.vector.reverse.v4i64(<4 x i64> %a)
+ %res = call <4 x i64> @llvm.vector.reverse.v4i64(<4 x i64> %a)
ret <4 x i64> %res
}
@@ -447,7 +447,7 @@ define <8 x i64> @reverse_v8i64(<8 x i64> %a) {
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <8 x i64> @llvm.experimental.vector.reverse.v8i64(<8 x i64> %a)
+ %res = call <8 x i64> @llvm.vector.reverse.v8i64(<8 x i64> %a)
ret <8 x i64> %res
}
@@ -456,7 +456,7 @@ define <1 x half> @reverse_v1f16(<1 x half> %a) {
; CHECK-LABEL: reverse_v1f16:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <1 x half> @llvm.experimental.vector.reverse.v1f16(<1 x half> %a)
+ %res = call <1 x half> @llvm.vector.reverse.v1f16(<1 x half> %a)
ret <1 x half> %res
}
@@ -474,7 +474,7 @@ define <2 x half> @reverse_v2f16(<2 x half> %a) {
; ZVBB-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; ZVBB-NEXT: vror.vi v8, v8, 16
; ZVBB-NEXT: ret
- %res = call <2 x half> @llvm.experimental.vector.reverse.v2f16(<2 x half> %a)
+ %res = call <2 x half> @llvm.vector.reverse.v2f16(<2 x half> %a)
ret <2 x half> %res
}
@@ -487,7 +487,7 @@ define <4 x half> @reverse_v4f16(<4 x half> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %res = call <4 x half> @llvm.experimental.vector.reverse.v4f16(<4 x half> %a)
+ %res = call <4 x half> @llvm.vector.reverse.v4f16(<4 x half> %a)
ret <4 x half> %res
}
@@ -500,7 +500,7 @@ define <8 x half> @reverse_v8f16(<8 x half> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half> %a)
+ %res = call <8 x half> @llvm.vector.reverse.v8f16(<8 x half> %a)
ret <8 x half> %res
}
@@ -513,7 +513,7 @@ define <16 x half> @reverse_v16f16(<16 x half> %a) {
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <16 x half> @llvm.experimental.vector.reverse.v16f16(<16 x half> %a)
+ %res = call <16 x half> @llvm.vector.reverse.v16f16(<16 x half> %a)
ret <16 x half> %res
}
@@ -529,7 +529,7 @@ define <32 x half> @reverse_v32f16(<32 x half> %a) {
; CHECK-NEXT: vrgather.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <32 x half> @llvm.experimental.vector.reverse.v32f16(<32 x half> %a)
+ %res = call <32 x half> @llvm.vector.reverse.v32f16(<32 x half> %a)
ret <32 x half> %res
}
@@ -537,7 +537,7 @@ define <1 x float> @reverse_v1f32(<1 x float> %a) {
; CHECK-LABEL: reverse_v1f32:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <1 x float> @llvm.experimental.vector.reverse.v1f32(<1 x float> %a)
+ %res = call <1 x float> @llvm.vector.reverse.v1f32(<1 x float> %a)
ret <1 x float> %res
}
@@ -555,7 +555,7 @@ define <2 x float> @reverse_v2f32(<2 x float> %a) {
; ZVBB-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; ZVBB-NEXT: vror.vi v8, v8, 32
; ZVBB-NEXT: ret
- %res = call <2 x float> @llvm.experimental.vector.reverse.v2f32(<2 x float> %a)
+ %res = call <2 x float> @llvm.vector.reverse.v2f32(<2 x float> %a)
ret <2 x float> %res
}
@@ -568,7 +568,7 @@ define <4 x float> @reverse_v4f32(<4 x float> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float> %a)
+ %res = call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> %a)
ret <4 x float> %res
}
@@ -582,7 +582,7 @@ define <8 x float> @reverse_v8f32(<8 x float> %a) {
; CHECK-NEXT: vrgatherei16.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <8 x float> @llvm.experimental.vector.reverse.v8f32(<8 x float> %a)
+ %res = call <8 x float> @llvm.vector.reverse.v8f32(<8 x float> %a)
ret <8 x float> %res
}
@@ -596,7 +596,7 @@ define <16 x float> @reverse_v16f32(<16 x float> %a) {
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <16 x float> @llvm.experimental.vector.reverse.v16f32(<16 x float> %a)
+ %res = call <16 x float> @llvm.vector.reverse.v16f32(<16 x float> %a)
ret <16 x float> %res
}
@@ -604,7 +604,7 @@ define <1 x double> @reverse_v1f64(<1 x double> %a) {
; CHECK-LABEL: reverse_v1f64:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <1 x double> @llvm.experimental.vector.reverse.v1f64(<1 x double> %a)
+ %res = call <1 x double> @llvm.vector.reverse.v1f64(<1 x double> %a)
ret <1 x double> %res
}
@@ -616,7 +616,7 @@ define <2 x double> @reverse_v2f64(<2 x double> %a) {
; CHECK-NEXT: vslideup.vi v9, v8, 1
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double> %a)
+ %res = call <2 x double> @llvm.vector.reverse.v2f64(<2 x double> %a)
ret <2 x double> %res
}
@@ -630,7 +630,7 @@ define <4 x double> @reverse_v4f64(<4 x double> %a) {
; CHECK-NEXT: vrgatherei16.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <4 x double> @llvm.experimental.vector.reverse.v4f64(<4 x double> %a)
+ %res = call <4 x double> @llvm.vector.reverse.v4f64(<4 x double> %a)
ret <4 x double> %res
}
@@ -644,7 +644,7 @@ define <8 x double> @reverse_v8f64(<8 x double> %a) {
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <8 x double> @llvm.experimental.vector.reverse.v8f64(<8 x double> %a)
+ %res = call <8 x double> @llvm.vector.reverse.v8f64(<8 x double> %a)
ret <8 x double> %res
}
@@ -729,7 +729,7 @@ define <3 x i64> @reverse_v3i64(<3 x i64> %a) {
; RV64-ZVBB-NEXT: vrgatherei16.vv v10, v8, v12
; RV64-ZVBB-NEXT: vmv.v.v v8, v10
; RV64-ZVBB-NEXT: ret
- %res = call <3 x i64> @llvm.experimental.vector.reverse.v3i64(<3 x i64> %a)
+ %res = call <3 x i64> @llvm.vector.reverse.v3i64(<3 x i64> %a)
ret <3 x i64> %res
}
@@ -813,7 +813,7 @@ define <6 x i64> @reverse_v6i64(<6 x i64> %a) {
; RV64-ZVBB-NEXT: vrgatherei16.vv v12, v8, v16
; RV64-ZVBB-NEXT: vmv.v.v v8, v12
; RV64-ZVBB-NEXT: ret
- %res = call <6 x i64> @llvm.experimental.vector.reverse.v6i64(<6 x i64> %a)
+ %res = call <6 x i64> @llvm.vector.reverse.v6i64(<6 x i64> %a)
ret <6 x i64> %res
}
@@ -901,54 +901,54 @@ define <12 x i64> @reverse_v12i64(<12 x i64> %a) {
; RV64-ZVBB-NEXT: vrgatherei16.vv v16, v8, v24
; RV64-ZVBB-NEXT: vmv.v.v v8, v16
; RV64-ZVBB-NEXT: ret
- %res = call <12 x i64> @llvm.experimental.vector.reverse.v12i64(<12 x i64> %a)
+ %res = call <12 x i64> @llvm.vector.reverse.v12i64(<12 x i64> %a)
ret <12 x i64> %res
}
-declare <2 x i1> @llvm.experimental.vector.reverse.v2i1(<2 x i1>)
-declare <4 x i1> @llvm.experimental.vector.reverse.v4i1(<4 x i1>)
-declare <8 x i1> @llvm.experimental.vector.reverse.v8i1(<8 x i1>)
-declare <16 x i1> @llvm.experimental.vector.reverse.v16i1(<16 x i1>)
-declare <32 x i1> @llvm.experimental.vector.reverse.v32i1(<32 x i1>)
-declare <64 x i1> @llvm.experimental.vector.reverse.v64i1(<64 x i1>)
-declare <128 x i1> @llvm.experimental.vector.reverse.v128i1(<128 x i1>)
-declare <1 x i8> @llvm.experimental.vector.reverse.v1i8(<1 x i8>)
-declare <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8>)
-declare <4 x i8> @llvm.experimental.vector.reverse.v4i8(<4 x i8>)
-declare <8 x i8> @llvm.experimental.vector.reverse.v8i8(<8 x i8>)
-declare <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8>)
-declare <32 x i8> @llvm.experimental.vector.reverse.v32i8(<32 x i8>)
-declare <64 x i8> @llvm.experimental.vector.reverse.v64i8(<64 x i8>)
-declare <1 x i16> @llvm.experimental.vector.reverse.v1i16(<1 x i16>)
-declare <2 x i16> @llvm.experimental.vector.reverse.v2i16(<2 x i16>)
-declare <4 x i16> @llvm.experimental.vector.reverse.v4i16(<4 x i16>)
-declare <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16>)
-declare <16 x i16> @llvm.experimental.vector.reverse.v16i16(<16 x i16>)
-declare <32 x i16> @llvm.experimental.vector.reverse.v32i16(<32 x i16>)
-declare <1 x i32> @llvm.experimental.vector.reverse.v1i32(<1 x i32>)
-declare <2 x i32> @llvm.experimental.vector.reverse.v2i32(<2 x i32>)
-declare <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32>)
-declare <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32>)
-declare <16 x i32> @llvm.experimental.vector.reverse.v16i32(<16 x i32>)
-declare <1 x i64> @llvm.experimental.vector.reverse.v1i64(<1 x i64>)
-declare <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64>)
-declare <4 x i64> @llvm.experimental.vector.reverse.v4i64(<4 x i64>)
-declare <8 x i64> @llvm.experimental.vector.reverse.v8i64(<8 x i64>)
-declare <1 x half> @llvm.experimental.vector.reverse.v1f16(<1 x half>)
-declare <2 x half> @llvm.experimental.vector.reverse.v2f16(<2 x half>)
-declare <4 x half> @llvm.experimental.vector.reverse.v4f16(<4 x half>)
-declare <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half>)
-declare <16 x half> @llvm.experimental.vector.reverse.v16f16(<16 x half>)
-declare <32 x half> @llvm.experimental.vector.reverse.v32f16(<32 x half>)
-declare <1 x float> @llvm.experimental.vector.reverse.v1f32(<1 x float>)
-declare <2 x float> @llvm.experimental.vector.reverse.v2f32(<2 x float>)
-declare <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float>)
-declare <8 x float> @llvm.experimental.vector.reverse.v8f32(<8 x float>)
-declare <16 x float> @llvm.experimental.vector.reverse.v16f32(<16 x float>)
-declare <1 x double> @llvm.experimental.vector.reverse.v1f64(<1 x double>)
-declare <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double>)
-declare <4 x double> @llvm.experimental.vector.reverse.v4f64(<4 x double>)
-declare <8 x double> @llvm.experimental.vector.reverse.v8f64(<8 x double>)
-declare <3 x i64> @llvm.experimental.vector.reverse.v3i64(<3 x i64>)
-declare <6 x i64> @llvm.experimental.vector.reverse.v6i64(<6 x i64>)
-declare <12 x i64> @llvm.experimental.vector.reverse.v12i64(<12 x i64>)
+declare <2 x i1> @llvm.vector.reverse.v2i1(<2 x i1>)
+declare <4 x i1> @llvm.vector.reverse.v4i1(<4 x i1>)
+declare <8 x i1> @llvm.vector.reverse.v8i1(<8 x i1>)
+declare <16 x i1> @llvm.vector.reverse.v16i1(<16 x i1>)
+declare <32 x i1> @llvm.vector.reverse.v32i1(<32 x i1>)
+declare <64 x i1> @llvm.vector.reverse.v64i1(<64 x i1>)
+declare <128 x i1> @llvm.vector.reverse.v128i1(<128 x i1>)
+declare <1 x i8> @llvm.vector.reverse.v1i8(<1 x i8>)
+declare <2 x i8> @llvm.vector.reverse.v2i8(<2 x i8>)
+declare <4 x i8> @llvm.vector.reverse.v4i8(<4 x i8>)
+declare <8 x i8> @llvm.vector.reverse.v8i8(<8 x i8>)
+declare <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8>)
+declare <32 x i8> @llvm.vector.reverse.v32i8(<32 x i8>)
+declare <64 x i8> @llvm.vector.reverse.v64i8(<64 x i8>)
+declare <1 x i16> @llvm.vector.reverse.v1i16(<1 x i16>)
+declare <2 x i16> @llvm.vector.reverse.v2i16(<2 x i16>)
+declare <4 x i16> @llvm.vector.reverse.v4i16(<4 x i16>)
+declare <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16>)
+declare <16 x i16> @llvm.vector.reverse.v16i16(<16 x i16>)
+declare <32 x i16> @llvm.vector.reverse.v32i16(<32 x i16>)
+declare <1 x i32> @llvm.vector.reverse.v1i32(<1 x i32>)
+declare <2 x i32> @llvm.vector.reverse.v2i32(<2 x i32>)
+declare <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32>)
+declare <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32>)
+declare <16 x i32> @llvm.vector.reverse.v16i32(<16 x i32>)
+declare <1 x i64> @llvm.vector.reverse.v1i64(<1 x i64>)
+declare <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64>)
+declare <4 x i64> @llvm.vector.reverse.v4i64(<4 x i64>)
+declare <8 x i64> @llvm.vector.reverse.v8i64(<8 x i64>)
+declare <1 x half> @llvm.vector.reverse.v1f16(<1 x half>)
+declare <2 x half> @llvm.vector.reverse.v2f16(<2 x half>)
+declare <4 x half> @llvm.vector.reverse.v4f16(<4 x half>)
+declare <8 x half> @llvm.vector.reverse.v8f16(<8 x half>)
+declare <16 x half> @llvm.vector.reverse.v16f16(<16 x half>)
+declare <32 x half> @llvm.vector.reverse.v32f16(<32 x half>)
+declare <1 x float> @llvm.vector.reverse.v1f32(<1 x float>)
+declare <2 x float> @llvm.vector.reverse.v2f32(<2 x float>)
+declare <4 x float> @llvm.vector.reverse.v4f32(<4 x float>)
+declare <8 x float> @llvm.vector.reverse.v8f32(<8 x float>)
+declare <16 x float> @llvm.vector.reverse.v16f32(<16 x float>)
+declare <1 x double> @llvm.vector.reverse.v1f64(<1 x double>)
+declare <2 x double> @llvm.vector.reverse.v2f64(<2 x double>)
+declare <4 x double> @llvm.vector.reverse.v4f64(<4 x double>)
+declare <8 x double> @llvm.vector.reverse.v8f64(<8 x double>)
+declare <3 x i64> @llvm.vector.reverse.v3i64(<3 x i64>)
+declare <6 x i64> @llvm.vector.reverse.v6i64(<6 x i64>)
+declare <12 x i64> @llvm.vector.reverse.v12i64(<12 x i64>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll
index 4e08f401ca4e..96094eea631b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll
@@ -104,7 +104,7 @@ define <vscale x 2 x i1> @reverse_nxv2i1(<vscale x 2 x i1> %a) {
; RV64-BITS-512-NEXT: vand.vi v8, v10, 1
; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %a)
+ %res = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %a)
ret <vscale x 2 x i1> %res
}
@@ -202,7 +202,7 @@ define <vscale x 4 x i1> @reverse_nxv4i1(<vscale x 4 x i1> %a) {
; RV64-BITS-512-NEXT: vand.vi v8, v10, 1
; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %res = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
ret <vscale x 4 x i1> %res
}
@@ -294,7 +294,7 @@ define <vscale x 8 x i1> @reverse_nxv8i1(<vscale x 8 x i1> %a) {
; RV64-BITS-512-NEXT: vand.vi v8, v10, 1
; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %a)
+ %res = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %a)
ret <vscale x 8 x i1> %res
}
@@ -392,7 +392,7 @@ define <vscale x 16 x i1> @reverse_nxv16i1(<vscale x 16 x i1> %a) {
; RV64-BITS-512-NEXT: vand.vi v8, v12, 1
; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %a)
+ %res = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %a)
ret <vscale x 16 x i1> %res
}
@@ -490,7 +490,7 @@ define <vscale x 32 x i1> @reverse_nxv32i1(<vscale x 32 x i1> %a) {
; RV64-BITS-512-NEXT: vand.vi v8, v16, 1
; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 32 x i1> @llvm.experimental.vector.reverse.nxv32i1(<vscale x 32 x i1> %a)
+ %res = call <vscale x 32 x i1> @llvm.vector.reverse.nxv32i1(<vscale x 32 x i1> %a)
ret <vscale x 32 x i1> %res
}
@@ -600,7 +600,7 @@ define <vscale x 64 x i1> @reverse_nxv64i1(<vscale x 64 x i1> %a) {
; RV64-BITS-512-NEXT: vand.vi v8, v24, 1
; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 64 x i1> @llvm.experimental.vector.reverse.nxv64i1(<vscale x 64 x i1> %a)
+ %res = call <vscale x 64 x i1> @llvm.vector.reverse.nxv64i1(<vscale x 64 x i1> %a)
ret <vscale x 64 x i1> %res
}
@@ -682,7 +682,7 @@ define <vscale x 1 x i8> @reverse_nxv1i8(<vscale x 1 x i8> %a) {
; RV64-BITS-512-NEXT: vrgather.vv v9, v8, v10
; RV64-BITS-512-NEXT: vmv1r.v v8, v9
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 1 x i8> @llvm.experimental.vector.reverse.nxv1i8(<vscale x 1 x i8> %a)
+ %res = call <vscale x 1 x i8> @llvm.vector.reverse.nxv1i8(<vscale x 1 x i8> %a)
ret <vscale x 1 x i8> %res
}
@@ -760,7 +760,7 @@ define <vscale x 2 x i8> @reverse_nxv2i8(<vscale x 2 x i8> %a) {
; RV64-BITS-512-NEXT: vrgather.vv v9, v8, v10
; RV64-BITS-512-NEXT: vmv1r.v v8, v9
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 2 x i8> @llvm.experimental.vector.reverse.nxv2i8(<vscale x 2 x i8> %a)
+ %res = call <vscale x 2 x i8> @llvm.vector.reverse.nxv2i8(<vscale x 2 x i8> %a)
ret <vscale x 2 x i8> %res
}
@@ -838,7 +838,7 @@ define <vscale x 4 x i8> @reverse_nxv4i8(<vscale x 4 x i8> %a) {
; RV64-BITS-512-NEXT: vrgather.vv v9, v8, v10
; RV64-BITS-512-NEXT: vmv1r.v v8, v9
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 4 x i8> @llvm.experimental.vector.reverse.nxv4i8(<vscale x 4 x i8> %a)
+ %res = call <vscale x 4 x i8> @llvm.vector.reverse.nxv4i8(<vscale x 4 x i8> %a)
ret <vscale x 4 x i8> %res
}
@@ -910,7 +910,7 @@ define <vscale x 8 x i8> @reverse_nxv8i8(<vscale x 8 x i8> %a) {
; RV64-BITS-512-NEXT: vrgather.vv v9, v8, v10
; RV64-BITS-512-NEXT: vmv.v.v v8, v9
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 8 x i8> @llvm.experimental.vector.reverse.nxv8i8(<vscale x 8 x i8> %a)
+ %res = call <vscale x 8 x i8> @llvm.vector.reverse.nxv8i8(<vscale x 8 x i8> %a)
ret <vscale x 8 x i8> %res
}
@@ -988,7 +988,7 @@ define <vscale x 16 x i8> @reverse_nxv16i8(<vscale x 16 x i8> %a) {
; RV64-BITS-512-NEXT: vrgather.vv v10, v8, v12
; RV64-BITS-512-NEXT: vmv.v.v v8, v10
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> %a)
+ %res = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> %a)
ret <vscale x 16 x i8> %res
}
@@ -1066,7 +1066,7 @@ define <vscale x 32 x i8> @reverse_nxv32i8(<vscale x 32 x i8> %a) {
; RV64-BITS-512-NEXT: vrgather.vv v12, v8, v16
; RV64-BITS-512-NEXT: vmv.v.v v8, v12
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> %a)
+ %res = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> %a)
ret <vscale x 32 x i8> %res
}
@@ -1148,7 +1148,7 @@ define <vscale x 64 x i8> @reverse_nxv64i8(<vscale x 64 x i8> %a) {
; RV64-BITS-512-NEXT: vrgather.vv v16, v12, v24
; RV64-BITS-512-NEXT: vmv8r.v v8, v16
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 64 x i8> @llvm.experimental.vector.reverse.nxv64i8(<vscale x 64 x i8> %a)
+ %res = call <vscale x 64 x i8> @llvm.vector.reverse.nxv64i8(<vscale x 64 x i8> %a)
ret <vscale x 64 x i8> %res
}
@@ -1164,7 +1164,7 @@ define <vscale x 1 x i16> @reverse_nxv1i16(<vscale x 1 x i16> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i16> @llvm.experimental.vector.reverse.nxv1i16(<vscale x 1 x i16> %a)
+ %res = call <vscale x 1 x i16> @llvm.vector.reverse.nxv1i16(<vscale x 1 x i16> %a)
ret <vscale x 1 x i16> %res
}
@@ -1180,7 +1180,7 @@ define <vscale x 2 x i16> @reverse_nxv2i16(<vscale x 2 x i16> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> %a)
+ %res = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> %a)
ret <vscale x 2 x i16> %res
}
@@ -1196,7 +1196,7 @@ define <vscale x 4 x i16> @reverse_nxv4i16(<vscale x 4 x i16> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> %a)
+ %res = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> %a)
ret <vscale x 4 x i16> %res
}
@@ -1211,7 +1211,7 @@ define <vscale x 8 x i16> @reverse_nxv8i16(<vscale x 8 x i16> %a) {
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> %a)
+ %res = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> %a)
ret <vscale x 8 x i16> %res
}
@@ -1227,7 +1227,7 @@ define <vscale x 16 x i16> @reverse_nxv16i16(<vscale x 16 x i16> %a) {
; CHECK-NEXT: vrgather.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> %a)
+ %res = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> %a)
ret <vscale x 16 x i16> %res
}
@@ -1243,7 +1243,7 @@ define <vscale x 32 x i16> @reverse_nxv32i16(<vscale x 32 x i16> %a) {
; CHECK-NEXT: vrgather.vv v16, v8, v24
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i16> @llvm.experimental.vector.reverse.nxv32i16(<vscale x 32 x i16> %a)
+ %res = call <vscale x 32 x i16> @llvm.vector.reverse.nxv32i16(<vscale x 32 x i16> %a)
ret <vscale x 32 x i16> %res
}
@@ -1259,7 +1259,7 @@ define <vscale x 1 x i32> @reverse_nxv1i32(<vscale x 1 x i32> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i32> @llvm.experimental.vector.reverse.nxv1i32(<vscale x 1 x i32> %a)
+ %res = call <vscale x 1 x i32> @llvm.vector.reverse.nxv1i32(<vscale x 1 x i32> %a)
ret <vscale x 1 x i32> %res
}
@@ -1275,7 +1275,7 @@ define <vscale x 2 x i32> @reverse_nxv2i32(<vscale x 2 x i32> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i32> @llvm.experimental.vector.reverse.nxv2i32(<vscale x 2 x i32> %a)
+ %res = call <vscale x 2 x i32> @llvm.vector.reverse.nxv2i32(<vscale x 2 x i32> %a)
ret <vscale x 2 x i32> %res
}
@@ -1291,7 +1291,7 @@ define <vscale x 4 x i32> @reverse_nxv4i32(<vscale x 4 x i32> %a) {
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %res = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
ret <vscale x 4 x i32> %res
}
@@ -1306,7 +1306,7 @@ define <vscale x 8 x i32> @reverse_nxv8i32(<vscale x 8 x i32> %a) {
; CHECK-NEXT: vrgather.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> %a)
+ %res = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> %a)
ret <vscale x 8 x i32> %res
}
@@ -1322,7 +1322,7 @@ define <vscale x 16 x i32> @reverse_nxv16i32(<vscale x 16 x i32> %a) {
; CHECK-NEXT: vrgather.vv v16, v8, v24
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i32> @llvm.experimental.vector.reverse.nxv16i32(<vscale x 16 x i32> %a)
+ %res = call <vscale x 16 x i32> @llvm.vector.reverse.nxv16i32(<vscale x 16 x i32> %a)
ret <vscale x 16 x i32> %res
}
@@ -1338,7 +1338,7 @@ define <vscale x 1 x i64> @reverse_nxv1i64(<vscale x 1 x i64> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i64> @llvm.experimental.vector.reverse.nxv1i64(<vscale x 1 x i64> %a)
+ %res = call <vscale x 1 x i64> @llvm.vector.reverse.nxv1i64(<vscale x 1 x i64> %a)
ret <vscale x 1 x i64> %res
}
@@ -1354,7 +1354,7 @@ define <vscale x 2 x i64> @reverse_nxv2i64(<vscale x 2 x i64> %a) {
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> %a)
+ %res = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> %a)
ret <vscale x 2 x i64> %res
}
@@ -1370,7 +1370,7 @@ define <vscale x 4 x i64> @reverse_nxv4i64(<vscale x 4 x i64> %a) {
; CHECK-NEXT: vrgather.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> %a)
+ %res = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> %a)
ret <vscale x 4 x i64> %res
}
@@ -1385,7 +1385,7 @@ define <vscale x 8 x i64> @reverse_nxv8i64(<vscale x 8 x i64> %a) {
; CHECK-NEXT: vrgather.vv v16, v8, v24
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> %a)
+ %res = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> %a)
ret <vscale x 8 x i64> %res
}
@@ -1405,7 +1405,7 @@ define <vscale x 1 x half> @reverse_nxv1f16(<vscale x 1 x half> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 1 x half> @llvm.experimental.vector.reverse.nxv1f16(<vscale x 1 x half> %a)
+ %res = call <vscale x 1 x half> @llvm.vector.reverse.nxv1f16(<vscale x 1 x half> %a)
ret <vscale x 1 x half> %res
}
@@ -1421,7 +1421,7 @@ define <vscale x 2 x half> @reverse_nxv2f16(<vscale x 2 x half> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half> %a)
+ %res = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> %a)
ret <vscale x 2 x half> %res
}
@@ -1437,7 +1437,7 @@ define <vscale x 4 x half> @reverse_nxv4f16(<vscale x 4 x half> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half> %a)
+ %res = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> %a)
ret <vscale x 4 x half> %res
}
@@ -1452,7 +1452,7 @@ define <vscale x 8 x half> @reverse_nxv8f16(<vscale x 8 x half> %a) {
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half> %a)
+ %res = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> %a)
ret <vscale x 8 x half> %res
}
@@ -1468,7 +1468,7 @@ define <vscale x 16 x half> @reverse_nxv16f16(<vscale x 16 x half> %a) {
; CHECK-NEXT: vrgather.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <vscale x 16 x half> @llvm.experimental.vector.reverse.nxv16f16(<vscale x 16 x half> %a)
+ %res = call <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half> %a)
ret <vscale x 16 x half> %res
}
@@ -1484,7 +1484,7 @@ define <vscale x 32 x half> @reverse_nxv32f16(<vscale x 32 x half> %a) {
; CHECK-NEXT: vrgather.vv v16, v8, v24
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
- %res = call <vscale x 32 x half> @llvm.experimental.vector.reverse.nxv32f16(<vscale x 32 x half> %a)
+ %res = call <vscale x 32 x half> @llvm.vector.reverse.nxv32f16(<vscale x 32 x half> %a)
ret <vscale x 32 x half> %res
}
@@ -1500,7 +1500,7 @@ define <vscale x 1 x float> @reverse_nxv1f32(<vscale x 1 x float> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 1 x float> @llvm.experimental.vector.reverse.nxv1f32(<vscale x 1 x float> %a)
+ %res = call <vscale x 1 x float> @llvm.vector.reverse.nxv1f32(<vscale x 1 x float> %a)
ret <vscale x 1 x float> %res
}
@@ -1516,7 +1516,7 @@ define <vscale x 2 x float> @reverse_nxv2f32(<vscale x 2 x float> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float> %a)
+ %res = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> %a)
ret <vscale x 2 x float> %res
}
@@ -1532,7 +1532,7 @@ define <vscale x 4 x float> @reverse_nxv4f32(<vscale x 4 x float> %a) {
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+ %res = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
ret <vscale x 4 x float> %res
}
@@ -1547,7 +1547,7 @@ define <vscale x 8 x float> @reverse_nxv8f32(<vscale x 8 x float> %a) {
; CHECK-NEXT: vrgather.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <vscale x 8 x float> @llvm.experimental.vector.reverse.nxv8f32(<vscale x 8 x float> %a)
+ %res = call <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float> %a)
ret <vscale x 8 x float> %res
}
@@ -1563,7 +1563,7 @@ define <vscale x 16 x float> @reverse_nxv16f32(<vscale x 16 x float> %a) {
; CHECK-NEXT: vrgather.vv v16, v8, v24
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
- %res = call <vscale x 16 x float> @llvm.experimental.vector.reverse.nxv16f32(<vscale x 16 x float> %a)
+ %res = call <vscale x 16 x float> @llvm.vector.reverse.nxv16f32(<vscale x 16 x float> %a)
ret <vscale x 16 x float> %res
}
@@ -1579,7 +1579,7 @@ define <vscale x 1 x double> @reverse_nxv1f64(<vscale x 1 x double> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 1 x double> @llvm.experimental.vector.reverse.nxv1f64(<vscale x 1 x double> %a)
+ %res = call <vscale x 1 x double> @llvm.vector.reverse.nxv1f64(<vscale x 1 x double> %a)
ret <vscale x 1 x double> %res
}
@@ -1595,7 +1595,7 @@ define <vscale x 2 x double> @reverse_nxv2f64(<vscale x 2 x double> %a) {
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %a)
+ %res = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> %a)
ret <vscale x 2 x double> %res
}
@@ -1611,7 +1611,7 @@ define <vscale x 4 x double> @reverse_nxv4f64(<vscale x 4 x double> %a) {
; CHECK-NEXT: vrgather.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> %a)
+ %res = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %a)
ret <vscale x 4 x double> %res
}
@@ -1626,7 +1626,7 @@ define <vscale x 8 x double> @reverse_nxv8f64(<vscale x 8 x double> %a) {
; CHECK-NEXT: vrgather.vv v16, v8, v24
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
- %res = call <vscale x 8 x double> @llvm.experimental.vector.reverse.nxv8f64(<vscale x 8 x double> %a)
+ %res = call <vscale x 8 x double> @llvm.vector.reverse.nxv8f64(<vscale x 8 x double> %a)
ret <vscale x 8 x double> %res
}
@@ -1646,7 +1646,7 @@ define <vscale x 3 x i64> @reverse_nxv3i64(<vscale x 3 x i64> %a) {
; CHECK-NEXT: vmv1r.v v9, v18
; CHECK-NEXT: vmv1r.v v10, v19
; CHECK-NEXT: ret
- %res = call <vscale x 3 x i64> @llvm.experimental.vector.reverse.nxv3i64(<vscale x 3 x i64> %a)
+ %res = call <vscale x 3 x i64> @llvm.vector.reverse.nxv3i64(<vscale x 3 x i64> %a)
ret <vscale x 3 x i64> %res
}
@@ -1663,7 +1663,7 @@ define <vscale x 6 x i64> @reverse_nxv6i64(<vscale x 6 x i64> %a) {
; CHECK-NEXT: vmv2r.v v10, v28
; CHECK-NEXT: vmv2r.v v12, v30
; CHECK-NEXT: ret
- %res = call <vscale x 6 x i64> @llvm.experimental.vector.reverse.nxv6i64(<vscale x 6 x i64> %a)
+ %res = call <vscale x 6 x i64> @llvm.vector.reverse.nxv6i64(<vscale x 6 x i64> %a)
ret <vscale x 6 x i64> %res
}
@@ -1739,53 +1739,53 @@ define <vscale x 12 x i64> @reverse_nxv12i64(<vscale x 12 x i64> %a) {
; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 80
; RV64-NEXT: ret
- %res = call <vscale x 12 x i64> @llvm.experimental.vector.reverse.nxv12i64(<vscale x 12 x i64> %a)
+ %res = call <vscale x 12 x i64> @llvm.vector.reverse.nxv12i64(<vscale x 12 x i64> %a)
ret <vscale x 12 x i64> %res
}
-declare <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1>)
-declare <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1>)
-declare <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1>)
-declare <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1>)
-declare <vscale x 32 x i1> @llvm.experimental.vector.reverse.nxv32i1(<vscale x 32 x i1>)
-declare <vscale x 64 x i1> @llvm.experimental.vector.reverse.nxv64i1(<vscale x 64 x i1>)
-declare <vscale x 1 x i8> @llvm.experimental.vector.reverse.nxv1i8(<vscale x 1 x i8>)
-declare <vscale x 2 x i8> @llvm.experimental.vector.reverse.nxv2i8(<vscale x 2 x i8>)
-declare <vscale x 4 x i8> @llvm.experimental.vector.reverse.nxv4i8(<vscale x 4 x i8>)
-declare <vscale x 8 x i8> @llvm.experimental.vector.reverse.nxv8i8(<vscale x 8 x i8>)
-declare <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8>)
-declare <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8>)
-declare <vscale x 64 x i8> @llvm.experimental.vector.reverse.nxv64i8(<vscale x 64 x i8>)
-declare <vscale x 1 x i16> @llvm.experimental.vector.reverse.nxv1i16(<vscale x 1 x i16>)
-declare <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16>)
-declare <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16>)
-declare <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16>)
-declare <vscale x 32 x i16> @llvm.experimental.vector.reverse.nxv32i16(<vscale x 32 x i16>)
-declare <vscale x 1 x i32> @llvm.experimental.vector.reverse.nxv1i32(<vscale x 1 x i32>)
-declare <vscale x 2 x i32> @llvm.experimental.vector.reverse.nxv2i32(<vscale x 2 x i32>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32>)
-declare <vscale x 16 x i32> @llvm.experimental.vector.reverse.nxv16i32(<vscale x 16 x i32>)
-declare <vscale x 1 x i64> @llvm.experimental.vector.reverse.nxv1i64(<vscale x 1 x i64>)
-declare <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64>)
-declare <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64>)
-declare <vscale x 1 x half> @llvm.experimental.vector.reverse.nxv1f16(<vscale x 1 x half>)
-declare <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half>)
-declare <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half>)
-declare <vscale x 16 x half> @llvm.experimental.vector.reverse.nxv16f16(<vscale x 16 x half>)
-declare <vscale x 32 x half> @llvm.experimental.vector.reverse.nxv32f16(<vscale x 32 x half>)
-declare <vscale x 1 x float> @llvm.experimental.vector.reverse.nxv1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.experimental.vector.reverse.nxv8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.experimental.vector.reverse.nxv16f32(<vscale x 16 x float>)
-declare <vscale x 1 x double> @llvm.experimental.vector.reverse.nxv1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.experimental.vector.reverse.nxv8f64(<vscale x 8 x double>)
-declare <vscale x 3 x i64> @llvm.experimental.vector.reverse.nxv3i64(<vscale x 3 x i64>)
-declare <vscale x 6 x i64> @llvm.experimental.vector.reverse.nxv6i64(<vscale x 6 x i64>)
-declare <vscale x 12 x i64> @llvm.experimental.vector.reverse.nxv12i64(<vscale x 12 x i64>)
+declare <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1>)
+declare <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1>)
+declare <vscale x 32 x i1> @llvm.vector.reverse.nxv32i1(<vscale x 32 x i1>)
+declare <vscale x 64 x i1> @llvm.vector.reverse.nxv64i1(<vscale x 64 x i1>)
+declare <vscale x 1 x i8> @llvm.vector.reverse.nxv1i8(<vscale x 1 x i8>)
+declare <vscale x 2 x i8> @llvm.vector.reverse.nxv2i8(<vscale x 2 x i8>)
+declare <vscale x 4 x i8> @llvm.vector.reverse.nxv4i8(<vscale x 4 x i8>)
+declare <vscale x 8 x i8> @llvm.vector.reverse.nxv8i8(<vscale x 8 x i8>)
+declare <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8>)
+declare <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8>)
+declare <vscale x 64 x i8> @llvm.vector.reverse.nxv64i8(<vscale x 64 x i8>)
+declare <vscale x 1 x i16> @llvm.vector.reverse.nxv1i16(<vscale x 1 x i16>)
+declare <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16>)
+declare <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16>)
+declare <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16>)
+declare <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16>)
+declare <vscale x 32 x i16> @llvm.vector.reverse.nxv32i16(<vscale x 32 x i16>)
+declare <vscale x 1 x i32> @llvm.vector.reverse.nxv1i32(<vscale x 1 x i32>)
+declare <vscale x 2 x i32> @llvm.vector.reverse.nxv2i32(<vscale x 2 x i32>)
+declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32>)
+declare <vscale x 16 x i32> @llvm.vector.reverse.nxv16i32(<vscale x 16 x i32>)
+declare <vscale x 1 x i64> @llvm.vector.reverse.nxv1i64(<vscale x 1 x i64>)
+declare <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64>)
+declare <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64>)
+declare <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64>)
+declare <vscale x 1 x half> @llvm.vector.reverse.nxv1f16(<vscale x 1 x half>)
+declare <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half>)
+declare <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half>)
+declare <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half>)
+declare <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half>)
+declare <vscale x 32 x half> @llvm.vector.reverse.nxv32f16(<vscale x 32 x half>)
+declare <vscale x 1 x float> @llvm.vector.reverse.nxv1f32(<vscale x 1 x float>)
+declare <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float>)
+declare <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float>)
+declare <vscale x 16 x float> @llvm.vector.reverse.nxv16f32(<vscale x 16 x float>)
+declare <vscale x 1 x double> @llvm.vector.reverse.nxv1f64(<vscale x 1 x double>)
+declare <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 8 x double> @llvm.vector.reverse.nxv8f64(<vscale x 8 x double>)
+declare <vscale x 3 x i64> @llvm.vector.reverse.nxv3i64(<vscale x 3 x i64>)
+declare <vscale x 6 x i64> @llvm.vector.reverse.nxv6i64(<vscale x 6 x i64>)
+declare <vscale x 12 x i64> @llvm.vector.reverse.nxv12i64(<vscale x 12 x i64>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index f3c70ed78c74..d02fe5b205f7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -31,7 +31,7 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_v16i1_v32i1(<32 x i1> %vec) {
; CHECK-NEXT: vmsne.vi v8, v13, 0
; CHECK-NEXT: vmv.v.v v0, v9
; CHECK-NEXT: ret
-%retval = call {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1> %vec)
+%retval = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec)
ret {<16 x i1>, <16 x i1>} %retval
}
@@ -44,7 +44,7 @@ define {<16 x i8>, <16 x i8>} @vector_deinterleave_v16i8_v32i8(<32 x i8> %vec) {
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: vmv.v.v v9, v11
; CHECK-NEXT: ret
-%retval = call {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %vec)
+%retval = call {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8> %vec)
ret {<16 x i8>, <16 x i8>} %retval
}
@@ -57,7 +57,7 @@ define {<8 x i16>, <8 x i16>} @vector_deinterleave_v8i16_v16i16(<16 x i16> %vec)
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: vmv.v.v v9, v11
; CHECK-NEXT: ret
-%retval = call {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %vec)
+%retval = call {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16> %vec)
ret {<8 x i16>, <8 x i16>} %retval
}
@@ -71,7 +71,7 @@ define {<4 x i32>, <4 x i32>} @vector_deinterleave_v4i32_vv8i32(<8 x i32> %vec)
; CHECK-NEXT: vmv.v.v v8, v11
; CHECK-NEXT: vmv.v.v v9, v10
; CHECK-NEXT: ret
-%retval = call {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %vec)
+%retval = call {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32> %vec)
ret {<4 x i32>, <4 x i32>} %retval
}
@@ -87,15 +87,15 @@ define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) {
; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
-%retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec)
+%retval = call {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64> %vec)
ret {<2 x i64>, <2 x i64>} %retval
}
-declare {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1>)
-declare {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>)
-declare {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>)
-declare {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>)
-declare {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>)
+declare {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1>)
+declare {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8>)
+declare {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16>)
+declare {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32>)
+declare {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64>)
; Floats
@@ -107,7 +107,7 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec
; CHECK-NEXT: vnsrl.wi v9, v8, 16
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
-%retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec)
+%retval = call {<2 x half>, <2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half> %vec)
ret {<2 x half>, <2 x half>} %retval
}
@@ -119,7 +119,7 @@ define {<4 x half>, <4 x half>} @vector_deinterleave_v4f16_v8f16(<8 x half> %vec
; CHECK-NEXT: vnsrl.wi v9, v8, 16
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
-%retval = call {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %vec)
+%retval = call {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half> %vec)
ret {<4 x half>, <4 x half>} %retval
}
@@ -131,7 +131,7 @@ define {<2 x float>, <2 x float>} @vector_deinterleave_v2f32_v4f32(<4 x float> %
; CHECK-NEXT: vnsrl.wx v9, v8, a0
; CHECK-NEXT: vnsrl.wi v8, v8, 0
; CHECK-NEXT: ret
-%retval = call {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float> %vec)
+%retval = call {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float> %vec)
ret {<2 x float>, <2 x float>} %retval
}
@@ -144,7 +144,7 @@ define {<8 x half>, <8 x half>} @vector_deinterleave_v8f16_v16f16(<16 x half> %v
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: vmv.v.v v9, v11
; CHECK-NEXT: ret
-%retval = call {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %vec)
+%retval = call {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half> %vec)
ret {<8 x half>, <8 x half>} %retval
}
@@ -158,7 +158,7 @@ define {<4 x float>, <4 x float>} @vector_deinterleave_v4f32_v8f32(<8 x float> %
; CHECK-NEXT: vmv.v.v v8, v11
; CHECK-NEXT: vmv.v.v v9, v10
; CHECK-NEXT: ret
-%retval = call {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %vec)
+%retval = call {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float> %vec)
ret {<4 x float>, <4 x float>} %retval
}
@@ -174,13 +174,13 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double
; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
-%retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec)
+%retval = call {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double> %vec)
ret {<2 x double>, <2 x double>} %retval
}
-declare {<2 x half>,<2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>)
-declare {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half>)
-declare {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float>)
-declare {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half>)
-declare {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>)
-declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>)
+declare {<2 x half>,<2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half>)
+declare {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half>)
+declare {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float>)
+declare {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half>)
+declare {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float>)
+declare {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
index 6a712080fda7..8f4ff37fffb0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -24,7 +24,7 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_load_nxv16i
; CHECK-NEXT: vmsne.vi v9, v10, 0
; CHECK-NEXT: ret
%vec = load <vscale x 32 x i1>, ptr %p
- %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
+ %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
ret {<vscale x 16 x i1>, <vscale x 16 x i1>} %retval
}
@@ -35,7 +35,7 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_load_nxv16i
; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 32 x i8>, ptr %p
- %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
+ %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %retval
}
@@ -49,7 +49,7 @@ define {<vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_load_nxv8i1
; CHECK-NEXT: vnsrl.wi v10, v12, 16
; CHECK-NEXT: ret
%vec = load <vscale x 16 x i16>, ptr %p, align 1
- %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
+ %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %retval
}
@@ -60,7 +60,7 @@ define {<vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_load_nxv8i1
; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 16 x i16>, ptr %p
- %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
+ %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %retval
}
@@ -71,7 +71,7 @@ define {<vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_load_nxv4i3
; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 8 x i32>, ptr %p
- %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
+ %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %retval
}
@@ -82,7 +82,7 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_load_nxv2i6
; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 4 x i64>, ptr %p
- %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
+ %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
}
@@ -93,7 +93,7 @@ define {<vscale x 4 x i64>, <vscale x 4 x i64>} @vector_deinterleave_load_nxv4i6
; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 8 x i64>, ptr %p
- %retval = call {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %vec)
+ %retval = call {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %vec)
ret {<vscale x 4 x i64>, <vscale x 4 x i64>} %retval
}
@@ -171,17 +171,17 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_load_nxv8i6
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%vec = load <vscale x 16 x i64>, ptr %p
- %retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.experimental.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec)
+ %retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec)
ret {<vscale x 8 x i64>, <vscale x 8 x i64>} %retval
}
-declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
-declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
-declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
-declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
-declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
-declare {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
-declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.experimental.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
+declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
+declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
+declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+declare {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
+declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
; Floats
@@ -192,7 +192,7 @@ define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_load_nxv2
; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 4 x half>, ptr %p
- %retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
+ %retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
ret {<vscale x 2 x half>, <vscale x 2 x half>} %retval
}
@@ -203,7 +203,7 @@ define {<vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_load_nxv4
; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 8 x half>, ptr %p
- %retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
+ %retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
ret {<vscale x 4 x half>, <vscale x 4 x half>} %retval
}
@@ -214,7 +214,7 @@ define {<vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_load_nx
; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 4 x float>, ptr %p
- %retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
+ %retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
ret {<vscale x 2 x float>, <vscale x 2 x float>} %retval
}
@@ -225,7 +225,7 @@ define {<vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_load_nxv8
; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 16 x half>, ptr %p
- %retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
+ %retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
ret {<vscale x 8 x half>, <vscale x 8 x half>} %retval
}
@@ -236,7 +236,7 @@ define {<vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_load_nx
; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 8 x float>, ptr %p
- %retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
+ %retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
ret {<vscale x 4 x float>, <vscale x 4 x float>} %retval
}
@@ -247,13 +247,13 @@ define {<vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_load_
; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 4 x double>, ptr %p
- %retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
+ %retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
ret {<vscale x 2 x double>, <vscale x 2 x double>} %retval
}
-declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
-declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
-declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
-declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
-declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
-declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
+declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
+declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
+declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
+declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
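
For readers unfamiliar with the intrinsic these tests exercise: @llvm.vector.deinterleave2 (the non-experimental name the tests now use) splits one wide vector into its even- and odd-indexed lanes, returned as a two-member aggregate. A minimal standalone sketch, reusing the nxv16i16 signature declared above — the function name @deinterleave_sketch and value names are illustrative, not from the tests:

declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)

define {<vscale x 8 x i16>, <vscale x 8 x i16>} @deinterleave_sketch(<vscale x 16 x i16> %vec) {
  ; first member holds lanes 0,2,4,...; second holds lanes 1,3,5,...
  %pair = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
  ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %pair
}

On RISC-V with scalable types, loads feeding this pattern lower to the segment-load instructions (vlseg2e16.v etc.) seen in the CHECK lines above.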
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index d98597fabcd9..7797577362c9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -21,7 +21,7 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv
; CHECK-NEXT: vnsrl.wi v10, v12, 8
; CHECK-NEXT: vmsne.vi v9, v10, 0
; CHECK-NEXT: ret
-%retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
+%retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
ret {<vscale x 16 x i1>, <vscale x 16 x i1>} %retval
}
@@ -34,7 +34,7 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: vmv.v.v v10, v14
; CHECK-NEXT: ret
-%retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
+%retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %retval
}
@@ -47,7 +47,7 @@ define {<vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: vmv.v.v v10, v14
; CHECK-NEXT: ret
-%retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
+%retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %retval
}
@@ -61,7 +61,7 @@ define {<vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxv
; CHECK-NEXT: vmv.v.v v8, v14
; CHECK-NEXT: vmv.v.v v10, v12
; CHECK-NEXT: ret
-%retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
+%retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %retval
}
@@ -77,15 +77,15 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv
; CHECK-NEXT: vmv2r.v v8, v12
; CHECK-NEXT: vmv2r.v v10, v20
; CHECK-NEXT: ret
-%retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
+%retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
}
-declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
-declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
-declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
-declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
-declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
+declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
+declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
define {<vscale x 64 x i1>, <vscale x 64 x i1>} @vector_deinterleave_nxv64i1_nxv128i1(<vscale x 128 x i1> %vec) {
; CHECK-LABEL: vector_deinterleave_nxv64i1_nxv128i1:
@@ -110,7 +110,7 @@ define {<vscale x 64 x i1>, <vscale x 64 x i1>} @vector_deinterleave_nxv64i1_nxv
; CHECK-NEXT: vmsne.vi v9, v24, 0
; CHECK-NEXT: vmv1r.v v8, v7
; CHECK-NEXT: ret
-%retval = call {<vscale x 64 x i1>, <vscale x 64 x i1>} @llvm.experimental.vector.deinterleave2.nxv128i1(<vscale x 128 x i1> %vec)
+%retval = call {<vscale x 64 x i1>, <vscale x 64 x i1>} @llvm.vector.deinterleave2.nxv128i1(<vscale x 128 x i1> %vec)
ret {<vscale x 64 x i1>, <vscale x 64 x i1>} %retval
}
@@ -125,7 +125,7 @@ define {<vscale x 64 x i8>, <vscale x 64 x i8>} @vector_deinterleave_nxv64i8_nxv
; CHECK-NEXT: vnsrl.wi v4, v16, 8
; CHECK-NEXT: vmv8r.v v16, v0
; CHECK-NEXT: ret
-%retval = call {<vscale x 64 x i8>, <vscale x 64 x i8>} @llvm.experimental.vector.deinterleave2.nxv128i8(<vscale x 128 x i8> %vec)
+%retval = call {<vscale x 64 x i8>, <vscale x 64 x i8>} @llvm.vector.deinterleave2.nxv128i8(<vscale x 128 x i8> %vec)
ret {<vscale x 64 x i8>, <vscale x 64 x i8>} %retval
}
@@ -140,7 +140,7 @@ define {<vscale x 32 x i16>, <vscale x 32 x i16>} @vector_deinterleave_nxv32i16_
; CHECK-NEXT: vnsrl.wi v4, v16, 16
; CHECK-NEXT: vmv8r.v v16, v0
; CHECK-NEXT: ret
-%retval = call {<vscale x 32 x i16>, <vscale x 32 x i16>} @llvm.experimental.vector.deinterleave2.nxv64i16(<vscale x 64 x i16> %vec)
+%retval = call {<vscale x 32 x i16>, <vscale x 32 x i16>} @llvm.vector.deinterleave2.nxv64i16(<vscale x 64 x i16> %vec)
ret {<vscale x 32 x i16>, <vscale x 32 x i16>} %retval
}
@@ -156,7 +156,7 @@ define {<vscale x 16 x i32>, <vscale x 16 x i32>} @vector_deinterleave_nxv16i32_
; CHECK-NEXT: vnsrl.wi v4, v24, 0
; CHECK-NEXT: vmv8r.v v8, v0
; CHECK-NEXT: ret
-%retval = call {<vscale x 16 x i32>, <vscale x 16 x i32>} @llvm.experimental.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %vec)
+%retval = call {<vscale x 16 x i32>, <vscale x 16 x i32>} @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %vec)
ret {<vscale x 16 x i32>, <vscale x 16 x i32>} %retval
}
@@ -229,15 +229,15 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_nxv8i64_nxv
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
-%retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.experimental.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec)
+%retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec)
ret {<vscale x 8 x i64>, <vscale x 8 x i64>} %retval
}
-declare {<vscale x 64 x i1>, <vscale x 64 x i1>} @llvm.experimental.vector.deinterleave2.nxv128i1(<vscale x 128 x i1>)
-declare {<vscale x 64 x i8>, <vscale x 64 x i8>} @llvm.experimental.vector.deinterleave2.nxv128i8(<vscale x 128 x i8>)
-declare {<vscale x 32 x i16>, <vscale x 32 x i16>} @llvm.experimental.vector.deinterleave2.nxv64i16(<vscale x 64 x i16>)
-declare {<vscale x 16 x i32>, <vscale x 16 x i32>} @llvm.experimental.vector.deinterleave2.nxv32i32(<vscale x 32 x i32>)
-declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.experimental.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
+declare {<vscale x 64 x i1>, <vscale x 64 x i1>} @llvm.vector.deinterleave2.nxv128i1(<vscale x 128 x i1>)
+declare {<vscale x 64 x i8>, <vscale x 64 x i8>} @llvm.vector.deinterleave2.nxv128i8(<vscale x 128 x i8>)
+declare {<vscale x 32 x i16>, <vscale x 32 x i16>} @llvm.vector.deinterleave2.nxv64i16(<vscale x 64 x i16>)
+declare {<vscale x 16 x i32>, <vscale x 16 x i32>} @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32>)
+declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
; Floats
@@ -249,7 +249,7 @@ define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_n
; CHECK-NEXT: vnsrl.wi v9, v8, 16
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
-%retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
+%retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
ret {<vscale x 2 x half>, <vscale x 2 x half>} %retval
}
@@ -262,7 +262,7 @@ define {<vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_n
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: vmv.v.v v9, v11
; CHECK-NEXT: ret
-%retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
+%retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
ret {<vscale x 4 x half>, <vscale x 4 x half>} %retval
}
@@ -276,7 +276,7 @@ define {<vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32
; CHECK-NEXT: vmv.v.v v8, v11
; CHECK-NEXT: vmv.v.v v9, v10
; CHECK-NEXT: ret
-%retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
+%retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
ret {<vscale x 2 x float>, <vscale x 2 x float>} %retval
}
@@ -289,7 +289,7 @@ define {<vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_n
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: vmv.v.v v10, v14
; CHECK-NEXT: ret
-%retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
+%retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
ret {<vscale x 8 x half>, <vscale x 8 x half>} %retval
}
@@ -303,7 +303,7 @@ define {<vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32
; CHECK-NEXT: vmv.v.v v8, v14
; CHECK-NEXT: vmv.v.v v10, v12
; CHECK-NEXT: ret
-%retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
+%retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
ret {<vscale x 4 x float>, <vscale x 4 x float>} %retval
}
@@ -319,16 +319,16 @@ define {<vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f
; CHECK-NEXT: vmv2r.v v8, v12
; CHECK-NEXT: vmv2r.v v10, v20
; CHECK-NEXT: ret
-%retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
+%retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
ret {<vscale x 2 x double>, <vscale x 2 x double>} %retval
}
-declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
-declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
-declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
-declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
-declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
-declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
+declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
+declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
+declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
+declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
define {<vscale x 32 x half>, <vscale x 32 x half>} @vector_deinterleave_nxv32f16_nxv64f16(<vscale x 64 x half> %vec) {
; CHECK-LABEL: vector_deinterleave_nxv32f16_nxv64f16:
@@ -341,7 +341,7 @@ define {<vscale x 32 x half>, <vscale x 32 x half>} @vector_deinterleave_nxv32f1
; CHECK-NEXT: vnsrl.wi v4, v16, 16
; CHECK-NEXT: vmv8r.v v16, v0
; CHECK-NEXT: ret
-%retval = call {<vscale x 32 x half>, <vscale x 32 x half>} @llvm.experimental.vector.deinterleave2.nxv64f16(<vscale x 64 x half> %vec)
+%retval = call {<vscale x 32 x half>, <vscale x 32 x half>} @llvm.vector.deinterleave2.nxv64f16(<vscale x 64 x half> %vec)
ret {<vscale x 32 x half>, <vscale x 32 x half>} %retval
}
@@ -357,7 +357,7 @@ define {<vscale x 16 x float>, <vscale x 16 x float>} @vector_deinterleave_nxv16
; CHECK-NEXT: vnsrl.wi v4, v24, 0
; CHECK-NEXT: vmv8r.v v8, v0
; CHECK-NEXT: ret
-%retval = call {<vscale x 16 x float>, <vscale x 16 x float>} @llvm.experimental.vector.deinterleave2.nxv32f32(<vscale x 32 x float> %vec)
+%retval = call {<vscale x 16 x float>, <vscale x 16 x float>} @llvm.vector.deinterleave2.nxv32f32(<vscale x 32 x float> %vec)
ret {<vscale x 16 x float>, <vscale x 16 x float>} %retval
}
@@ -430,10 +430,10 @@ define {<vscale x 8 x double>, <vscale x 8 x double>} @vector_deinterleave_nxv8f
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
-%retval = call {<vscale x 8 x double>, <vscale x 8 x double>} @llvm.experimental.vector.deinterleave2.nxv16f64(<vscale x 16 x double> %vec)
+%retval = call {<vscale x 8 x double>, <vscale x 8 x double>} @llvm.vector.deinterleave2.nxv16f64(<vscale x 16 x double> %vec)
ret {<vscale x 8 x double>, <vscale x 8 x double>} %retval
}
-declare {<vscale x 32 x half>, <vscale x 32 x half>} @llvm.experimental.vector.deinterleave2.nxv64f16(<vscale x 64 x half>)
-declare {<vscale x 16 x float>, <vscale x 16 x float>} @llvm.experimental.vector.deinterleave2.nxv32f32(<vscale x 32 x float>)
-declare {<vscale x 8 x double>, <vscale x 8 x double>} @llvm.experimental.vector.deinterleave2.nxv16f64(<vscale x 16 x double>)
+declare {<vscale x 32 x half>, <vscale x 32 x half>} @llvm.vector.deinterleave2.nxv64f16(<vscale x 64 x half>)
+declare {<vscale x 16 x float>, <vscale x 16 x float>} @llvm.vector.deinterleave2.nxv32f32(<vscale x 32 x float>)
+declare {<vscale x 8 x double>, <vscale x 8 x double>} @llvm.vector.deinterleave2.nxv16f64(<vscale x 16 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
index 6ebe8e095469..99872c199a1e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
@@ -41,7 +41,7 @@ define <32 x i1> @vector_interleave_v32i1_v16i1(<16 x i1> %a, <16 x i1> %b) {
; ZVBB-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; ZVBB-NEXT: vmsne.vi v0, v12, 0
; ZVBB-NEXT: ret
- %res = call <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1> %a, <16 x i1> %b)
+ %res = call <32 x i1> @llvm.vector.interleave2.v32i1(<16 x i1> %a, <16 x i1> %b)
ret <32 x i1> %res
}
@@ -62,7 +62,7 @@ define <16 x i16> @vector_interleave_v16i16_v8i16(<8 x i16> %a, <8 x i16> %b) {
; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv2r.v v8, v10
; ZVBB-NEXT: ret
- %res = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b)
+ %res = call <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b)
ret <16 x i16> %res
}
@@ -84,7 +84,7 @@ define <8 x i32> @vector_interleave_v8i32_v4i32(<4 x i32> %a, <4 x i32> %b) {
; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv2r.v v8, v10
; ZVBB-NEXT: ret
- %res = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b)
+ %res = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b)
ret <8 x i32> %res
}
@@ -118,14 +118,14 @@ define <4 x i64> @vector_interleave_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b) {
; ZVBB-NEXT: vrgatherei16.vv v10, v8, v12
; ZVBB-NEXT: vmv.v.v v8, v10
; ZVBB-NEXT: ret
- %res = call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %a, <2 x i64> %b)
+ %res = call <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64> %a, <2 x i64> %b)
ret <4 x i64> %res
}
-declare <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1>, <16 x i1>)
-declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
-declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
-declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
+declare <32 x i1> @llvm.vector.interleave2.v32i1(<16 x i1>, <16 x i1>)
+declare <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
+declare <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
; Floats
@@ -146,7 +146,7 @@ define <4 x half> @vector_interleave_v4f16_v2f16(<2 x half> %a, <2 x half> %b) {
; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv1r.v v8, v10
; ZVBB-NEXT: ret
- %res = call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %a, <2 x half> %b)
+ %res = call <4 x half> @llvm.vector.interleave2.v4f16(<2 x half> %a, <2 x half> %b)
ret <4 x half> %res
}
@@ -167,7 +167,7 @@ define <8 x half> @vector_interleave_v8f16_v4f16(<4 x half> %a, <4 x half> %b) {
; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv1r.v v8, v10
; ZVBB-NEXT: ret
- %res = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %a, <4 x half> %b)
+ %res = call <8 x half> @llvm.vector.interleave2.v8f16(<4 x half> %a, <4 x half> %b)
ret <8 x half> %res
}
@@ -189,7 +189,7 @@ define <4 x float> @vector_interleave_v4f32_v2f32(<2 x float> %a, <2 x float> %b
; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv1r.v v8, v10
; ZVBB-NEXT: ret
- %res = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %a, <2 x float> %b)
+ %res = call <4 x float> @llvm.vector.interleave2.v4f32(<2 x float> %a, <2 x float> %b)
ret <4 x float> %res
}
@@ -210,7 +210,7 @@ define <16 x half> @vector_interleave_v16f16_v8f16(<8 x half> %a, <8 x half> %b)
; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv2r.v v8, v10
; ZVBB-NEXT: ret
- %res = call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %a, <8 x half> %b)
+ %res = call <16 x half> @llvm.vector.interleave2.v16f16(<8 x half> %a, <8 x half> %b)
ret <16 x half> %res
}
@@ -232,7 +232,7 @@ define <8 x float> @vector_interleave_v8f32_v4f32(<4 x float> %a, <4 x float> %b
; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv2r.v v8, v10
; ZVBB-NEXT: ret
- %res = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b)
+ %res = call <8 x float> @llvm.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b)
ret <8 x float> %res
}
@@ -266,17 +266,17 @@ define <4 x double> @vector_interleave_v4f64_v2f64(<2 x double> %a, <2 x double>
; ZVBB-NEXT: vrgatherei16.vv v10, v8, v12
; ZVBB-NEXT: vmv.v.v v8, v10
; ZVBB-NEXT: ret
- %res = call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %a, <2 x double> %b)
+ %res = call <4 x double> @llvm.vector.interleave2.v4f64(<2 x double> %a, <2 x double> %b)
ret <4 x double> %res
}
-declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>)
-declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>)
-declare <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float>, <2 x float>)
-declare <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>)
-declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>)
-declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>)
+declare <4 x half> @llvm.vector.interleave2.v4f16(<2 x half>, <2 x half>)
+declare <8 x half> @llvm.vector.interleave2.v8f16(<4 x half>, <4 x half>)
+declare <4 x float> @llvm.vector.interleave2.v4f32(<2 x float>, <2 x float>)
+declare <16 x half> @llvm.vector.interleave2.v16f16(<8 x half>, <8 x half>)
+declare <8 x float> @llvm.vector.interleave2.v8f32(<4 x float>, <4 x float>)
+declare <4 x double> @llvm.vector.interleave2.v4f64(<2 x double>, <2 x double>)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; RV32: {{.*}}
; RV64: {{.*}}
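
@llvm.vector.interleave2 is the inverse operation: it zips two equally sized vectors into one vector of twice the element count, with the first operand supplying the even lanes. A minimal sketch using the fixed-length v16i16 signature declared in the file above — @interleave_sketch and the parameter names are illustrative:

declare <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)

define <16 x i16> @interleave_sketch(<8 x i16> %evens, <8 x i16> %odds) {
  ; result lane 2*i comes from %evens, lane 2*i+1 from %odds
  %zipped = call <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16> %evens, <8 x i16> %odds)
  ret <16 x i16> %zipped
}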
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
index 922692ed88c9..7ade47e60bc6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
@@ -27,7 +27,7 @@ define void @vector_interleave_store_nxv32i1_nxv16i1(<vscale x 16 x i1> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
; CHECK-NEXT: vsm.v v9, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
+ %res = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
store <vscale x 32 x i1> %res, ptr %p
ret void
}
@@ -42,7 +42,7 @@ define void @vector_interleave_store_nxv16i16_nxv8i16_align1(<vscale x 8 x i16>
; CHECK-NEXT: vwmaccu.vx v12, a1, v10
; CHECK-NEXT: vs4r.v v12, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
+ %res = call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
store <vscale x 16 x i16> %res, ptr %p, align 1
ret void
}
@@ -53,7 +53,7 @@ define void @vector_interleave_store_nxv16i16_nxv8i16(<vscale x 8 x i16> %a, <vs
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vsseg2e16.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
+ %res = call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
store <vscale x 16 x i16> %res, ptr %p
ret void
}
@@ -64,7 +64,7 @@ define void @vector_interleave_store_nxv8i32_nxv4i32(<vscale x 4 x i32> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; CHECK-NEXT: vsseg2e32.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+ %res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
store <vscale x 8 x i32> %res, ptr %p
ret void
}
@@ -75,7 +75,7 @@ define void @vector_interleave_store_nxv4i64_nxv2i64(<vscale x 2 x i64> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
; CHECK-NEXT: vsseg2e64.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
+ %res = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
store <vscale x 4 x i64> %res, ptr %p
ret void
}
@@ -86,7 +86,7 @@ define void @vector_interleave_store_nxv8i64_nxv4i64(<vscale x 4 x i64> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, ma
; CHECK-NEXT: vsseg2e64.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b)
+ %res = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b)
store <vscale x 8 x i64> %res, ptr %p
ret void
}
@@ -138,17 +138,17 @@ define void @vector_interleave_store_nxv16i64_nxv8i64(<vscale x 8 x i64> %a, <vs
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i64> @llvm.experimental.vector.interleave2.nxv16i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b)
+ %res = call <vscale x 16 x i64> @llvm.vector.interleave2.nxv16i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b)
store <vscale x 16 x i64> %res, ptr %p
ret void
}
-declare <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
-declare <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
-declare <vscale x 16 x i64> @llvm.experimental.vector.interleave2.nxv16i64(<vscale x 8 x i64>, <vscale x 8 x i64>)
+declare <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+declare <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
+declare <vscale x 16 x i64> @llvm.vector.interleave2.nxv16i64(<vscale x 8 x i64>, <vscale x 8 x i64>)
; Floats
@@ -158,7 +158,7 @@ define void @vector_interleave_store_nxv4f16_nxv2f16(<vscale x 2 x half> %a, <vs
; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
; CHECK-NEXT: vsseg2e16.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
+ %res = call <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
store <vscale x 4 x half> %res, ptr %p
ret void
}
@@ -169,7 +169,7 @@ define void @vector_interleave_store_nxv8f16_nxv4f16(<vscale x 4 x half> %a, <vs
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vsseg2e16.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
+ %res = call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
store <vscale x 8 x half> %res, ptr %p
ret void
}
@@ -180,7 +180,7 @@ define void @vector_interleave_store_nxv4f32_nxv2f32(<vscale x 2 x float> %a, <v
; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; CHECK-NEXT: vsseg2e32.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
+ %res = call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
store <vscale x 4 x float> %res, ptr %p
ret void
}
@@ -191,7 +191,7 @@ define void @vector_interleave_store_nxv16f16_nxv8f16(<vscale x 8 x half> %a, <v
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vsseg2e16.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ %res = call <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
store <vscale x 16 x half> %res, ptr %p
ret void
}
@@ -202,7 +202,7 @@ define void @vector_interleave_store_nxv8f32_nxv4f32(<vscale x 4 x float> %a, <v
; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; CHECK-NEXT: vsseg2e32.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
+ %res = call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
store <vscale x 8 x float> %res, ptr %p
ret void
}
@@ -213,15 +213,15 @@ define void @vector_interleave_store_nxv4f64_nxv2f64(<vscale x 2 x double> %a, <
; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
; CHECK-NEXT: vsseg2e64.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
+ %res = call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
store <vscale x 4 x double> %res, ptr %p
ret void
}
-declare <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
-declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
+declare <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
+declare <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
+declare <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
index 327e18e91381..a7e0ad6ee5f4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
@@ -47,7 +47,7 @@ define <vscale x 32 x i1> @vector_interleave_nxv32i1_nxv16i1(<vscale x 16 x i1>
; ZVBB-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
; ZVBB-NEXT: vslideup.vx v0, v8, a0
; ZVBB-NEXT: ret
- %res = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
+ %res = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
ret <vscale x 32 x i1> %res
}
@@ -68,7 +68,7 @@ define <vscale x 32 x i8> @vector_interleave_nxv32i8_nxv16i8(<vscale x 16 x i8>
; ZVBB-NEXT: vwaddu.wv v12, v12, v8
; ZVBB-NEXT: vmv4r.v v8, v12
; ZVBB-NEXT: ret
- %res = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+ %res = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
ret <vscale x 32 x i8> %res
}
@@ -89,7 +89,7 @@ define <vscale x 16 x i16> @vector_interleave_nxv16i16_nxv8i16(<vscale x 8 x i16
; ZVBB-NEXT: vwaddu.wv v12, v12, v8
; ZVBB-NEXT: vmv4r.v v8, v12
; ZVBB-NEXT: ret
- %res = call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
+ %res = call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
ret <vscale x 16 x i16> %res
}
@@ -111,7 +111,7 @@ define <vscale x 8 x i32> @vector_interleave_nxv8i32_nxv4i32(<vscale x 4 x i32>
; ZVBB-NEXT: vwaddu.wv v12, v12, v8
; ZVBB-NEXT: vmv4r.v v8, v12
; ZVBB-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+ %res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
ret <vscale x 8 x i32> %res
}
@@ -145,15 +145,15 @@ define <vscale x 4 x i64> @vector_interleave_nxv4i64_nxv2i64(<vscale x 2 x i64>
; ZVBB-NEXT: vrgatherei16.vv v12, v8, v16
; ZVBB-NEXT: vmv.v.v v8, v12
; ZVBB-NEXT: ret
- %res = call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
+ %res = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
ret <vscale x 4 x i64> %res
}
-declare <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
-declare <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+declare <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
define <vscale x 128 x i1> @vector_interleave_nxv128i1_nxv64i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b) {
; CHECK-LABEL: vector_interleave_nxv128i1_nxv64i1:
@@ -196,7 +196,7 @@ define <vscale x 128 x i1> @vector_interleave_nxv128i1_nxv64i1(<vscale x 64 x i1
; ZVBB-NEXT: vsetvli a0, zero, e8, m8, ta, ma
; ZVBB-NEXT: vmsne.vi v8, v24, 0
; ZVBB-NEXT: ret
- %res = call <vscale x 128 x i1> @llvm.experimental.vector.interleave2.nxv128i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b)
+ %res = call <vscale x 128 x i1> @llvm.vector.interleave2.nxv128i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b)
ret <vscale x 128 x i1> %res
}
@@ -223,7 +223,7 @@ define <vscale x 128 x i8> @vector_interleave_nxv128i8_nxv64i8(<vscale x 64 x i8
; ZVBB-NEXT: vwaddu.wv v0, v0, v28
; ZVBB-NEXT: vmv8r.v v16, v0
; ZVBB-NEXT: ret
- %res = call <vscale x 128 x i8> @llvm.experimental.vector.interleave2.nxv128i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b)
+ %res = call <vscale x 128 x i8> @llvm.vector.interleave2.nxv128i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b)
ret <vscale x 128 x i8> %res
}
@@ -250,7 +250,7 @@ define <vscale x 64 x i16> @vector_interleave_nxv64i16_nxv32i16(<vscale x 32 x i
; ZVBB-NEXT: vwaddu.wv v0, v0, v28
; ZVBB-NEXT: vmv8r.v v16, v0
; ZVBB-NEXT: ret
- %res = call <vscale x 64 x i16> @llvm.experimental.vector.interleave2.nxv64i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b)
+ %res = call <vscale x 64 x i16> @llvm.vector.interleave2.nxv64i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b)
ret <vscale x 64 x i16> %res
}
@@ -278,7 +278,7 @@ define <vscale x 32 x i32> @vector_interleave_nxv32i32_nxv16i32(<vscale x 16 x i
; ZVBB-NEXT: vmv8r.v v8, v24
; ZVBB-NEXT: vmv8r.v v16, v0
; ZVBB-NEXT: ret
- %res = call <vscale x 32 x i32> @llvm.experimental.vector.interleave2.nxv32i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b)
+ %res = call <vscale x 32 x i32> @llvm.vector.interleave2.nxv32i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b)
ret <vscale x 32 x i32> %res
}
@@ -376,15 +376,15 @@ define <vscale x 16 x i64> @vector_interleave_nxv16i64_nxv8i64(<vscale x 8 x i64
; ZVBB-NEXT: add sp, sp, a0
; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
- %res = call <vscale x 16 x i64> @llvm.experimental.vector.interleave2.nxv16i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b)
+ %res = call <vscale x 16 x i64> @llvm.vector.interleave2.nxv16i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b)
ret <vscale x 16 x i64> %res
}
-declare <vscale x 128 x i1> @llvm.experimental.vector.interleave2.nxv128i1(<vscale x 64 x i1>, <vscale x 64 x i1>)
-declare <vscale x 128 x i8> @llvm.experimental.vector.interleave2.nxv128i8(<vscale x 64 x i8>, <vscale x 64 x i8>)
-declare <vscale x 64 x i16> @llvm.experimental.vector.interleave2.nxv64i16(<vscale x 32 x i16>, <vscale x 32 x i16>)
-declare <vscale x 32 x i32> @llvm.experimental.vector.interleave2.nxv32i32(<vscale x 16 x i32>, <vscale x 16 x i32>)
-declare <vscale x 16 x i64> @llvm.experimental.vector.interleave2.nxv16i64(<vscale x 8 x i64>, <vscale x 8 x i64>)
+declare <vscale x 128 x i1> @llvm.vector.interleave2.nxv128i1(<vscale x 64 x i1>, <vscale x 64 x i1>)
+declare <vscale x 128 x i8> @llvm.vector.interleave2.nxv128i8(<vscale x 64 x i8>, <vscale x 64 x i8>)
+declare <vscale x 64 x i16> @llvm.vector.interleave2.nxv64i16(<vscale x 32 x i16>, <vscale x 32 x i16>)
+declare <vscale x 32 x i32> @llvm.vector.interleave2.nxv32i32(<vscale x 16 x i32>, <vscale x 16 x i32>)
+declare <vscale x 16 x i64> @llvm.vector.interleave2.nxv16i64(<vscale x 8 x i64>, <vscale x 8 x i64>)
; Floats
@@ -419,7 +419,7 @@ define <vscale x 4 x half> @vector_interleave_nxv4f16_nxv2f16(<vscale x 2 x half
; ZVBB-NEXT: vslideup.vx v10, v8, a0
; ZVBB-NEXT: vmv.v.v v8, v10
; ZVBB-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
+ %res = call <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
ret <vscale x 4 x half> %res
}
@@ -440,7 +440,7 @@ define <vscale x 8 x half> @vector_interleave_nxv8f16_nxv4f16(<vscale x 4 x half
; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv2r.v v8, v10
; ZVBB-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
+ %res = call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
ret <vscale x 8 x half> %res
}
@@ -462,7 +462,7 @@ define <vscale x 4 x float> @vector_interleave_nxv4f32_nxv2f32(<vscale x 2 x flo
; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv2r.v v8, v10
; ZVBB-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
+ %res = call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
ret <vscale x 4 x float> %res
}
@@ -483,7 +483,7 @@ define <vscale x 16 x half> @vector_interleave_nxv16f16_nxv8f16(<vscale x 8 x ha
; ZVBB-NEXT: vwaddu.wv v12, v12, v8
; ZVBB-NEXT: vmv4r.v v8, v12
; ZVBB-NEXT: ret
- %res = call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ %res = call <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
ret <vscale x 16 x half> %res
}
@@ -505,7 +505,7 @@ define <vscale x 8 x float> @vector_interleave_nxv8f32_nxv4f32(<vscale x 4 x flo
; ZVBB-NEXT: vwaddu.wv v12, v12, v8
; ZVBB-NEXT: vmv4r.v v8, v12
; ZVBB-NEXT: ret
- %res = call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
+ %res = call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
ret <vscale x 8 x float> %res
}
@@ -539,17 +539,17 @@ define <vscale x 4 x double> @vector_interleave_nxv4f64_nxv2f64(<vscale x 2 x do
; ZVBB-NEXT: vrgatherei16.vv v12, v8, v16
; ZVBB-NEXT: vmv.v.v v8, v12
; ZVBB-NEXT: ret
- %res = call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
+ %res = call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
ret <vscale x 4 x double> %res
}
-declare <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
-declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
+declare <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
+declare <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
+declare <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
define <vscale x 64 x half> @vector_interleave_nxv64f16_nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b) {
; CHECK-LABEL: vector_interleave_nxv64f16_nxv32f16:
@@ -574,7 +574,7 @@ define <vscale x 64 x half> @vector_interleave_nxv64f16_nxv32f16(<vscale x 32 x
; ZVBB-NEXT: vwaddu.wv v0, v0, v28
; ZVBB-NEXT: vmv8r.v v16, v0
; ZVBB-NEXT: ret
- %res = call <vscale x 64 x half> @llvm.experimental.vector.interleave2.nxv64f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b)
+ %res = call <vscale x 64 x half> @llvm.vector.interleave2.nxv64f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b)
ret <vscale x 64 x half> %res
}
@@ -602,7 +602,7 @@ define <vscale x 32 x float> @vector_interleave_nxv32f32_nxv16f32(<vscale x 16 x
; ZVBB-NEXT: vmv8r.v v8, v24
; ZVBB-NEXT: vmv8r.v v16, v0
; ZVBB-NEXT: ret
- %res = call <vscale x 32 x float> @llvm.experimental.vector.interleave2.nxv32f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b)
+ %res = call <vscale x 32 x float> @llvm.vector.interleave2.nxv32f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b)
ret <vscale x 32 x float> %res
}
@@ -700,7 +700,7 @@ define <vscale x 16 x double> @vector_interleave_nxv16f64_nxv8f64(<vscale x 8 x
; ZVBB-NEXT: add sp, sp, a0
; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
- %res = call <vscale x 16 x double> @llvm.experimental.vector.interleave2.nxv16f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b)
+ %res = call <vscale x 16 x double> @llvm.vector.interleave2.nxv16f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b)
ret <vscale x 16 x double> %res
}
@@ -718,7 +718,7 @@ define <vscale x 8 x i32> @vector_interleave_nxv8i32_nxv4i32_poison(<vscale x 4
; ZVBB-NEXT: vzext.vf2 v12, v8
; ZVBB-NEXT: vmv.v.v v8, v12
; ZVBB-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> poison)
+ %res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> poison)
ret <vscale x 8 x i32> %res
}
@@ -738,10 +738,10 @@ define <vscale x 8 x i32> @vector_interleave_nxv8i32_nxv4i32_poison2(<vscale x 4
; ZVBB-NEXT: vwsll.vx v12, v8, a0
; ZVBB-NEXT: vmv4r.v v8, v12
; ZVBB-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a)
+ %res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a)
ret <vscale x 8 x i32> %res
}
-declare <vscale x 64 x half> @llvm.experimental.vector.interleave2.nxv64f16(<vscale x 32 x half>, <vscale x 32 x half>)
-declare <vscale x 32 x float> @llvm.experimental.vector.interleave2.nxv32f32(<vscale x 16 x float>, <vscale x 16 x float>)
-declare <vscale x 16 x double> @llvm.experimental.vector.interleave2.nxv16f64(<vscale x 8 x double>, <vscale x 8 x double>)
+declare <vscale x 64 x half> @llvm.vector.interleave2.nxv64f16(<vscale x 32 x half>, <vscale x 32 x half>)
+declare <vscale x 32 x float> @llvm.vector.interleave2.nxv32f32(<vscale x 16 x float>, <vscale x 16 x float>)
+declare <vscale x 16 x double> @llvm.vector.interleave2.nxv16f64(<vscale x 8 x double>, <vscale x 8 x double>)
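
The vector-splice.ll updates that follow apply the same rename to @llvm.vector.splice, which concatenates its two operands and extracts one vector-length window starting at the immediate offset; a negative offset counts back from the end of the first operand. A minimal sketch, assuming the nxv1i8 signature declared in that file — @splice_sketch and the parameter names are illustrative:

declare <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, i32)

define <vscale x 1 x i8> @splice_sketch(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b) {
  ; offset -1: the last element of %a followed by the leading elements of %b
  %res = call <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 -1)
  ret <vscale x 1 x i8> %res
}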
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
index c98242437f62..be56db52e349 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
@@ -4,7 +4,7 @@
; Tests assume VLEN=128 or vscale_range_min=2.
-declare <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x i1>, <vscale x 1 x i1>, i32)
+declare <vscale x 1 x i1> @llvm.vector.splice.nxv1i1(<vscale x 1 x i1>, <vscale x 1 x i1>, i32)
define <vscale x 1 x i1> @splice_nxv1i1_offset_negone(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b) #0 {
; CHECK-LABEL: splice_nxv1i1_offset_negone:
@@ -24,7 +24,7 @@ define <vscale x 1 x i1> @splice_nxv1i1_offset_negone(<vscale x 1 x i1> %a, <vsc
; CHECK-NEXT: vand.vi v8, v9, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, i32 -1)
+ %res = call <vscale x 1 x i1> @llvm.vector.splice.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, i32 -1)
ret <vscale x 1 x i1> %res
}
@@ -48,11 +48,11 @@ define <vscale x 1 x i1> @splice_nxv1i1_offset_max(<vscale x 1 x i1> %a, <vscale
; CHECK-NEXT: vand.vi v8, v9, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, i32 1)
+ %res = call <vscale x 1 x i1> @llvm.vector.splice.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, i32 1)
ret <vscale x 1 x i1> %res
}
-declare <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
define <vscale x 2 x i1> @splice_nxv2i1_offset_negone(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) #0 {
; CHECK-LABEL: splice_nxv2i1_offset_negone:
@@ -72,7 +72,7 @@ define <vscale x 2 x i1> @splice_nxv2i1_offset_negone(<vscale x 2 x i1> %a, <vsc
; CHECK-NEXT: vand.vi v8, v9, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 -1)
+ %res = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 -1)
ret <vscale x 2 x i1> %res
}
@@ -96,11 +96,11 @@ define <vscale x 2 x i1> @splice_nxv2i1_offset_max(<vscale x 2 x i1> %a, <vscale
; CHECK-NEXT: vand.vi v8, v9, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 3)
+ %res = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 3)
ret <vscale x 2 x i1> %res
}
-declare <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
+declare <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
define <vscale x 4 x i1> @splice_nxv4i1_offset_negone(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b) #0 {
; CHECK-LABEL: splice_nxv4i1_offset_negone:
@@ -120,7 +120,7 @@ define <vscale x 4 x i1> @splice_nxv4i1_offset_negone(<vscale x 4 x i1> %a, <vsc
; CHECK-NEXT: vand.vi v8, v9, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 -1)
+ %res = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 -1)
ret <vscale x 4 x i1> %res
}
@@ -144,11 +144,11 @@ define <vscale x 4 x i1> @splice_nxv4i1_offset_max(<vscale x 4 x i1> %a, <vscale
; CHECK-NEXT: vand.vi v8, v9, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 7)
+ %res = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 7)
ret <vscale x 4 x i1> %res
}
-declare <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
+declare <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
define <vscale x 8 x i1> @splice_nxv8i1_offset_negone(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b) #0 {
; CHECK-LABEL: splice_nxv8i1_offset_negone:
@@ -167,7 +167,7 @@ define <vscale x 8 x i1> @splice_nxv8i1_offset_negone(<vscale x 8 x i1> %a, <vsc
; CHECK-NEXT: vand.vi v8, v9, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 -1)
+ %res = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 -1)
ret <vscale x 8 x i1> %res
}
@@ -190,11 +190,11 @@ define <vscale x 8 x i1> @splice_nxv8i1_offset_max(<vscale x 8 x i1> %a, <vscale
; CHECK-NEXT: vand.vi v8, v9, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 15)
+ %res = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 15)
ret <vscale x 8 x i1> %res
}
-declare <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
+declare <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
define <vscale x 16 x i1> @splice_nxv16i1_offset_negone(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b) #0 {
; CHECK-LABEL: splice_nxv16i1_offset_negone:
@@ -216,7 +216,7 @@ define <vscale x 16 x i1> @splice_nxv16i1_offset_negone(<vscale x 16 x i1> %a, <
; CHECK-NEXT: vand.vi v8, v8, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 -1)
+ %res = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 -1)
ret <vscale x 16 x i1> %res
}
@@ -240,11 +240,11 @@ define <vscale x 16 x i1> @splice_nxv16i1_offset_max(<vscale x 16 x i1> %a, <vsc
; CHECK-NEXT: vand.vi v8, v8, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 31)
+ %res = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 31)
ret <vscale x 16 x i1> %res
}
-declare <vscale x 32 x i1> @llvm.experimental.vector.splice.nxv32i1(<vscale x 32 x i1>, <vscale x 32 x i1>, i32)
+declare <vscale x 32 x i1> @llvm.vector.splice.nxv32i1(<vscale x 32 x i1>, <vscale x 32 x i1>, i32)
define <vscale x 32 x i1> @splice_nxv32i1_offset_negone(<vscale x 32 x i1> %a, <vscale x 32 x i1> %b) #0 {
; CHECK-LABEL: splice_nxv32i1_offset_negone:
@@ -266,7 +266,7 @@ define <vscale x 32 x i1> @splice_nxv32i1_offset_negone(<vscale x 32 x i1> %a, <
; CHECK-NEXT: vand.vi v8, v8, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i1> @llvm.experimental.vector.splice.nxv32i1(<vscale x 32 x i1> %a, <vscale x 32 x i1> %b, i32 -1)
+ %res = call <vscale x 32 x i1> @llvm.vector.splice.nxv32i1(<vscale x 32 x i1> %a, <vscale x 32 x i1> %b, i32 -1)
ret <vscale x 32 x i1> %res
}
@@ -289,11 +289,11 @@ define <vscale x 32 x i1> @splice_nxv32i1_offset_max(<vscale x 32 x i1> %a, <vsc
; CHECK-NEXT: vand.vi v8, v16, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i1> @llvm.experimental.vector.splice.nxv32i1(<vscale x 32 x i1> %a, <vscale x 32 x i1> %b, i32 63)
+ %res = call <vscale x 32 x i1> @llvm.vector.splice.nxv32i1(<vscale x 32 x i1> %a, <vscale x 32 x i1> %b, i32 63)
ret <vscale x 32 x i1> %res
}
-declare <vscale x 64 x i1> @llvm.experimental.vector.splice.nxv64i1(<vscale x 64 x i1>, <vscale x 64 x i1>, i32)
+declare <vscale x 64 x i1> @llvm.vector.splice.nxv64i1(<vscale x 64 x i1>, <vscale x 64 x i1>, i32)
define <vscale x 64 x i1> @splice_nxv64i1_offset_negone(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b) #0 {
; CHECK-LABEL: splice_nxv64i1_offset_negone:
@@ -315,7 +315,7 @@ define <vscale x 64 x i1> @splice_nxv64i1_offset_negone(<vscale x 64 x i1> %a, <
; CHECK-NEXT: vand.vi v8, v8, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 64 x i1> @llvm.experimental.vector.splice.nxv64i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b, i32 -1)
+ %res = call <vscale x 64 x i1> @llvm.vector.splice.nxv64i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b, i32 -1)
ret <vscale x 64 x i1> %res
}
@@ -338,17 +338,17 @@ define <vscale x 64 x i1> @splice_nxv64i1_offset_max(<vscale x 64 x i1> %a, <vsc
; CHECK-NEXT: vand.vi v8, v24, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 64 x i1> @llvm.experimental.vector.splice.nxv64i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b, i32 127)
+ %res = call <vscale x 64 x i1> @llvm.vector.splice.nxv64i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b, i32 127)
ret <vscale x 64 x i1> %res
}
-declare <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, i32)
+declare <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, i32)
define <vscale x 1 x i8> @splice_nxv1i8_offset_zero(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b) #0 {
; CHECK-LABEL: splice_nxv1i8_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 0)
+ %res = call <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 0)
ret <vscale x 1 x i8> %res
}
@@ -363,7 +363,7 @@ define <vscale x 1 x i8> @splice_nxv1i8_offset_negone(<vscale x 1 x i8> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 -1)
+ %res = call <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 -1)
ret <vscale x 1 x i8> %res
}
@@ -378,7 +378,7 @@ define <vscale x 1 x i8> @splice_nxv1i8_offset_min(<vscale x 1 x i8> %a, <vscale
; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 -2)
+ %res = call <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 -2)
ret <vscale x 1 x i8> %res
}
@@ -393,17 +393,17 @@ define <vscale x 1 x i8> @splice_nxv1i8_offset_max(<vscale x 1 x i8> %a, <vscale
; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 1)
+ %res = call <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 1)
ret <vscale x 1 x i8> %res
}
-declare <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
+declare <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
define <vscale x 2 x i8> @splice_nxv2i8_offset_zero(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b) #0 {
; CHECK-LABEL: splice_nxv2i8_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 0)
+ %res = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 0)
ret <vscale x 2 x i8> %res
}
@@ -418,7 +418,7 @@ define <vscale x 2 x i8> @splice_nxv2i8_offset_negone(<vscale x 2 x i8> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 -1)
+ %res = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 -1)
ret <vscale x 2 x i8> %res
}
@@ -433,7 +433,7 @@ define <vscale x 2 x i8> @splice_nxv2i8_offset_min(<vscale x 2 x i8> %a, <vscale
; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 4
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 -4)
+ %res = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 -4)
ret <vscale x 2 x i8> %res
}
@@ -448,17 +448,17 @@ define <vscale x 2 x i8> @splice_nxv2i8_offset_max(<vscale x 2 x i8> %a, <vscale
; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 3)
+ %res = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 3)
ret <vscale x 2 x i8> %res
}
-declare <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, i32)
+declare <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, i32)
define <vscale x 4 x i8> @splice_nxv4i8_offset_zero(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b) #0 {
; CHECK-LABEL: splice_nxv4i8_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 0)
+ %res = call <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 0)
ret <vscale x 4 x i8> %res
}
@@ -473,7 +473,7 @@ define <vscale x 4 x i8> @splice_nxv4i8_offset_negone(<vscale x 4 x i8> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 -1)
+ %res = call <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 -1)
ret <vscale x 4 x i8> %res
}
@@ -488,7 +488,7 @@ define <vscale x 4 x i8> @splice_nxv4i8_offset_min(<vscale x 4 x i8> %a, <vscale
; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 8
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 -8)
+ %res = call <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 -8)
ret <vscale x 4 x i8> %res
}
@@ -503,17 +503,17 @@ define <vscale x 4 x i8> @splice_nxv4i8_offset_max(<vscale x 4 x i8> %a, <vscale
; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 7)
+ %res = call <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 7)
ret <vscale x 4 x i8> %res
}
-declare <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, i32)
+declare <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, i32)
define <vscale x 8 x i8> @splice_nxv8i8_offset_zero(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) #0 {
; CHECK-LABEL: splice_nxv8i8_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 0)
+ %res = call <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 0)
ret <vscale x 8 x i8> %res
}
@@ -527,7 +527,7 @@ define <vscale x 8 x i8> @splice_nxv8i8_offset_negone(<vscale x 8 x i8> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 -1)
+ %res = call <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 -1)
ret <vscale x 8 x i8> %res
}
@@ -541,7 +541,7 @@ define <vscale x 8 x i8> @splice_nxv8i8_offset_min(<vscale x 8 x i8> %a, <vscale
; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 16
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 -16)
+ %res = call <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 -16)
ret <vscale x 8 x i8> %res
}
@@ -555,17 +555,17 @@ define <vscale x 8 x i8> @splice_nxv8i8_offset_max(<vscale x 8 x i8> %a, <vscale
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 15)
+ %res = call <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 15)
ret <vscale x 8 x i8> %res
}
-declare <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
+declare <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
define <vscale x 16 x i8> @splice_nxv16i8_offset_zero(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
; CHECK-LABEL: splice_nxv16i8_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 0)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 0)
ret <vscale x 16 x i8> %res
}
@@ -580,7 +580,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_offset_negone(<vscale x 16 x i8> %a, <
; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -1)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -1)
ret <vscale x 16 x i8> %res
}
@@ -596,7 +596,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_offset_min(<vscale x 16 x i8> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v10, a1
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -32)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -32)
ret <vscale x 16 x i8> %res
}
@@ -611,17 +611,17 @@ define <vscale x 16 x i8> @splice_nxv16i8_offset_max(<vscale x 16 x i8> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v10, a0
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 31)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 31)
ret <vscale x 16 x i8> %res
}
-declare <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
+declare <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
define <vscale x 32 x i8> @splice_nxv32i8_offset_zero(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) #0 {
; CHECK-LABEL: splice_nxv32i8_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 0)
+ %res = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 0)
ret <vscale x 32 x i8> %res
}
@@ -636,7 +636,7 @@ define <vscale x 32 x i8> @splice_nxv32i8_offset_negone(<vscale x 32 x i8> %a, <
; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v12, 1
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 -1)
+ %res = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 -1)
ret <vscale x 32 x i8> %res
}
@@ -652,7 +652,7 @@ define <vscale x 32 x i8> @splice_nxv32i8_offset_min(<vscale x 32 x i8> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a1
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 -64)
+ %res = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 -64)
ret <vscale x 32 x i8> %res
}
@@ -668,17 +668,17 @@ define <vscale x 32 x i8> @splice_nxv32i8_offset_max(<vscale x 32 x i8> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a0
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 63)
+ %res = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 63)
ret <vscale x 32 x i8> %res
}
-declare <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8>, <vscale x 64 x i8>, i32)
+declare <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8>, <vscale x 64 x i8>, i32)
define <vscale x 64 x i8> @splice_nxv64i8_offset_zero(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b) #0 {
; CHECK-LABEL: splice_nxv64i8_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 0)
+ %res = call <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 0)
ret <vscale x 64 x i8> %res
}
@@ -693,7 +693,7 @@ define <vscale x 64 x i8> @splice_nxv64i8_offset_negone(<vscale x 64 x i8> %a, <
; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
; CHECK-NEXT: vslideup.vi v8, v16, 1
; CHECK-NEXT: ret
- %res = call <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 -1)
+ %res = call <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 -1)
ret <vscale x 64 x i8> %res
}
@@ -709,7 +709,7 @@ define <vscale x 64 x i8> @splice_nxv64i8_offset_min(<vscale x 64 x i8> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a1
; CHECK-NEXT: ret
- %res = call <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 -128)
+ %res = call <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 -128)
ret <vscale x 64 x i8> %res
}
@@ -725,17 +725,17 @@ define <vscale x 64 x i8> @splice_nxv64i8_offset_max(<vscale x 64 x i8> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a0
; CHECK-NEXT: ret
- %res = call <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 127)
+ %res = call <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 127)
ret <vscale x 64 x i8> %res
}
-declare <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, i32)
+declare <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, i32)
define <vscale x 1 x i16> @splice_nxv1i16_offset_zero(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b) #0 {
; CHECK-LABEL: splice_nxv1i16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 0)
+ %res = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 0)
ret <vscale x 1 x i16> %res
}
@@ -750,7 +750,7 @@ define <vscale x 1 x i16> @splice_nxv1i16_offset_negone(<vscale x 1 x i16> %a, <
; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 -1)
+ %res = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 -1)
ret <vscale x 1 x i16> %res
}
@@ -765,7 +765,7 @@ define <vscale x 1 x i16> @splice_nxv1i16_offset_min(<vscale x 1 x i16> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 -2)
+ %res = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 -2)
ret <vscale x 1 x i16> %res
}
@@ -780,17 +780,17 @@ define <vscale x 1 x i16> @splice_nxv1i16_offset_max(<vscale x 1 x i16> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 1)
+ %res = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 1)
ret <vscale x 1 x i16> %res
}
-declare <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
+declare <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
define <vscale x 2 x i16> @splice_nxv2i16_offset_zero(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b) #0 {
; CHECK-LABEL: splice_nxv2i16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 0)
+ %res = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 0)
ret <vscale x 2 x i16> %res
}
@@ -805,7 +805,7 @@ define <vscale x 2 x i16> @splice_nxv2i16_offset_negone(<vscale x 2 x i16> %a, <
; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 -1)
+ %res = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 -1)
ret <vscale x 2 x i16> %res
}
@@ -820,7 +820,7 @@ define <vscale x 2 x i16> @splice_nxv2i16_offset_min(<vscale x 2 x i16> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 4
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 -4)
+ %res = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 -4)
ret <vscale x 2 x i16> %res
}
@@ -835,17 +835,17 @@ define <vscale x 2 x i16> @splice_nxv2i16_offset_max(<vscale x 2 x i16> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 3)
+ %res = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 3)
ret <vscale x 2 x i16> %res
}
-declare <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
+declare <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
define <vscale x 4 x i16> @splice_nxv4i16_offset_zero(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b) #0 {
; CHECK-LABEL: splice_nxv4i16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 0)
+ %res = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 0)
ret <vscale x 4 x i16> %res
}
@@ -860,7 +860,7 @@ define <vscale x 4 x i16> @splice_nxv4i16_offset_negone(<vscale x 4 x i16> %a, <
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 -1)
+ %res = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 -1)
ret <vscale x 4 x i16> %res
}
@@ -875,7 +875,7 @@ define <vscale x 4 x i16> @splice_nxv4i16_offset_min(<vscale x 4 x i16> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 8
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 -8)
+ %res = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 -8)
ret <vscale x 4 x i16> %res
}
@@ -890,17 +890,17 @@ define <vscale x 4 x i16> @splice_nxv4i16_offset_max(<vscale x 4 x i16> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 7)
+ %res = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 7)
ret <vscale x 4 x i16> %res
}
-declare <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
define <vscale x 8 x i16> @splice_nxv8i16_offset_zero(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
; CHECK-LABEL: splice_nxv8i16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 0)
+ %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 0)
ret <vscale x 8 x i16> %res
}
@@ -914,7 +914,7 @@ define <vscale x 8 x i16> @splice_nxv8i16_offset_negone(<vscale x 8 x i16> %a, <
; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -1)
+ %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -1)
ret <vscale x 8 x i16> %res
}
@@ -928,7 +928,7 @@ define <vscale x 8 x i16> @splice_nxv8i16_offset_min(<vscale x 8 x i16> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 16
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -16)
+ %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -16)
ret <vscale x 8 x i16> %res
}
@@ -942,17 +942,17 @@ define <vscale x 8 x i16> @splice_nxv8i16_offset_max(<vscale x 8 x i16> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v10, a0
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 15)
+ %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 15)
ret <vscale x 8 x i16> %res
}
-declare <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
+declare <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
define <vscale x 16 x i16> @splice_nxv16i16_offset_zero(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) #0 {
; CHECK-LABEL: splice_nxv16i16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 0)
+ %res = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 0)
ret <vscale x 16 x i16> %res
}
@@ -967,7 +967,7 @@ define <vscale x 16 x i16> @splice_nxv16i16_offset_negone(<vscale x 16 x i16> %a
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v12, 1
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 -1)
+ %res = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 -1)
ret <vscale x 16 x i16> %res
}
@@ -983,7 +983,7 @@ define <vscale x 16 x i16> @splice_nxv16i16_offset_min(<vscale x 16 x i16> %a, <
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a1
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 -32)
+ %res = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 -32)
ret <vscale x 16 x i16> %res
}
@@ -998,17 +998,17 @@ define <vscale x 16 x i16> @splice_nxv16i16_offset_max(<vscale x 16 x i16> %a, <
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a0
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 31)
+ %res = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 31)
ret <vscale x 16 x i16> %res
}
-declare <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>, i32)
+declare <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>, i32)
define <vscale x 32 x i16> @splice_nxv32i16_offset_zero(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b) #0 {
; CHECK-LABEL: splice_nxv32i16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 0)
+ %res = call <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 0)
ret <vscale x 32 x i16> %res
}
@@ -1023,7 +1023,7 @@ define <vscale x 32 x i16> @splice_nxv32i16_offset_negone(<vscale x 32 x i16> %a
; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
; CHECK-NEXT: vslideup.vi v8, v16, 1
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 -1)
+ %res = call <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 -1)
ret <vscale x 32 x i16> %res
}
@@ -1039,7 +1039,7 @@ define <vscale x 32 x i16> @splice_nxv32i16_offset_min(<vscale x 32 x i16> %a, <
; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a1
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 -64)
+ %res = call <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 -64)
ret <vscale x 32 x i16> %res
}
@@ -1055,17 +1055,17 @@ define <vscale x 32 x i16> @splice_nxv32i16_offset_max(<vscale x 32 x i16> %a, <
; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a0
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 63)
+ %res = call <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 63)
ret <vscale x 32 x i16> %res
}
-declare <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32>, <vscale x 1 x i32>, i32)
+declare <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32>, <vscale x 1 x i32>, i32)
define <vscale x 1 x i32> @splice_nxv1i32_offset_zero(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b) #0 {
; CHECK-LABEL: splice_nxv1i32_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 0)
+ %res = call <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 0)
ret <vscale x 1 x i32> %res
}
@@ -1080,7 +1080,7 @@ define <vscale x 1 x i32> @splice_nxv1i32_offset_negone(<vscale x 1 x i32> %a, <
; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 -1)
+ %res = call <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 -1)
ret <vscale x 1 x i32> %res
}
@@ -1095,7 +1095,7 @@ define <vscale x 1 x i32> @splice_nxv1i32_offset_min(<vscale x 1 x i32> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 -2)
+ %res = call <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 -2)
ret <vscale x 1 x i32> %res
}
@@ -1110,17 +1110,17 @@ define <vscale x 1 x i32> @splice_nxv1i32_offset_max(<vscale x 1 x i32> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 1)
+ %res = call <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 1)
ret <vscale x 1 x i32> %res
}
-declare <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, i32)
+declare <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, i32)
define <vscale x 2 x i32> @splice_nxv2i32_offset_zero(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b) #0 {
; CHECK-LABEL: splice_nxv2i32_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 0)
+ %res = call <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 0)
ret <vscale x 2 x i32> %res
}
@@ -1135,7 +1135,7 @@ define <vscale x 2 x i32> @splice_nxv2i32_offset_negone(<vscale x 2 x i32> %a, <
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 -1)
+ %res = call <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 -1)
ret <vscale x 2 x i32> %res
}
@@ -1150,7 +1150,7 @@ define <vscale x 2 x i32> @splice_nxv2i32_offset_min(<vscale x 2 x i32> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 4
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 -4)
+ %res = call <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 -4)
ret <vscale x 2 x i32> %res
}
@@ -1165,17 +1165,17 @@ define <vscale x 2 x i32> @splice_nxv2i32_offset_max(<vscale x 2 x i32> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 3)
+ %res = call <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 3)
ret <vscale x 2 x i32> %res
}
-declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
define <vscale x 4 x i32> @splice_nxv4i32_offset_zero(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
; CHECK-LABEL: splice_nxv4i32_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 0)
+ %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 0)
ret <vscale x 4 x i32> %res
}
@@ -1190,7 +1190,7 @@ define <vscale x 4 x i32> @splice_nxv4i32_offset_negone(<vscale x 4 x i32> %a, <
; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -1)
+ %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -1)
ret <vscale x 4 x i32> %res
}
@@ -1205,7 +1205,7 @@ define <vscale x 4 x i32> @splice_nxv4i32_offset_min(<vscale x 4 x i32> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 8
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -8)
+ %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -8)
ret <vscale x 4 x i32> %res
}
@@ -1220,17 +1220,17 @@ define <vscale x 4 x i32> @splice_nxv4i32_offset_max(<vscale x 4 x i32> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v10, a0
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 7)
+ %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 7)
ret <vscale x 4 x i32> %res
}
-declare <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
+declare <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
define <vscale x 8 x i32> @splice_nxv8i32_offset_zero(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b) #0 {
; CHECK-LABEL: splice_nxv8i32_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 0)
+ %res = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 0)
ret <vscale x 8 x i32> %res
}
@@ -1244,7 +1244,7 @@ define <vscale x 8 x i32> @splice_nxv8i32_offset_negone(<vscale x 8 x i32> %a, <
; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v12, 1
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 -1)
+ %res = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 -1)
ret <vscale x 8 x i32> %res
}
@@ -1258,7 +1258,7 @@ define <vscale x 8 x i32> @splice_nxv8i32_offset_min(<vscale x 8 x i32> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v12, 16
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 -16)
+ %res = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 -16)
ret <vscale x 8 x i32> %res
}
@@ -1272,17 +1272,17 @@ define <vscale x 8 x i32> @splice_nxv8i32_offset_max(<vscale x 8 x i32> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a0
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 15)
+ %res = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 15)
ret <vscale x 8 x i32> %res
}
-declare <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i32>, i32)
+declare <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i32>, i32)
define <vscale x 16 x i32> @splice_nxv16i32_offset_zero(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b) #0 {
; CHECK-LABEL: splice_nxv16i32_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 0)
+ %res = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 0)
ret <vscale x 16 x i32> %res
}
@@ -1297,7 +1297,7 @@ define <vscale x 16 x i32> @splice_nxv16i32_offset_negone(<vscale x 16 x i32> %a
; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; CHECK-NEXT: vslideup.vi v8, v16, 1
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 -1)
+ %res = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 -1)
ret <vscale x 16 x i32> %res
}
@@ -1313,7 +1313,7 @@ define <vscale x 16 x i32> @splice_nxv16i32_offset_min(<vscale x 16 x i32> %a, <
; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a1
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 -32)
+ %res = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 -32)
ret <vscale x 16 x i32> %res
}
@@ -1328,17 +1328,17 @@ define <vscale x 16 x i32> @splice_nxv16i32_offset_max(<vscale x 16 x i32> %a, <
; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a0
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 31)
+ %res = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 31)
ret <vscale x 16 x i32> %res
}
-declare <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, i32)
+declare <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, i32)
define <vscale x 1 x i64> @splice_nxv1i64_offset_zero(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b) #0 {
; CHECK-LABEL: splice_nxv1i64_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 0)
+ %res = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 0)
ret <vscale x 1 x i64> %res
}
@@ -1353,7 +1353,7 @@ define <vscale x 1 x i64> @splice_nxv1i64_offset_negone(<vscale x 1 x i64> %a, <
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 -1)
+ %res = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 -1)
ret <vscale x 1 x i64> %res
}
@@ -1368,7 +1368,7 @@ define <vscale x 1 x i64> @splice_nxv1i64_offset_min(<vscale x 1 x i64> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 -2)
+ %res = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 -2)
ret <vscale x 1 x i64> %res
}
@@ -1383,17 +1383,17 @@ define <vscale x 1 x i64> @splice_nxv1i64_offset_max(<vscale x 1 x i64> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 1)
+ %res = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 1)
ret <vscale x 1 x i64> %res
}
-declare <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
+declare <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
define <vscale x 2 x i64> @splice_nxv2i64_offset_zero(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
; CHECK-LABEL: splice_nxv2i64_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 0)
+ %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 0)
ret <vscale x 2 x i64> %res
}
@@ -1408,7 +1408,7 @@ define <vscale x 2 x i64> @splice_nxv2i64_offset_negone(<vscale x 2 x i64> %a, <
; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -1)
+ %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -1)
ret <vscale x 2 x i64> %res
}
@@ -1423,7 +1423,7 @@ define <vscale x 2 x i64> @splice_nxv2i64_offset_min(<vscale x 2 x i64> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 4
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -4)
+ %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -4)
ret <vscale x 2 x i64> %res
}
@@ -1438,17 +1438,17 @@ define <vscale x 2 x i64> @splice_nxv2i64_offset_max(<vscale x 2 x i64> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v10, a0
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 3)
+ %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 3)
ret <vscale x 2 x i64> %res
}
-declare <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
+declare <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
define <vscale x 4 x i64> @splice_nxv4i64_offset_zero(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b) #0 {
; CHECK-LABEL: splice_nxv4i64_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 0)
+ %res = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 0)
ret <vscale x 4 x i64> %res
}
@@ -1463,7 +1463,7 @@ define <vscale x 4 x i64> @splice_nxv4i64_offset_negone(<vscale x 4 x i64> %a, <
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v12, 1
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 -1)
+ %res = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 -1)
ret <vscale x 4 x i64> %res
}
@@ -1478,7 +1478,7 @@ define <vscale x 4 x i64> @splice_nxv4i64_offset_min(<vscale x 4 x i64> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v12, 8
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 -8)
+ %res = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 -8)
ret <vscale x 4 x i64> %res
}
@@ -1493,17 +1493,17 @@ define <vscale x 4 x i64> @splice_nxv4i64_offset_max(<vscale x 4 x i64> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a0
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 7)
+ %res = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 7)
ret <vscale x 4 x i64> %res
}
-declare <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, i32)
+declare <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, i32)
define <vscale x 8 x i64> @splice_nxv8i64_offset_zero(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b) #0 {
; CHECK-LABEL: splice_nxv8i64_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 0)
+ %res = call <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 0)
ret <vscale x 8 x i64> %res
}
@@ -1517,7 +1517,7 @@ define <vscale x 8 x i64> @splice_nxv8i64_offset_negone(<vscale x 8 x i64> %a, <
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vslideup.vi v8, v16, 1
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 -1)
+ %res = call <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 -1)
ret <vscale x 8 x i64> %res
}
@@ -1531,7 +1531,7 @@ define <vscale x 8 x i64> @splice_nxv8i64_offset_min(<vscale x 8 x i64> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vslideup.vi v8, v16, 16
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 -16)
+ %res = call <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 -16)
ret <vscale x 8 x i64> %res
}
@@ -1545,17 +1545,17 @@ define <vscale x 8 x i64> @splice_nxv8i64_offset_max(<vscale x 8 x i64> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a0
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 15)
+ %res = call <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 15)
ret <vscale x 8 x i64> %res
}
-declare <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, i32)
+declare <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, i32)
define <vscale x 1 x half> @splice_nxv1f16_offset_zero(<vscale x 1 x half> %a, <vscale x 1 x half> %b) #0 {
; CHECK-LABEL: splice_nxv1f16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 0)
+ %res = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 0)
ret <vscale x 1 x half> %res
}
@@ -1570,7 +1570,7 @@ define <vscale x 1 x half> @splice_nxv1f16_offset_negone(<vscale x 1 x half> %a,
; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 -1)
+ %res = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 -1)
ret <vscale x 1 x half> %res
}
@@ -1585,7 +1585,7 @@ define <vscale x 1 x half> @splice_nxv1f16_offset_min(<vscale x 1 x half> %a, <v
; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: ret
- %res = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 -2)
+ %res = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 -2)
ret <vscale x 1 x half> %res
}
@@ -1600,17 +1600,17 @@ define <vscale x 1 x half> @splice_nxv1f16_offset_max(<vscale x 1 x half> %a, <v
; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 1)
+ %res = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 1)
ret <vscale x 1 x half> %res
}
-declare <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
+declare <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
define <vscale x 2 x half> @splice_nxv2f16_offset_zero(<vscale x 2 x half> %a, <vscale x 2 x half> %b) #0 {
; CHECK-LABEL: splice_nxv2f16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 0)
+ %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 0)
ret <vscale x 2 x half> %res
}
@@ -1625,7 +1625,7 @@ define <vscale x 2 x half> @splice_nxv2f16_offset_negone(<vscale x 2 x half> %a,
; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -1)
+ %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -1)
ret <vscale x 2 x half> %res
}
@@ -1640,7 +1640,7 @@ define <vscale x 2 x half> @splice_nxv2f16_offset_min(<vscale x 2 x half> %a, <v
; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 4
; CHECK-NEXT: ret
- %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -4)
+ %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -4)
ret <vscale x 2 x half> %res
}
@@ -1655,17 +1655,17 @@ define <vscale x 2 x half> @splice_nxv2f16_offset_max(<vscale x 2 x half> %a, <v
; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 3)
+ %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 3)
ret <vscale x 2 x half> %res
}
-declare <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
+declare <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
define <vscale x 4 x half> @splice_nxv4f16_offset_zero(<vscale x 4 x half> %a, <vscale x 4 x half> %b) #0 {
; CHECK-LABEL: splice_nxv4f16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 0)
+ %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 0)
ret <vscale x 4 x half> %res
}
@@ -1680,7 +1680,7 @@ define <vscale x 4 x half> @splice_nxv4f16_offset_negone(<vscale x 4 x half> %a,
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -1)
+ %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -1)
ret <vscale x 4 x half> %res
}
@@ -1695,7 +1695,7 @@ define <vscale x 4 x half> @splice_nxv4f16_offset_min(<vscale x 4 x half> %a, <v
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 8
; CHECK-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -8)
+ %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -8)
ret <vscale x 4 x half> %res
}
@@ -1710,17 +1710,17 @@ define <vscale x 4 x half> @splice_nxv4f16_offset_max(<vscale x 4 x half> %a, <v
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 7)
+ %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 7)
ret <vscale x 4 x half> %res
}
-declare <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
define <vscale x 8 x half> @splice_nxv8f16_offset_zero(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
; CHECK-LABEL: splice_nxv8f16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 0)
+ %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 0)
ret <vscale x 8 x half> %res
}
@@ -1734,7 +1734,7 @@ define <vscale x 8 x half> @splice_nxv8f16_offset_negone(<vscale x 8 x half> %a,
; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -1)
+ %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -1)
ret <vscale x 8 x half> %res
}
@@ -1748,7 +1748,7 @@ define <vscale x 8 x half> @splice_nxv8f16_offset_min(<vscale x 8 x half> %a, <v
; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 16
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -16)
+ %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -16)
ret <vscale x 8 x half> %res
}
@@ -1762,17 +1762,17 @@ define <vscale x 8 x half> @splice_nxv8f16_offset_max(<vscale x 8 x half> %a, <v
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v10, a0
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 15)
+ %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 15)
ret <vscale x 8 x half> %res
}
-declare <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>, i32)
+declare <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>, i32)
define <vscale x 16 x half> @splice_nxv16f16_offset_zero(<vscale x 16 x half> %a, <vscale x 16 x half> %b) #0 {
; CHECK-LABEL: splice_nxv16f16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 0)
+ %res = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 0)
ret <vscale x 16 x half> %res
}
@@ -1787,7 +1787,7 @@ define <vscale x 16 x half> @splice_nxv16f16_offset_negone(<vscale x 16 x half>
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v12, 1
; CHECK-NEXT: ret
- %res = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 -1)
+ %res = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 -1)
ret <vscale x 16 x half> %res
}
@@ -1803,7 +1803,7 @@ define <vscale x 16 x half> @splice_nxv16f16_offset_min(<vscale x 16 x half> %a,
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a1
; CHECK-NEXT: ret
- %res = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 -32)
+ %res = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 -32)
ret <vscale x 16 x half> %res
}
@@ -1818,17 +1818,17 @@ define <vscale x 16 x half> @splice_nxv16f16_offset_max(<vscale x 16 x half> %a,
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a0
; CHECK-NEXT: ret
- %res = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 31)
+ %res = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 31)
ret <vscale x 16 x half> %res
}
-declare <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half>, <vscale x 32 x half>, i32)
+declare <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half>, <vscale x 32 x half>, i32)
define <vscale x 32 x half> @splice_nxv32f16_offset_zero(<vscale x 32 x half> %a, <vscale x 32 x half> %b) #0 {
; CHECK-LABEL: splice_nxv32f16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 0)
+ %res = call <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 0)
ret <vscale x 32 x half> %res
}
@@ -1843,7 +1843,7 @@ define <vscale x 32 x half> @splice_nxv32f16_offset_negone(<vscale x 32 x half>
; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
; CHECK-NEXT: vslideup.vi v8, v16, 1
; CHECK-NEXT: ret
- %res = call <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 -1)
+ %res = call <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 -1)
ret <vscale x 32 x half> %res
}
@@ -1859,7 +1859,7 @@ define <vscale x 32 x half> @splice_nxv32f16_offset_min(<vscale x 32 x half> %a,
; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a1
; CHECK-NEXT: ret
- %res = call <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 -64)
+ %res = call <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 -64)
ret <vscale x 32 x half> %res
}
@@ -1875,17 +1875,17 @@ define <vscale x 32 x half> @splice_nxv32f16_offset_max(<vscale x 32 x half> %a,
; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a0
; CHECK-NEXT: ret
- %res = call <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 63)
+ %res = call <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 63)
ret <vscale x 32 x half> %res
}
-declare <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float>, <vscale x 1 x float>, i32)
+declare <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float>, <vscale x 1 x float>, i32)
define <vscale x 1 x float> @splice_nxv1f32_offset_zero(<vscale x 1 x float> %a, <vscale x 1 x float> %b) #0 {
; CHECK-LABEL: splice_nxv1f32_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 0)
+ %res = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 0)
ret <vscale x 1 x float> %res
}
@@ -1900,7 +1900,7 @@ define <vscale x 1 x float> @splice_nxv1f32_offset_negone(<vscale x 1 x float> %
; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 -1)
+ %res = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 -1)
ret <vscale x 1 x float> %res
}
@@ -1915,7 +1915,7 @@ define <vscale x 1 x float> @splice_nxv1f32_offset_min(<vscale x 1 x float> %a,
; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: ret
- %res = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 -2)
+ %res = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 -2)
ret <vscale x 1 x float> %res
}
@@ -1930,17 +1930,17 @@ define <vscale x 1 x float> @splice_nxv1f32_offset_max(<vscale x 1 x float> %a,
; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 1)
+ %res = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 1)
ret <vscale x 1 x float> %res
}
-declare <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
+declare <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
define <vscale x 2 x float> @splice_nxv2f32_offset_zero(<vscale x 2 x float> %a, <vscale x 2 x float> %b) #0 {
; CHECK-LABEL: splice_nxv2f32_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 0)
+ %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 0)
ret <vscale x 2 x float> %res
}
@@ -1955,7 +1955,7 @@ define <vscale x 2 x float> @splice_nxv2f32_offset_negone(<vscale x 2 x float> %
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -1)
+ %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -1)
ret <vscale x 2 x float> %res
}
@@ -1970,7 +1970,7 @@ define <vscale x 2 x float> @splice_nxv2f32_offset_min(<vscale x 2 x float> %a,
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 4
; CHECK-NEXT: ret
- %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -4)
+ %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -4)
ret <vscale x 2 x float> %res
}
@@ -1985,17 +1985,17 @@ define <vscale x 2 x float> @splice_nxv2f32_offset_max(<vscale x 2 x float> %a,
; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 3)
+ %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 3)
ret <vscale x 2 x float> %res
}
-declare <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
+declare <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
define <vscale x 4 x float> @splice_nxv4f32_offset_zero(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
; CHECK-LABEL: splice_nxv4f32_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 0)
+ %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 0)
ret <vscale x 4 x float> %res
}
@@ -2010,7 +2010,7 @@ define <vscale x 4 x float> @splice_nxv4f32_offset_negone(<vscale x 4 x float> %
; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -1)
+ %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -1)
ret <vscale x 4 x float> %res
}
@@ -2025,7 +2025,7 @@ define <vscale x 4 x float> @splice_nxv4f32_offset_min(<vscale x 4 x float> %a,
; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 8
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -8)
+ %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -8)
ret <vscale x 4 x float> %res
}
@@ -2040,17 +2040,17 @@ define <vscale x 4 x float> @splice_nxv4f32_offset_max(<vscale x 4 x float> %a,
; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v10, a0
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 7)
+ %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 7)
ret <vscale x 4 x float> %res
}
-declare <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>, i32)
+declare <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>, i32)
define <vscale x 8 x float> @splice_nxv8f32_offset_zero(<vscale x 8 x float> %a, <vscale x 8 x float> %b) #0 {
; CHECK-LABEL: splice_nxv8f32_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 0)
+ %res = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 0)
ret <vscale x 8 x float> %res
}
@@ -2064,7 +2064,7 @@ define <vscale x 8 x float> @splice_nxv8f32_offset_negone(<vscale x 8 x float> %
; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v12, 1
; CHECK-NEXT: ret
- %res = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 -1)
+ %res = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 -1)
ret <vscale x 8 x float> %res
}
@@ -2078,7 +2078,7 @@ define <vscale x 8 x float> @splice_nxv8f32_offset_min(<vscale x 8 x float> %a,
; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v12, 16
; CHECK-NEXT: ret
- %res = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 -16)
+ %res = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 -16)
ret <vscale x 8 x float> %res
}
@@ -2092,17 +2092,17 @@ define <vscale x 8 x float> @splice_nxv8f32_offset_max(<vscale x 8 x float> %a,
; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a0
; CHECK-NEXT: ret
- %res = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 15)
+ %res = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 15)
ret <vscale x 8 x float> %res
}
-declare <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
+declare <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
define <vscale x 16 x float> @splice_nxv16f32_offset_zero(<vscale x 16 x float> %a, <vscale x 16 x float> %b) #0 {
; CHECK-LABEL: splice_nxv16f32_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 0)
+ %res = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 0)
ret <vscale x 16 x float> %res
}
@@ -2117,7 +2117,7 @@ define <vscale x 16 x float> @splice_nxv16f32_offset_negone(<vscale x 16 x float
; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; CHECK-NEXT: vslideup.vi v8, v16, 1
; CHECK-NEXT: ret
- %res = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 -1)
+ %res = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 -1)
ret <vscale x 16 x float> %res
}
@@ -2133,7 +2133,7 @@ define <vscale x 16 x float> @splice_nxv16f32_offset_min(<vscale x 16 x float> %
; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a1
; CHECK-NEXT: ret
- %res = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 -32)
+ %res = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 -32)
ret <vscale x 16 x float> %res
}
@@ -2148,17 +2148,17 @@ define <vscale x 16 x float> @splice_nxv16f32_offset_max(<vscale x 16 x float> %
; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a0
; CHECK-NEXT: ret
- %res = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 31)
+ %res = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 31)
ret <vscale x 16 x float> %res
}
-declare <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double>, <vscale x 1 x double>, i32)
+declare <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double>, <vscale x 1 x double>, i32)
define <vscale x 1 x double> @splice_nxv1f64_offset_zero(<vscale x 1 x double> %a, <vscale x 1 x double> %b) #0 {
; CHECK-LABEL: splice_nxv1f64_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 0)
+ %res = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 0)
ret <vscale x 1 x double> %res
}
@@ -2173,7 +2173,7 @@ define <vscale x 1 x double> @splice_nxv1f64_offset_negone(<vscale x 1 x double>
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 -1)
+ %res = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 -1)
ret <vscale x 1 x double> %res
}
@@ -2188,7 +2188,7 @@ define <vscale x 1 x double> @splice_nxv1f64_offset_min(<vscale x 1 x double> %a
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: ret
- %res = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 -2)
+ %res = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 -2)
ret <vscale x 1 x double> %res
}
@@ -2203,17 +2203,17 @@ define <vscale x 1 x double> @splice_nxv1f64_offset_max(<vscale x 1 x double> %a
; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 1)
+ %res = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 1)
ret <vscale x 1 x double> %res
}
-declare <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
+declare <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
define <vscale x 2 x double> @splice_nxv2f64_offset_zero(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
; CHECK-LABEL: splice_nxv2f64_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 0)
+ %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 0)
ret <vscale x 2 x double> %res
}
@@ -2228,7 +2228,7 @@ define <vscale x 2 x double> @splice_nxv2f64_offset_negone(<vscale x 2 x double>
; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
- %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -1)
+ %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -1)
ret <vscale x 2 x double> %res
}
@@ -2243,7 +2243,7 @@ define <vscale x 2 x double> @splice_nxv2f64_offset_min(<vscale x 2 x double> %a
; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 4
; CHECK-NEXT: ret
- %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -4)
+ %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -4)
ret <vscale x 2 x double> %res
}
@@ -2258,17 +2258,17 @@ define <vscale x 2 x double> @splice_nxv2f64_offset_max(<vscale x 2 x double> %a
; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v10, a0
; CHECK-NEXT: ret
- %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 3)
+ %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 3)
ret <vscale x 2 x double> %res
}
-declare <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>, i32)
+declare <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>, i32)
define <vscale x 4 x double> @splice_nxv4f64_offset_zero(<vscale x 4 x double> %a, <vscale x 4 x double> %b) #0 {
; CHECK-LABEL: splice_nxv4f64_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 0)
+ %res = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 0)
ret <vscale x 4 x double> %res
}
@@ -2283,7 +2283,7 @@ define <vscale x 4 x double> @splice_nxv4f64_offset_negone(<vscale x 4 x double>
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v12, 1
; CHECK-NEXT: ret
- %res = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 -1)
+ %res = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 -1)
ret <vscale x 4 x double> %res
}
@@ -2298,7 +2298,7 @@ define <vscale x 4 x double> @splice_nxv4f64_offset_min(<vscale x 4 x double> %a
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v12, 8
; CHECK-NEXT: ret
- %res = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 -8)
+ %res = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 -8)
ret <vscale x 4 x double> %res
}
@@ -2313,17 +2313,17 @@ define <vscale x 4 x double> @splice_nxv4f64_offset_max(<vscale x 4 x double> %a
; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a0
; CHECK-NEXT: ret
- %res = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 7)
+ %res = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 7)
ret <vscale x 4 x double> %res
}
-declare <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double>, <vscale x 8 x double>, i32)
+declare <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double>, <vscale x 8 x double>, i32)
define <vscale x 8 x double> @splice_nxv8f64_offset_zero(<vscale x 8 x double> %a, <vscale x 8 x double> %b) #0 {
; CHECK-LABEL: splice_nxv8f64_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 0)
+ %res = call <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 0)
ret <vscale x 8 x double> %res
}
@@ -2337,7 +2337,7 @@ define <vscale x 8 x double> @splice_nxv8f64_offset_negone(<vscale x 8 x double>
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vslideup.vi v8, v16, 1
; CHECK-NEXT: ret
- %res = call <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 -1)
+ %res = call <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 -1)
ret <vscale x 8 x double> %res
}
@@ -2351,7 +2351,7 @@ define <vscale x 8 x double> @splice_nxv8f64_offset_min(<vscale x 8 x double> %a
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vslideup.vi v8, v16, 16
; CHECK-NEXT: ret
- %res = call <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 -16)
+ %res = call <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 -16)
ret <vscale x 8 x double> %res
}
@@ -2365,7 +2365,7 @@ define <vscale x 8 x double> @splice_nxv8f64_offset_max(<vscale x 8 x double> %a
; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a0
; CHECK-NEXT: ret
- %res = call <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 15)
+ %res = call <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 15)
ret <vscale x 8 x double> %res
}
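Note: the vector-splice hunks above are a purely mechanical rename. The intrinsic graduated out of the experimental namespace, so every declaration and call site moves from llvm.experimental.vector.splice.* to llvm.vector.splice.*, while the CHECK lines (the vsetvli/vslideup sequences) are untouched because the lowering itself did not change. A minimal sketch of the renamed intrinsic, using a hypothetical nxv2i32 variant that is not part of this patch:

; before: experimental namespace
declare <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, i32)
; after: final name, identical semantics -- concatenate %a and %b, then
; extract a full-width window starting at the immediate offset
declare <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, i32)

define <vscale x 2 x i32> @splice_sketch(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b) {
  ; offset -1: the last element of %a followed by the leading elements of %b,
  ; matching the offset_negone cases in the tests above
  %res = call <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 -1)
  ret <vscale x 2 x i32> %res
}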
diff --git a/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
index 4686361ad2fc..a0085afbaf02 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
@@ -44,12 +44,8 @@ define dso_local void @test1(ptr%buf) nounwind {
; CHECK-NEXT: tileloadd 3024(%rsp,%rax), %tmm3 # 1024-byte Folded Reload
; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm0
; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm1
-; CHECK-NEXT: # implicit-def: $rax
-; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm3, 1024(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm2 # 1024-byte Folded Reload
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm2
; CHECK-NEXT: tilestored %tmm2, (%rbx,%r15)
; CHECK-NEXT: incl %r14d
@@ -111,16 +107,10 @@ define dso_local void @test1(ptr%buf) nounwind {
; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x9c,0x04,0xd0,0x0b,0x00,0x00]
; EGPR-NEXT: tileloadd (%rbx,%r15), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x04,0x3b]
; EGPR-NEXT: tileloadd (%rbx,%r15), %tmm1 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x0c,0x3b]
-; EGPR-NEXT: # implicit-def: $rax
-; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: # encoding: [0x48,0x89,0x84,0x24,0xb8,0x03,0x00,0x00]
-; EGPR-NEXT: movabsq $64, %rax # encoding: [0x48,0xb8,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
; EGPR-NEXT: tilestored %tmm3, 1024(%rsp,%rax) # 1024-byte Folded Spill
; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x9c,0x04,0x00,0x04,0x00,0x00]
; EGPR-NEXT: tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm2 # 1024-byte Folded Reload
; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x94,0x24,0x00,0x04,0x00,0x00]
-; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; EGPR-NEXT: # encoding: [0x48,0x8b,0x84,0x24,0xb8,0x03,0x00,0x00]
; EGPR-NEXT: tdpbssd %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x5e,0xd0]
; EGPR-NEXT: tilestored %tmm2, (%rbx,%r15) # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7a,0x4b,0x14,0x3b]
; EGPR-NEXT: incl %r14d # encoding: [0x41,0xff,0xc6]
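Note: the amx-lower-tile-copy.ll hunks regenerate CHECK lines after the tile-copy lowering stopped inserting a save/restore of %rax around the 1024-byte folded tile spill. The old output implicit-def'd %rax, spilled it, materialized the stride with movabsq $64, and reloaded %rax afterwards; the new output issues the tilestored/tileloadd pair directly, presumably because a register holding the stride is already available at that point (the motivation is not visible in this patch, only the dropped spill/reload). The surrounding tdpbssd and loop structure are unchanged.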
diff --git a/llvm/test/CodeGen/X86/avgceils.ll b/llvm/test/CodeGen/X86/avgceils.ll
index 4529ea275df9..f44f98c2a41a 100644
--- a/llvm/test/CodeGen/X86/avgceils.ll
+++ b/llvm/test/CodeGen/X86/avgceils.ll
@@ -9,7 +9,7 @@
; 128-bit vectors
;
-define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -65,7 +65,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
ret <16 x i8> %res
}
-define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; SSE2-LABEL: test_ext_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
@@ -165,7 +165,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
ret <16 x i8> %res
}
-define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -190,7 +190,7 @@ define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
ret <8 x i16> %res
}
-define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE2-LABEL: test_ext_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
@@ -289,7 +289,7 @@ define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
ret <8 x i16> %res
}
-define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -314,7 +314,7 @@ define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
ret <4 x i32> %res
}
-define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
@@ -410,7 +410,7 @@ define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
ret <4 x i32> %res
}
-define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; SSE2-LABEL: test_fixed_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
@@ -472,7 +472,7 @@ define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
ret <2 x i64> %res
}
-define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rax
@@ -574,7 +574,7 @@ define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
; 256-bit vectors
;
-define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -649,7 +649,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
ret <32 x i8> %res
}
-define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; SSE2-LABEL: test_ext_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
@@ -806,7 +806,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
ret <32 x i8> %res
}
-define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -858,7 +858,7 @@ define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
ret <16 x i16> %res
}
-define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE2-LABEL: test_ext_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
@@ -1014,7 +1014,7 @@ define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
ret <16 x i16> %res
}
-define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v8i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -1066,7 +1066,7 @@ define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
ret <8 x i32> %res
}
-define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
@@ -1218,7 +1218,7 @@ define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
ret <8 x i32> %res
}
-define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE2-LABEL: test_fixed_v4i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm4
@@ -1306,27 +1306,15 @@ define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
ret <4 x i64> %res
}
-define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v4i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: pushq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: pushq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: pushq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 56
-; SSE2-NEXT: .cfi_offset %rbx, -56
-; SSE2-NEXT: .cfi_offset %r12, -48
-; SSE2-NEXT: .cfi_offset %r13, -40
-; SSE2-NEXT: .cfi_offset %r14, -32
-; SSE2-NEXT: .cfi_offset %r15, -24
-; SSE2-NEXT: .cfi_offset %rbp, -16
; SSE2-NEXT: movq %xmm0, %r11
; SSE2-NEXT: movq %r11, %r12
; SSE2-NEXT: sarq $63, %r12
@@ -1382,39 +1370,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE2-NEXT: popq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: popq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: popq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: popq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: popq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: popq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 8
; SSE2-NEXT: retq
;
; SSE4-LABEL: test_ext_v4i64:
; SSE4: # %bb.0:
; SSE4-NEXT: pushq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: pushq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: pushq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: pushq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: pushq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: pushq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 56
-; SSE4-NEXT: .cfi_offset %rbx, -56
-; SSE4-NEXT: .cfi_offset %r12, -48
-; SSE4-NEXT: .cfi_offset %r13, -40
-; SSE4-NEXT: .cfi_offset %r14, -32
-; SSE4-NEXT: .cfi_offset %r15, -24
-; SSE4-NEXT: .cfi_offset %rbp, -16
; SSE4-NEXT: pextrq $1, %xmm0, %r11
; SSE4-NEXT: movq %r11, %r12
; SSE4-NEXT: sarq $63, %r12
@@ -1466,39 +1436,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE4-NEXT: popq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: popq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: popq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: popq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: popq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: popq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 8
; SSE4-NEXT: retq
;
; AVX1-LABEL: test_ext_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 56
-; AVX1-NEXT: .cfi_offset %rbx, -56
-; AVX1-NEXT: .cfi_offset %r12, -48
-; AVX1-NEXT: .cfi_offset %r13, -40
-; AVX1-NEXT: .cfi_offset %r14, -32
-; AVX1-NEXT: .cfi_offset %r15, -24
-; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpextrq $1, %xmm2, %r11
; AVX1-NEXT: movq %r11, %r12
@@ -1553,39 +1505,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: popq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: popq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: popq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: popq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: popq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 8
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_ext_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 56
-; AVX2-NEXT: .cfi_offset %rbx, -56
-; AVX2-NEXT: .cfi_offset %r12, -48
-; AVX2-NEXT: .cfi_offset %r13, -40
-; AVX2-NEXT: .cfi_offset %r14, -32
-; AVX2-NEXT: .cfi_offset %r15, -24
-; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpextrq $1, %xmm2, %r11
; AVX2-NEXT: movq %r11, %r12
@@ -1640,39 +1574,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: popq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: popq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: popq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: popq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 8
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_ext_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 56
-; AVX512-NEXT: .cfi_offset %rbx, -56
-; AVX512-NEXT: .cfi_offset %r12, -48
-; AVX512-NEXT: .cfi_offset %r13, -40
-; AVX512-NEXT: .cfi_offset %r14, -32
-; AVX512-NEXT: .cfi_offset %r15, -24
-; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-NEXT: vpextrq $1, %xmm2, %r11
; AVX512-NEXT: movq %r11, %r12
@@ -1727,17 +1643,11 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: popq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: popq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: popq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: popq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 8
; AVX512-NEXT: retq
%x0 = sext <4 x i64> %a0 to <4 x i128>
%x1 = sext <4 x i64> %a1 to <4 x i128>
@@ -1752,7 +1662,7 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; 512-bit vectors
;
-define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v64i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm11
@@ -1864,7 +1774,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
ret <64 x i8> %res
}
-define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; SSE2-LABEL: test_ext_v64i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm3, %xmm8
@@ -2144,7 +2054,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
ret <64 x i8> %res
}
-define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm8
@@ -2220,7 +2130,7 @@ define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
ret <32 x i16> %res
}
-define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; SSE2-LABEL: test_ext_v32i16:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
@@ -2498,7 +2408,7 @@ define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
ret <32 x i16> %res
}
-define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm8
@@ -2574,7 +2484,7 @@ define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
ret <16 x i32> %res
}
-define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v16i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[2,3,2,3]
@@ -2848,7 +2758,7 @@ define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
ret <16 x i32> %res
}
-define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE2-LABEL: test_fixed_v8i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm3, %xmm8
@@ -2985,29 +2895,16 @@ define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
ret <8 x i64> %res
}
-define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v8i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: pushq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: pushq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: pushq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 56
; SSE2-NEXT: pushq %rax
-; SSE2-NEXT: .cfi_def_cfa_offset 64
-; SSE2-NEXT: .cfi_offset %rbx, -56
-; SSE2-NEXT: .cfi_offset %r12, -48
-; SSE2-NEXT: .cfi_offset %r13, -40
-; SSE2-NEXT: .cfi_offset %r14, -32
-; SSE2-NEXT: .cfi_offset %r15, -24
-; SSE2-NEXT: .cfi_offset %rbp, -16
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: sarq $63, %rax
@@ -3137,43 +3034,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; SSE2-NEXT: addq $8, %rsp
-; SSE2-NEXT: .cfi_def_cfa_offset 56
; SSE2-NEXT: popq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: popq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: popq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: popq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: popq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: popq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 8
; SSE2-NEXT: retq
;
; SSE4-LABEL: test_ext_v8i64:
; SSE4: # %bb.0:
; SSE4-NEXT: pushq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: pushq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: pushq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: pushq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: pushq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: pushq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 56
; SSE4-NEXT: subq $16, %rsp
-; SSE4-NEXT: .cfi_def_cfa_offset 72
-; SSE4-NEXT: .cfi_offset %rbx, -56
-; SSE4-NEXT: .cfi_offset %r12, -48
-; SSE4-NEXT: .cfi_offset %r13, -40
-; SSE4-NEXT: .cfi_offset %r14, -32
-; SSE4-NEXT: .cfi_offset %r15, -24
-; SSE4-NEXT: .cfi_offset %rbp, -16
; SSE4-NEXT: pextrq $1, %xmm0, %rax
; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE4-NEXT: sarq $63, %rax
@@ -3301,43 +3178,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; SSE4-NEXT: addq $16, %rsp
-; SSE4-NEXT: .cfi_def_cfa_offset 56
; SSE4-NEXT: popq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: popq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: popq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: popq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: popq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: popq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 8
; SSE4-NEXT: retq
;
; AVX1-LABEL: test_ext_v8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 56
; AVX1-NEXT: pushq %rax
-; AVX1-NEXT: .cfi_def_cfa_offset 64
-; AVX1-NEXT: .cfi_offset %rbx, -56
-; AVX1-NEXT: .cfi_offset %r12, -48
-; AVX1-NEXT: .cfi_offset %r13, -40
-; AVX1-NEXT: .cfi_offset %r14, -32
-; AVX1-NEXT: .cfi_offset %r15, -24
-; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpextrq $1, %xmm4, %rax
; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -3465,43 +3322,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: addq $8, %rsp
-; AVX1-NEXT: .cfi_def_cfa_offset 56
; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: popq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: popq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: popq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: popq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: popq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 8
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_ext_v8i64:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 56
; AVX2-NEXT: pushq %rax
-; AVX2-NEXT: .cfi_def_cfa_offset 64
-; AVX2-NEXT: .cfi_offset %rbx, -56
-; AVX2-NEXT: .cfi_offset %r12, -48
-; AVX2-NEXT: .cfi_offset %r13, -40
-; AVX2-NEXT: .cfi_offset %r14, -32
-; AVX2-NEXT: .cfi_offset %r15, -24
-; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT: vpextrq $1, %xmm4, %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -3629,43 +3466,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX2-NEXT: addq $8, %rsp
-; AVX2-NEXT: .cfi_def_cfa_offset 56
; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: popq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: popq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: popq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: popq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 8
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_ext_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 56
; AVX512-NEXT: pushq %rax
-; AVX512-NEXT: .cfi_def_cfa_offset 64
-; AVX512-NEXT: .cfi_offset %rbx, -56
-; AVX512-NEXT: .cfi_offset %r12, -48
-; AVX512-NEXT: .cfi_offset %r13, -40
-; AVX512-NEXT: .cfi_offset %r14, -32
-; AVX512-NEXT: .cfi_offset %r15, -24
-; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512-NEXT: vpextrq $1, %xmm3, %rax
@@ -3796,19 +3613,12 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT: addq $8, %rsp
-; AVX512-NEXT: .cfi_def_cfa_offset 56
; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: popq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: popq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: popq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: popq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 8
; AVX512-NEXT: retq
%x0 = sext <8 x i64> %a0 to <8 x i128>
%x1 = sext <8 x i64> %a1 to <8 x i128>
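Note: in avgceils.ll the only IR change is adding nounwind to every test function. With unwinding ruled out, the backend no longer emits .cfi_def_cfa_offset/.cfi_offset directives for the pushes and pops in the scalarized i64 cases, which is why the regenerated CHECK blocks shed dozens of CFI lines while the instruction sequences themselves stay the same. The functions under test compute a rounded-up ("ceil") signed average; a minimal sketch of the widening test_ext pattern, assuming a hypothetical v4i32 case (the real file covers i8 through i64 at 128/256/512 bits):

define <4 x i32> @avgceils_ext_sketch(<4 x i32> %a0, <4 x i32> %a1) nounwind {
  ; widen so the sum cannot overflow
  %x0 = sext <4 x i32> %a0 to <4 x i64>
  %x1 = sext <4 x i32> %a1 to <4 x i64>
  %sum = add <4 x i64> %x0, %x1
  ; adding 1 before the arithmetic halving shift turns floor into ceil
  %inc = add <4 x i64> %sum, <i64 1, i64 1, i64 1, i64 1>
  %half = ashr <4 x i64> %inc, <i64 1, i64 1, i64 1, i64 1>
  %res = trunc <4 x i64> %half to <4 x i32>
  ret <4 x i32> %res
}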
diff --git a/llvm/test/CodeGen/X86/avgceilu.ll b/llvm/test/CodeGen/X86/avgceilu.ll
index dee1a5a720f9..d34894cc0fbb 100644
--- a/llvm/test/CodeGen/X86/avgceilu.ll
+++ b/llvm/test/CodeGen/X86/avgceilu.ll
@@ -9,7 +9,7 @@
; 128-bit vectors
;
-define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: pavgb %xmm1, %xmm0
@@ -26,7 +26,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
ret <16 x i8> %res
}
-define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; SSE-LABEL: test_ext_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: pavgb %xmm1, %xmm0
@@ -45,7 +45,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
ret <16 x i8> %res
}
-define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pavgw %xmm1, %xmm0
@@ -62,7 +62,7 @@ define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
ret <8 x i16> %res
}
-define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: test_ext_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pavgw %xmm1, %xmm0
@@ -81,7 +81,7 @@ define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
ret <8 x i16> %res
}
-define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -106,7 +106,7 @@ define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
ret <4 x i32> %res
}
-define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm3, %xmm3
@@ -195,7 +195,7 @@ define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
ret <4 x i32> %res
}
-define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; SSE-LABEL: test_fixed_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -220,7 +220,7 @@ define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
ret <2 x i64> %res
}
-define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
@@ -310,7 +310,7 @@ define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
; 256-bit vectors
;
-define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: pavgb %xmm2, %xmm0
@@ -342,7 +342,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
ret <32 x i8> %res
}
-define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; SSE-LABEL: test_ext_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: pavgb %xmm2, %xmm0
@@ -376,7 +376,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
ret <32 x i8> %res
}
-define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: pavgw %xmm2, %xmm0
@@ -408,7 +408,7 @@ define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
ret <16 x i16> %res
}
-define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: test_ext_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: pavgw %xmm2, %xmm0
@@ -442,7 +442,7 @@ define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
ret <16 x i16> %res
}
-define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v8i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -494,7 +494,7 @@ define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
ret <8 x i32> %res
}
-define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm4
@@ -629,7 +629,7 @@ define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
ret <8 x i32> %res
}
-define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: test_fixed_v4i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -681,7 +681,7 @@ define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
ret <4 x i64> %res
}
-define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v4i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
@@ -937,7 +937,7 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; 512-bit vectors
;
-define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v64i8:
; SSE: # %bb.0:
; SSE-NEXT: pavgb %xmm4, %xmm0
@@ -977,7 +977,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
ret <64 x i8> %res
}
-define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; SSE-LABEL: test_ext_v64i8:
; SSE: # %bb.0:
; SSE-NEXT: pavgb %xmm4, %xmm0
@@ -1019,7 +1019,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
ret <64 x i8> %res
}
-define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: pavgw %xmm4, %xmm0
@@ -1059,7 +1059,7 @@ define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
ret <32 x i16> %res
}
-define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; SSE-LABEL: test_ext_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: pavgw %xmm4, %xmm0
@@ -1101,7 +1101,7 @@ define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
ret <32 x i16> %res
}
-define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm8
@@ -1177,7 +1177,7 @@ define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
ret <16 x i32> %res
}
-define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v16i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm2, %xmm8
@@ -1413,7 +1413,7 @@ define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
ret <16 x i32> %res
}
-define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: test_fixed_v8i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm8
@@ -1489,27 +1489,15 @@ define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
ret <8 x i64> %res
}
-define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v8i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: pushq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: pushq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: pushq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 56
-; SSE2-NEXT: .cfi_offset %rbx, -56
-; SSE2-NEXT: .cfi_offset %r12, -48
-; SSE2-NEXT: .cfi_offset %r13, -40
-; SSE2-NEXT: .cfi_offset %r14, -32
-; SSE2-NEXT: .cfi_offset %r15, -24
-; SSE2-NEXT: .cfi_offset %rbp, -16
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
; SSE2-NEXT: movq %xmm8, %rcx
; SSE2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1617,39 +1605,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; SSE2-NEXT: popq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: popq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: popq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: popq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: popq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: popq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 8
; SSE2-NEXT: retq
;
; SSE4-LABEL: test_ext_v8i64:
; SSE4: # %bb.0:
; SSE4-NEXT: pushq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: pushq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: pushq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: pushq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: pushq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: pushq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 56
-; SSE4-NEXT: .cfi_offset %rbx, -56
-; SSE4-NEXT: .cfi_offset %r12, -48
-; SSE4-NEXT: .cfi_offset %r13, -40
-; SSE4-NEXT: .cfi_offset %r14, -32
-; SSE4-NEXT: .cfi_offset %r15, -24
-; SSE4-NEXT: .cfi_offset %rbp, -16
; SSE4-NEXT: movq %xmm3, %rcx
; SSE4-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE4-NEXT: movq %xmm7, %rdx
@@ -1747,39 +1717,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; SSE4-NEXT: popq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: popq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: popq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: popq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: popq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: popq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 8
; SSE4-NEXT: retq
;
; AVX1-LABEL: test_ext_v8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 56
-; AVX1-NEXT: .cfi_offset %rbx, -56
-; AVX1-NEXT: .cfi_offset %r12, -48
-; AVX1-NEXT: .cfi_offset %r13, -40
-; AVX1-NEXT: .cfi_offset %r14, -32
-; AVX1-NEXT: .cfi_offset %r15, -24
-; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: vmovq %xmm1, %rcx
; AVX1-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: vmovq %xmm3, %rdx
@@ -1885,39 +1837,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: popq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: popq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: popq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: popq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: popq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 8
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_ext_v8i64:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 56
-; AVX2-NEXT: .cfi_offset %rbx, -56
-; AVX2-NEXT: .cfi_offset %r12, -48
-; AVX2-NEXT: .cfi_offset %r13, -40
-; AVX2-NEXT: .cfi_offset %r14, -32
-; AVX2-NEXT: .cfi_offset %r15, -24
-; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: vmovq %xmm1, %rcx
; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: vmovq %xmm3, %rdx
@@ -2023,39 +1957,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: popq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: popq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: popq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: popq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 8
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_ext_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 56
-; AVX512-NEXT: .cfi_offset %rbx, -56
-; AVX512-NEXT: .cfi_offset %r12, -48
-; AVX512-NEXT: .cfi_offset %r13, -40
-; AVX512-NEXT: .cfi_offset %r14, -32
-; AVX512-NEXT: .cfi_offset %r15, -24
-; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: vmovq %xmm0, %rcx
; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: vmovq %xmm1, %rdx
@@ -2164,17 +2080,11 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: popq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: popq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: popq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: popq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 8
; AVX512-NEXT: retq
%x0 = zext <8 x i64> %a0 to <8 x i128>
%x1 = zext <8 x i64> %a1 to <8 x i128>
diff --git a/llvm/test/CodeGen/X86/avgfloors.ll b/llvm/test/CodeGen/X86/avgfloors.ll
index a3864ab4bb44..efee831a15c7 100644
--- a/llvm/test/CodeGen/X86/avgfloors.ll
+++ b/llvm/test/CodeGen/X86/avgfloors.ll
@@ -9,7 +9,7 @@
; 128-bit vectors
;
-define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -64,7 +64,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
ret <16 x i8> %res
}
-define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; SSE2-LABEL: test_ext_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
@@ -150,7 +150,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
ret <16 x i8> %res
}
-define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -174,7 +174,7 @@ define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
ret <8 x i16> %res
}
-define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE2-LABEL: test_ext_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
@@ -259,7 +259,7 @@ define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
ret <8 x i16> %res
}
-define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -283,7 +283,7 @@ define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
ret <4 x i32> %res
}
-define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
@@ -365,7 +365,7 @@ define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
ret <4 x i32> %res
}
-define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; SSE2-LABEL: test_fixed_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
@@ -425,7 +425,7 @@ define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
ret <2 x i64> %res
}
-define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
@@ -514,7 +514,7 @@ define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
; 256-bit vectors
;
-define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -587,7 +587,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
ret <32 x i8> %res
}
-define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; SSE2-LABEL: test_ext_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
@@ -723,7 +723,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
ret <32 x i8> %res
}
-define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -773,7 +773,7 @@ define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
ret <16 x i16> %res
}
-define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE2-LABEL: test_ext_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
@@ -908,7 +908,7 @@ define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
ret <16 x i16> %res
}
-define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v8i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -958,7 +958,7 @@ define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
ret <8 x i32> %res
}
-define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
@@ -1089,7 +1089,7 @@ define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
ret <8 x i32> %res
}
-define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE2-LABEL: test_fixed_v4i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm4
@@ -1173,27 +1173,15 @@ define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
ret <4 x i64> %res
}
-define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v4i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: pushq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: pushq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: pushq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 56
-; SSE2-NEXT: .cfi_offset %rbx, -56
-; SSE2-NEXT: .cfi_offset %r12, -48
-; SSE2-NEXT: .cfi_offset %r13, -40
-; SSE2-NEXT: .cfi_offset %r14, -32
-; SSE2-NEXT: .cfi_offset %r15, -24
-; SSE2-NEXT: .cfi_offset %rbp, -16
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm4, %rdx
; SSE2-NEXT: movq %rdx, %r14
@@ -1241,39 +1229,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE2-NEXT: popq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: popq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: popq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: popq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: popq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: popq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 8
; SSE2-NEXT: retq
;
; SSE4-LABEL: test_ext_v4i64:
; SSE4: # %bb.0:
; SSE4-NEXT: pushq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: pushq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: pushq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: pushq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: pushq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: pushq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 56
-; SSE4-NEXT: .cfi_offset %rbx, -56
-; SSE4-NEXT: .cfi_offset %r12, -48
-; SSE4-NEXT: .cfi_offset %r13, -40
-; SSE4-NEXT: .cfi_offset %r14, -32
-; SSE4-NEXT: .cfi_offset %r15, -24
-; SSE4-NEXT: .cfi_offset %rbp, -16
; SSE4-NEXT: movq %xmm1, %rdi
; SSE4-NEXT: movq %rdi, %r14
; SSE4-NEXT: sarq $63, %r14
@@ -1317,39 +1287,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE4-NEXT: popq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: popq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: popq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: popq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: popq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: popq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 8
; SSE4-NEXT: retq
;
; AVX1-LABEL: test_ext_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 56
-; AVX1-NEXT: .cfi_offset %rbx, -56
-; AVX1-NEXT: .cfi_offset %r12, -48
-; AVX1-NEXT: .cfi_offset %r13, -40
-; AVX1-NEXT: .cfi_offset %r14, -32
-; AVX1-NEXT: .cfi_offset %r15, -24
-; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: vmovq %xmm0, %rdx
; AVX1-NEXT: movq %rdx, %r14
; AVX1-NEXT: sarq $63, %r14
@@ -1396,39 +1348,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: popq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: popq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: popq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: popq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: popq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 8
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_ext_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 56
-; AVX2-NEXT: .cfi_offset %rbx, -56
-; AVX2-NEXT: .cfi_offset %r12, -48
-; AVX2-NEXT: .cfi_offset %r13, -40
-; AVX2-NEXT: .cfi_offset %r14, -32
-; AVX2-NEXT: .cfi_offset %r15, -24
-; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: vmovq %xmm0, %rdx
; AVX2-NEXT: movq %rdx, %r14
; AVX2-NEXT: sarq $63, %r14
@@ -1475,39 +1409,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: popq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: popq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: popq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: popq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 8
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_ext_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 56
-; AVX512-NEXT: .cfi_offset %rbx, -56
-; AVX512-NEXT: .cfi_offset %r12, -48
-; AVX512-NEXT: .cfi_offset %r13, -40
-; AVX512-NEXT: .cfi_offset %r14, -32
-; AVX512-NEXT: .cfi_offset %r15, -24
-; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: vmovq %xmm0, %rdx
; AVX512-NEXT: movq %rdx, %r14
; AVX512-NEXT: sarq $63, %r14
@@ -1554,17 +1470,11 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: popq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: popq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: popq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: popq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 8
; AVX512-NEXT: retq
%x0 = sext <4 x i64> %a0 to <4 x i128>
%x1 = sext <4 x i64> %a1 to <4 x i128>
@@ -1578,7 +1488,7 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; 512-bit vectors
;
-define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v64i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm10
@@ -1690,7 +1600,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
ret <64 x i8> %res
}
-define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; SSE2-LABEL: test_ext_v64i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
@@ -1934,7 +1844,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
ret <64 x i8> %res
}
-define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm8
@@ -2007,7 +1917,7 @@ define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
ret <32 x i16> %res
}
-define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; SSE2-LABEL: test_ext_v32i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm3, %xmm9
@@ -2251,7 +2161,7 @@ define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
ret <32 x i16> %res
}
-define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm8
@@ -2324,7 +2234,7 @@ define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
ret <16 x i32> %res
}
-define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v16i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm8, %xmm8
@@ -2561,7 +2471,7 @@ define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
ret <16 x i32> %res
}
-define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE2-LABEL: test_fixed_v8i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm3, %xmm11
@@ -2698,29 +2608,16 @@ define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
ret <8 x i64> %res
}
-define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v8i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: pushq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: pushq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: pushq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 56
; SSE2-NEXT: pushq %rax
-; SSE2-NEXT: .cfi_def_cfa_offset 64
-; SSE2-NEXT: .cfi_offset %rbx, -56
-; SSE2-NEXT: .cfi_offset %r12, -48
-; SSE2-NEXT: .cfi_offset %r13, -40
-; SSE2-NEXT: .cfi_offset %r14, -32
-; SSE2-NEXT: .cfi_offset %r15, -24
-; SSE2-NEXT: .cfi_offset %rbp, -16
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
; SSE2-NEXT: movq %xmm8, %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -2832,43 +2729,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; SSE2-NEXT: addq $8, %rsp
-; SSE2-NEXT: .cfi_def_cfa_offset 56
; SSE2-NEXT: popq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: popq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: popq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: popq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: popq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: popq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 8
; SSE2-NEXT: retq
;
; SSE4-LABEL: test_ext_v8i64:
; SSE4: # %bb.0:
; SSE4-NEXT: pushq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: pushq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: pushq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: pushq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: pushq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: pushq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 56
; SSE4-NEXT: pushq %rax
-; SSE4-NEXT: .cfi_def_cfa_offset 64
-; SSE4-NEXT: .cfi_offset %rbx, -56
-; SSE4-NEXT: .cfi_offset %r12, -48
-; SSE4-NEXT: .cfi_offset %r13, -40
-; SSE4-NEXT: .cfi_offset %r14, -32
-; SSE4-NEXT: .cfi_offset %r15, -24
-; SSE4-NEXT: .cfi_offset %rbp, -16
; SSE4-NEXT: movq %xmm3, %rax
; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE4-NEXT: movq %rax, %rcx
@@ -2972,43 +2849,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; SSE4-NEXT: addq $8, %rsp
-; SSE4-NEXT: .cfi_def_cfa_offset 56
; SSE4-NEXT: popq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: popq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: popq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: popq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: popq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: popq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 8
; SSE4-NEXT: retq
;
; AVX1-LABEL: test_ext_v8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 56
; AVX1-NEXT: pushq %rax
-; AVX1-NEXT: .cfi_def_cfa_offset 64
-; AVX1-NEXT: .cfi_offset %rbx, -56
-; AVX1-NEXT: .cfi_offset %r12, -48
-; AVX1-NEXT: .cfi_offset %r13, -40
-; AVX1-NEXT: .cfi_offset %r14, -32
-; AVX1-NEXT: .cfi_offset %r15, -24
-; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: movq %rax, %rcx
@@ -3118,43 +2975,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: addq $8, %rsp
-; AVX1-NEXT: .cfi_def_cfa_offset 56
; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: popq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: popq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: popq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: popq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: popq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 8
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_ext_v8i64:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 56
; AVX2-NEXT: pushq %rax
-; AVX2-NEXT: .cfi_def_cfa_offset 64
-; AVX2-NEXT: .cfi_offset %rbx, -56
-; AVX2-NEXT: .cfi_offset %r12, -48
-; AVX2-NEXT: .cfi_offset %r13, -40
-; AVX2-NEXT: .cfi_offset %r14, -32
-; AVX2-NEXT: .cfi_offset %r15, -24
-; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: vmovq %xmm1, %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: movq %rax, %rcx
@@ -3264,43 +3101,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX2-NEXT: addq $8, %rsp
-; AVX2-NEXT: .cfi_def_cfa_offset 56
; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: popq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: popq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: popq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: popq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 8
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_ext_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 56
; AVX512-NEXT: pushq %rax
-; AVX512-NEXT: .cfi_def_cfa_offset 64
-; AVX512-NEXT: .cfi_offset %rbx, -56
-; AVX512-NEXT: .cfi_offset %r12, -48
-; AVX512-NEXT: .cfi_offset %r13, -40
-; AVX512-NEXT: .cfi_offset %r14, -32
-; AVX512-NEXT: .cfi_offset %r15, -24
-; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movq %rax, %rcx
@@ -3413,19 +3230,12 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT: addq $8, %rsp
-; AVX512-NEXT: .cfi_def_cfa_offset 56
; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: popq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: popq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: popq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: popq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 8
; AVX512-NEXT: retq
%x0 = sext <8 x i64> %a0 to <8 x i128>
%x1 = sext <8 x i64> %a1 to <8 x i128>
diff --git a/llvm/test/CodeGen/X86/avgflooru.ll b/llvm/test/CodeGen/X86/avgflooru.ll
index e07c1f55991e..000457c5ab1e 100644
--- a/llvm/test/CodeGen/X86/avgflooru.ll
+++ b/llvm/test/CodeGen/X86/avgflooru.ll
@@ -9,7 +9,7 @@
; 128-bit vectors
;
-define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -53,7 +53,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
ret <16 x i8> %res
}
-define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; SSE2-LABEL: test_ext_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
@@ -127,7 +127,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
ret <16 x i8> %res
}
-define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -151,7 +151,7 @@ define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
ret <8 x i16> %res
}
-define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE2-LABEL: test_ext_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
@@ -227,7 +227,7 @@ define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
ret <8 x i16> %res
}
-define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -251,7 +251,7 @@ define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
ret <4 x i32> %res
}
-define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
@@ -325,7 +325,7 @@ define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
ret <4 x i32> %res
}
-define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; SSE-LABEL: test_fixed_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -349,7 +349,7 @@ define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
ret <2 x i64> %res
}
-define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
@@ -458,7 +458,7 @@ define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
; 256-bit vectors
;
-define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -516,7 +516,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
ret <32 x i8> %res
}
-define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; SSE2-LABEL: test_ext_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
@@ -627,7 +627,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
ret <32 x i8> %res
}
-define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -677,7 +677,7 @@ define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
ret <16 x i16> %res
}
-define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE2-LABEL: test_ext_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
@@ -792,7 +792,7 @@ define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
ret <16 x i16> %res
}
-define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v8i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -842,7 +842,7 @@ define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
ret <8 x i32> %res
}
-define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
@@ -954,7 +954,7 @@ define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
ret <8 x i32> %res
}
-define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: test_fixed_v4i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -1004,7 +1004,7 @@ define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
ret <4 x i64> %res
}
-define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v4i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
@@ -1199,7 +1199,7 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; 512-bit vectors
;
-define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v64i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm9
@@ -1286,7 +1286,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
ret <64 x i8> %res
}
-define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; SSE2-LABEL: test_ext_v64i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm8, %xmm8
@@ -1481,7 +1481,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
ret <64 x i8> %res
}
-define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm8
@@ -1554,7 +1554,7 @@ define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
ret <32 x i16> %res
}
-define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; SSE2-LABEL: test_ext_v32i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm8, %xmm8
@@ -1757,7 +1757,7 @@ define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
ret <32 x i16> %res
}
-define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm8
@@ -1830,7 +1830,7 @@ define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
ret <16 x i32> %res
}
-define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v16i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm8, %xmm8
@@ -2027,7 +2027,7 @@ define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
ret <16 x i32> %res
}
-define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: test_fixed_v8i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm8
@@ -2100,27 +2100,15 @@ define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
ret <8 x i64> %res
}
-define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v8i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: pushq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: pushq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: pushq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 56
-; SSE2-NEXT: .cfi_offset %rbx, -56
-; SSE2-NEXT: .cfi_offset %r12, -48
-; SSE2-NEXT: .cfi_offset %r13, -40
-; SSE2-NEXT: .cfi_offset %r14, -32
-; SSE2-NEXT: .cfi_offset %r15, -24
-; SSE2-NEXT: .cfi_offset %rbp, -16
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
; SSE2-NEXT: movq %xmm3, %rbx
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
@@ -2194,39 +2182,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; SSE2-NEXT: popq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: popq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: popq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: popq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: popq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: popq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 8
; SSE2-NEXT: retq
;
; SSE4-LABEL: test_ext_v8i64:
; SSE4: # %bb.0:
; SSE4-NEXT: pushq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: pushq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: pushq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: pushq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: pushq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: pushq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 56
-; SSE4-NEXT: .cfi_offset %rbx, -56
-; SSE4-NEXT: .cfi_offset %r12, -48
-; SSE4-NEXT: .cfi_offset %r13, -40
-; SSE4-NEXT: .cfi_offset %r14, -32
-; SSE4-NEXT: .cfi_offset %r15, -24
-; SSE4-NEXT: .cfi_offset %rbp, -16
; SSE4-NEXT: pextrq $1, %xmm3, %r14
; SSE4-NEXT: movq %xmm2, %r13
; SSE4-NEXT: pextrq $1, %xmm2, %rbp
@@ -2292,39 +2262,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; SSE4-NEXT: popq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: popq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: popq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: popq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: popq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: popq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 8
; SSE4-NEXT: retq
;
; AVX1-LABEL: test_ext_v8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 56
-; AVX1-NEXT: .cfi_offset %rbx, -56
-; AVX1-NEXT: .cfi_offset %r12, -48
-; AVX1-NEXT: .cfi_offset %r13, -40
-; AVX1-NEXT: .cfi_offset %r14, -32
-; AVX1-NEXT: .cfi_offset %r15, -24
-; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: vpextrq $1, %xmm1, %rbx
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vmovq %xmm4, %r15
@@ -2396,39 +2348,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: popq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: popq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: popq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: popq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: popq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 8
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_ext_v8i64:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 56
-; AVX2-NEXT: .cfi_offset %rbx, -56
-; AVX2-NEXT: .cfi_offset %r12, -48
-; AVX2-NEXT: .cfi_offset %r13, -40
-; AVX2-NEXT: .cfi_offset %r14, -32
-; AVX2-NEXT: .cfi_offset %r15, -24
-; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: vpextrq $1, %xmm1, %rbx
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
; AVX2-NEXT: vmovq %xmm4, %r15
@@ -2500,39 +2434,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: popq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: popq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: popq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: popq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 8
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_ext_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 56
-; AVX512-NEXT: .cfi_offset %rbx, -56
-; AVX512-NEXT: .cfi_offset %r12, -48
-; AVX512-NEXT: .cfi_offset %r13, -40
-; AVX512-NEXT: .cfi_offset %r14, -32
-; AVX512-NEXT: .cfi_offset %r15, -24
-; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: vpextrq $1, %xmm0, %r10
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-NEXT: vpextrq $1, %xmm2, %r13
@@ -2607,17 +2523,11 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: popq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: popq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: popq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: popq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 8
; AVX512-NEXT: retq
%x0 = zext <8 x i64> %a0 to <8 x i128>
%x1 = zext <8 x i64> %a1 to <8 x i128>
diff --git a/llvm/test/CodeGen/X86/avx512bwvl-arith.ll b/llvm/test/CodeGen/X86/avx512bwvl-arith.ll
index 4988fc35b10e..33819c9e0102 100644
--- a/llvm/test/CodeGen/X86/avx512bwvl-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-arith.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,-evex512 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,EVEX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,-evex512 | FileCheck %s --check-prefixes=CHECK,EVEX256
; 256-bit
@@ -236,3 +236,34 @@ define <8 x i16> @vpmullw128_test(<8 x i16> %i, <8 x i16> %j) {
ret <8 x i16> %x
}
+define i16 @PR90356(<16 x i1> %a) {
+; EVEX512-LABEL: PR90356:
+; EVEX512: # %bb.0:
+; EVEX512-NEXT: vpsllw $7, %xmm0, %xmm0
+; EVEX512-NEXT: vpmovb2m %xmm0, %k1
+; EVEX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; EVEX512-NEXT: movb $63, %al
+; EVEX512-NEXT: kmovd %eax, %k1
+; EVEX512-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; EVEX512-NEXT: vptestmd %zmm0, %zmm0, %k0
+; EVEX512-NEXT: kmovd %k0, %eax
+; EVEX512-NEXT: # kill: def $ax killed $ax killed $eax
+; EVEX512-NEXT: vzeroupper
+; EVEX512-NEXT: retq
+;
+; EVEX256-LABEL: PR90356:
+; EVEX256: # %bb.0:
+; EVEX256-NEXT: vpsllw $7, %xmm0, %xmm0
+; EVEX256-NEXT: vpmovb2m %xmm0, %k0
+; EVEX256-NEXT: vpmovm2w %k0, %ymm0
+; EVEX256-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; EVEX256-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; EVEX256-NEXT: vpmovw2m %ymm0, %k0
+; EVEX256-NEXT: kmovd %k0, %eax
+; EVEX256-NEXT: # kill: def $ax killed $ax killed $eax
+; EVEX256-NEXT: vzeroupper
+; EVEX256-NEXT: retq
+ %1 = shufflevector <16 x i1> %a, <16 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31>
+ %2 = bitcast <16 x i1> %1 to i16
+ ret i16 %2
+}
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index bf7c1c00c71d..33cc8e96f663 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -178,18 +178,18 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: subl $152, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: sarl $31, %eax
-; X86-NEXT: movl %ebp, %edx
-; X86-NEXT: sarl $31, %edx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: sarl $31, %ebx
; X86-NEXT: movl %eax, %esi
; X86-NEXT: xorl %ecx, %esi
-; X86-NEXT: movl %esi, %edi
+; X86-NEXT: movl %esi, %ebp
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: movl %ecx, %edi
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %eax, %esi
@@ -198,28 +198,29 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %eax, %ebx
-; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill
; X86-NEXT: sbbl %eax, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: xorl %ebp, %edi
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: xorl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %ebp
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: xorl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %ebx, %ebp
; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: subl %edx, %ebp
-; X86-NEXT: sbbl %edx, %esi
-; X86-NEXT: sbbl %edx, %ebx
-; X86-NEXT: sbbl %edx, %edi
-; X86-NEXT: xorl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: orl %edi, %eax
+; X86-NEXT: subl %ebx, %ebp
+; X86-NEXT: sbbl %ebx, %edi
+; X86-NEXT: sbbl %ebx, %edx
+; X86-NEXT: sbbl %ebx, %esi
+; X86-NEXT: xorl %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl %ebp, %ecx
-; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: orl %eax, %ecx
; X86-NEXT: sete %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -230,91 +231,87 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: sete %al
; X86-NEXT: orb %cl, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: bsrl %edi, %edx
+; X86-NEXT: bsrl %esi, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: bsrl %ebx, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: addl $32, %ecx
-; X86-NEXT: testl %edi, %edi
+; X86-NEXT: testl %esi, %esi
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: bsrl %esi, %edx
+; X86-NEXT: bsrl %edi, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: bsrl %ebp, %ebp
; X86-NEXT: xorl $31, %ebp
; X86-NEXT: addl $32, %ebp
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: testl %esi, %esi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %edx, %ebp
; X86-NEXT: addl $64, %ebp
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %ebx
; X86-NEXT: cmovnel %ecx, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: bsrl %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: bsrl %ebx, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NEXT: bsrl %eax, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: addl $32, %ecx
-; X86-NEXT: testl %edi, %edi
+; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: bsrl %ebx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: bsrl %edi, %esi
; X86-NEXT: xorl $31, %esi
; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: xorl $31, %edx
; X86-NEXT: addl $32, %edx
-; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %esi, %edx
; X86-NEXT: addl $64, %edx
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: orl %edi, %esi
-; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: orl %ebx, %esi
; X86-NEXT: cmovnel %ecx, %edx
; X86-NEXT: xorl %esi, %esi
; X86-NEXT: subl %edx, %ebp
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
; X86-NEXT: movl $0, %edx
; X86-NEXT: sbbl %edx, %edx
; X86-NEXT: movl $0, %eax
; X86-NEXT: sbbl %eax, %eax
-; X86-NEXT: movl $0, %edi
-; X86-NEXT: sbbl %edi, %edi
; X86-NEXT: movl $127, %ecx
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: cmpl %ebp, %ecx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %edi, %ecx
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: movl $0, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %ecx
-; X86-NEXT: movl $0, %ecx
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %edi, %ecx
; X86-NEXT: setb %cl
; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: cmovnel %esi, %edi
-; X86-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NEXT: movl %ebx, %edx
; X86-NEXT: cmovnel %esi, %edx
+; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT: cmovnel %esi, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: cmovnel %esi, %eax
; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: jne .LBB4_1
-; X86-NEXT: # %bb.8: # %_udiv-special-cases
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: xorl $127, %ebp
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jne .LBB4_8
+; X86-NEXT: # %bb.1: # %_udiv-special-cases
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: xorl $127, %edi
+; X86-NEXT: orl %ebp, %edi
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %ebp, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: je .LBB4_9
-; X86-NEXT: # %bb.5: # %udiv-bb1
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: je .LBB4_8
+; X86-NEXT: # %bb.2: # %udiv-bb1
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -344,225 +341,224 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: shll %cl, %edx
; X86-NEXT: notb %cl
; X86-NEXT: movl 140(%esp,%edi), %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: shrl %esi
-; X86-NEXT: shrl %cl, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: movl 136(%esp,%edi), %esi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: shrl %ebx
+; X86-NEXT: shrl %cl, %ebx
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: movl 136(%esp,%edi), %edx
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: shll %cl, %esi
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: jae .LBB4_2
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: jae .LBB4_3
; X86-NEXT: # %bb.6:
-; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: xorl %edi, %edi
; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: jmp .LBB4_7
-; X86-NEXT: .LBB4_1:
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: jmp .LBB4_9
-; X86-NEXT: .LBB4_2: # %udiv-preheader
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: .LBB4_3: # %udiv-preheader
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl (%esp), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movb %bl, %ch
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movb %dl, %ch
; X86-NEXT: andb $7, %ch
-; X86-NEXT: movb %bl, %cl
+; X86-NEXT: movb %dl, %cl
; X86-NEXT: shrb $3, %cl
; X86-NEXT: andb $15, %cl
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl %cl, %ebx
-; X86-NEXT: movl 100(%esp,%ebx), %ebp
-; X86-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; X86-NEXT: movl 96(%esp,%ebx), %edi
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movzbl %cl, %edx
+; X86-NEXT: movl 100(%esp,%edx), %esi
+; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT: movl 96(%esp,%edx), %edi
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %ebp
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrdl %cl, %ebp, %edx
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 88(%esp,%ebx), %edx
-; X86-NEXT: movl 92(%esp,%ebx), %ebx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: shrl %cl, %eax
+; X86-NEXT: shrdl %cl, %esi, %ebp
+; X86-NEXT: movl 88(%esp,%edx), %ebx
+; X86-NEXT: movl 92(%esp,%edx), %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: shrl %cl, %edx
; X86-NEXT: notb %cl
; X86-NEXT: addl %edi, %edi
; X86-NEXT: shll %cl, %edi
-; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl %edx, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movb %ch, %cl
; X86-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
-; X86-NEXT: shrdl %cl, %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: addl $-1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrdl %cl, %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: addl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: .p2align 4, 0x90
-; X86-NEXT: .LBB4_3: # %udiv-do-while
+; X86-NEXT: .LBB4_4: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebp, %edx
+; X86-NEXT: movl %ebp, %esi
; X86-NEXT: shldl $1, %ebp, (%esp) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: shldl $1, %ebp, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl $1, %ebx, %ebp
-; X86-NEXT: shldl $1, %edi, %ebx
+; X86-NEXT: shldl $1, %ebp, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: shldl $1, %ecx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: orl %ebx, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl $1, %edi, %ecx
-; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: orl %ebx, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl $1, %esi, %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %esi, %esi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: sbbl %ebp, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: sbbl %esi, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload
; X86-NEXT: sarl $31, %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl $1, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, %edi
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: subl %ecx, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: subl %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: sbbl %eax, %ebp
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %edi, %edx
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: sbbl %esi, (%esp) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: sbbl %edi, %esi
+; X86-NEXT: movl %esi, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %ebx, (%esp) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl $-1, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl $-1, %edi
; X86-NEXT: adcl $-1, %edx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl $-1, %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %edi
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %ecx
; X86-NEXT: orl %edi, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: jne .LBB4_3
-; X86-NEXT: # %bb.4:
+; X86-NEXT: jne .LBB4_4
+; X86-NEXT: # %bb.5:
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: .LBB4_7: # %udiv-loop-exit
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: .LBB4_7: # %udiv-loop-exit
+; X86-NEXT: shldl $1, %ebx, %edx
; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: shldl $1, %eax, %ebx
+; X86-NEXT: orl %ecx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: shldl $1, %esi, %eax
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: addl %esi, %esi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: .LBB4_9: # %udiv-end
-; X86-NEXT: xorl %ebx, %edi
-; X86-NEXT: xorl %ebx, %edx
-; X86-NEXT: xorl %ebx, %eax
-; X86-NEXT: xorl %ebx, %esi
-; X86-NEXT: subl %ebx, %esi
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: .LBB4_8: # %udiv-end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, %edx
+; X86-NEXT: xorl %ecx, %ebx
+; X86-NEXT: xorl %ecx, %eax
+; X86-NEXT: xorl %ecx, %esi
+; X86-NEXT: subl %ecx, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: sbbl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %ebx, %edx
-; X86-NEXT: sbbl %ebx, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, (%ebp)
-; X86-NEXT: movl %eax, 4(%ebp)
-; X86-NEXT: movl %edx, 8(%ebp)
-; X86-NEXT: movl %edi, 12(%ebp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: sbbl %ecx, %ebx
+; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %esi, (%ecx)
+; X86-NEXT: movl %eax, 4(%ecx)
+; X86-NEXT: movl %ebx, 8(%ecx)
+; X86-NEXT: movl %edx, 12(%ecx)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: imull %eax, %ecx
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: imull {{[0-9]+}}(%esp), %edi
+; X86-NEXT: addl %edx, %edi
+; X86-NEXT: addl %ecx, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: imull %esi, %ecx
@@ -572,7 +568,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: addl %edx, %ebp
; X86-NEXT: addl %ecx, %ebp
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl %ebx, %ebp
+; X86-NEXT: adcl %edi, %ebp
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index b212e9438e1b..c79da37988e4 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -459,8 +459,7 @@ define i32 @freeze_ashr(i32 %a0) nounwind {
; X64-LABEL: freeze_ashr:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: sarl $3, %eax
-; X64-NEXT: sarl $3, %eax
+; X64-NEXT: sarl $6, %eax
; X64-NEXT: retq
%x = ashr i32 %a0, 3
%y = freeze i32 %x
@@ -531,30 +530,12 @@ define i32 @freeze_ashr_outofrange(i32 %a0) nounwind {
define <8 x i16> @freeze_ashr_vec(<8 x i16> %a0) nounwind {
; X86-LABEL: freeze_ashr_vec:
; X86: # %bb.0:
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: psraw $1, %xmm2
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
-; X86-NEXT: movdqa %xmm1, %xmm3
-; X86-NEXT: pandn %xmm2, %xmm3
-; X86-NEXT: psraw $3, %xmm0
-; X86-NEXT: pand %xmm1, %xmm0
-; X86-NEXT: por %xmm3, %xmm0
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: psraw $3, %xmm2
-; X86-NEXT: psraw $1, %xmm0
-; X86-NEXT: pand %xmm1, %xmm0
-; X86-NEXT: pandn %xmm2, %xmm1
-; X86-NEXT: por %xmm1, %xmm0
+; X86-NEXT: psraw $4, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: freeze_ashr_vec:
; X64: # %bb.0:
-; X64-NEXT: vpsraw $1, %xmm0, %xmm1
-; X64-NEXT: vpsraw $3, %xmm0, %xmm0
-; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; X64-NEXT: vpsraw $3, %xmm0, %xmm1
-; X64-NEXT: vpsraw $1, %xmm0, %xmm0
-; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; X64-NEXT: vpsraw $4, %xmm0, %xmm0
; X64-NEXT: retq
%x = ashr <8 x i16> %a0, <i16 3, i16 1, i16 3, i16 1, i16 3, i16 1, i16 3, i16 1>
%y = freeze <8 x i16> %x
@@ -592,8 +573,7 @@ define i32 @freeze_lshr(i32 %a0) nounwind {
; X64-LABEL: freeze_lshr:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shrl $2, %eax
-; X64-NEXT: shrl %eax
+; X64-NEXT: shrl $3, %eax
; X64-NEXT: retq
%x = lshr i32 %a0, 2
%y = freeze i32 %x
@@ -664,30 +644,12 @@ define i32 @freeze_lshr_outofrange(i32 %a0) nounwind {
define <8 x i16> @freeze_lshr_vec(<8 x i16> %a0) nounwind {
; X86-LABEL: freeze_lshr_vec:
; X86: # %bb.0:
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: psrlw $1, %xmm2
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
-; X86-NEXT: movdqa %xmm1, %xmm3
-; X86-NEXT: pandn %xmm2, %xmm3
-; X86-NEXT: psrlw $2, %xmm0
-; X86-NEXT: pand %xmm1, %xmm0
-; X86-NEXT: por %xmm3, %xmm0
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: psrlw $2, %xmm2
-; X86-NEXT: psrlw $1, %xmm0
-; X86-NEXT: pand %xmm1, %xmm0
-; X86-NEXT: pandn %xmm2, %xmm1
-; X86-NEXT: por %xmm1, %xmm0
+; X86-NEXT: psrlw $3, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: freeze_lshr_vec:
; X64: # %bb.0:
-; X64-NEXT: vpsrlw $1, %xmm0, %xmm1
-; X64-NEXT: vpsrlw $2, %xmm0, %xmm0
-; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; X64-NEXT: vpsrlw $2, %xmm0, %xmm1
-; X64-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; X64-NEXT: vpsrlw $3, %xmm0, %xmm0
; X64-NEXT: retq
%x = lshr <8 x i16> %a0, <i16 2, i16 1, i16 2, i16 1, i16 2, i16 1, i16 2, i16 1>
%y = freeze <8 x i16> %x
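The freeze-binary.ll updates all follow one pattern: a freeze sitting between two constant shifts no longer blocks the shift combine. A plausible reading of the new checks (informal sketch, not part of the patch):

; freeze of a shift by an in-range constant can be pushed onto the operand:
;   freeze (ashr %x, 3)   ->   ashr (freeze %x), 3
; after which adjacent constant shifts accumulate:
;   ashr (ashr %x, 3), 3  ->   ashr %x, 6
; hence the scalar cases collapse to a single sarl $6 / shrl $3, and the
; per-lane vector shift amounts sum to a uniform psraw $4 / psrlw $3.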
diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
index ee7f4aea02c0..fe240286462e 100644
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -672,3 +672,23 @@ define void @pr59677(i32 %x, ptr %out) nounwind {
ret void
}
declare <4 x float> @llvm.sin.v4f32(<4 x float>)
+
+; Test that we can eliminate freeze by changing the BUILD_VECTOR to a splat
+; zero vector.
+define void @freeze_buildvector_not_simple_type(ptr %dst) nounwind {
+; X86-LABEL: freeze_buildvector_not_simple_type:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb $0, 4(%eax)
+; X86-NEXT: movl $0, (%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: freeze_buildvector_not_simple_type:
+; X64: # %bb.0:
+; X64-NEXT: movb $0, 4(%rdi)
+; X64-NEXT: movl $0, (%rdi)
+; X64-NEXT: retq
+ %i0 = freeze <5 x i8> <i8 poison, i8 0, i8 0, i8 undef, i8 0>
+ store <5 x i8> %i0, ptr %dst
+ ret void
+}
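The comment above states the intent; the mechanics fit in one line. Freezing a vector with poison/undef lanes may pick any value for those lanes, and picking zero everywhere yields a splat (informal sketch):

;   freeze <5 x i8> <i8 poison, i8 0, i8 0, i8 undef, i8 0>  ->  <5 x i8> zeroinitializer
; which is why both RUN configurations expect a plain dword store plus a byte store.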
diff --git a/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll
index ae1320f8b086..200a8184d4bd 100644
--- a/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefixes=X86,X86-NOSSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2
; This tests codegen time inlining/optimization of memcmp
; rdar://6480398
diff --git a/llvm/test/CodeGen/X86/memcmp-minsize.ll b/llvm/test/CodeGen/X86/memcmp-minsize.ll
index 544d1c49f26b..9c20f3e0cdef 100644
--- a/llvm/test/CodeGen/X86/memcmp-minsize.ll
+++ b/llvm/test/CodeGen/X86/memcmp-minsize.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64,X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
; This tests codegen time inlining/optimization of memcmp
; rdar://6480398
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
index 762691151f4b..3db6ae8b76b2 100644
--- a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefixes=X86,X86-NOSSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2
; This tests codegen time inlining/optimization of memcmp
; rdar://6480398
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll
index c0c7b98d471c..edd61641ad2a 100644
--- a/llvm/test/CodeGen/X86/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/X86/memcmp-optsize.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64,X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
; This tests codegen time inlining/optimization of memcmp
; rdar://6480398
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
index cb45fd3ebb90..1c301da26bea 100644
--- a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefixes=X86,X86-NOSSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2
; This tests codegen time inlining/optimization of memcmp
; rdar://6480398
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll
index 720344a22e43..1ee3317b9c96 100644
--- a/llvm/test/CodeGen/X86/memcmp-pgso.ll
+++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64,X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
; This tests codegen time inlining/optimization of memcmp
; rdar://6480398
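The RUN-line churn in the memcmp tests is purely mechanical: FileCheck accepts a comma-separated list via --check-prefixes, which is equivalent to repeating --check-prefix. For example, these two invocations check the same prefixes:

; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64,X64-SSE2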
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
index 2fdf6ef224ca..366dad1612b4 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
@@ -684,22 +684,21 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin
; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5
-; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5
; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5
+; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2
-; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
@@ -715,22 +714,21 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
@@ -772,20 +770,19 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw
; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm6
; AVX512F-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7
; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5
-; AVX512F-NEXT: vpmaxub %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpmaxub %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpandq %zmm6, %zmm4, %zmm4
-; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2
-; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
@@ -803,20 +800,19 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw
; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm6
; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm2, %ymm3, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm4, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
diff --git a/llvm/test/CodeGen/X86/named-vector-shuffle-reverse.ll b/llvm/test/CodeGen/X86/named-vector-shuffle-reverse.ll
index f0917be88744..2a5e834f0ac7 100644
--- a/llvm/test/CodeGen/X86/named-vector-shuffle-reverse.ll
+++ b/llvm/test/CodeGen/X86/named-vector-shuffle-reverse.ll
@@ -23,7 +23,7 @@ define <16 x i8> @reverse_v16i8(<16 x i8> %a) #0 {
; CHECK-NEXT: packuswb %xmm2, %xmm0
; CHECK-NEXT: retq
- %res = call <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8> %a)
+ %res = call <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8> %a)
ret <16 x i8> %res
}
@@ -34,7 +34,7 @@ define <8 x i16> @reverse_v8i16(<8 x i16> %a) #0 {
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; CHECK-NEXT: retq
- %res = call <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16> %a)
+ %res = call <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16> %a)
ret <8 x i16> %res
}
@@ -43,7 +43,7 @@ define <4 x i32> @reverse_v4i32(<4 x i32> %a) #0 {
; CHECK: # %bb.0:
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; CHECK-NEXT: retq
- %res = call <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32> %a)
+ %res = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> %a)
ret <4 x i32> %res
}
@@ -52,7 +52,7 @@ define <2 x i64> @reverse_v2i64(<2 x i64> %a) #0 {
; CHECK: # %bb.0:
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64> %a)
+ %res = call <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64> %a)
ret <2 x i64> %res
}
@@ -61,7 +61,7 @@ define <4 x float> @reverse_v4f32(<4 x float> %a) #0 {
; CHECK: # %bb.0:
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; CHECK-NEXT: retq
- %res = call <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float> %a)
+ %res = call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> %a)
ret <4 x float> %res
}
@@ -70,7 +70,7 @@ define <2 x double> @reverse_v2f64(<2 x double> %a) #0 {
; CHECK: # %bb.0:
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; CHECK-NEXT: retq
- %res = call <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double> %a)
+ %res = call <2 x double> @llvm.vector.reverse.v2f64(<2 x double> %a)
ret <2 x double> %res
}
@@ -83,7 +83,7 @@ define <2 x i8> @reverse_v2i8(<2 x i8> %a) #0 {
; CHECK-NEXT: psllw $8, %xmm0
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: retq
- %res = call <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8> %a)
+ %res = call <2 x i8> @llvm.vector.reverse.v2i8(<2 x i8> %a)
ret <2 x i8> %res
}
@@ -95,7 +95,7 @@ define <8 x i32> @reverse_v8i32(<8 x i32> %a) #0 {
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,2,1,0]
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: retq
- %res = call <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32> %a)
+ %res = call <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32> %a)
ret <8 x i32> %res
}
@@ -115,20 +115,20 @@ define <16 x float> @reverse_v16f32(<16 x float> %a) #0 {
; CHECK-NEXT: movaps %xmm5, %xmm3
; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.experimental.vector.reverse.v16f32(<16 x float> %a)
+ %res = call <16 x float> @llvm.vector.reverse.v16f32(<16 x float> %a)
ret <16 x float> %res
}
-declare <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8>)
-declare <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8>)
-declare <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16>)
-declare <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32>)
-declare <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32>)
-declare <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64>)
-declare <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half>)
-declare <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float>)
-declare <16 x float> @llvm.experimental.vector.reverse.v16f32(<16 x float>)
-declare <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double>)
+declare <2 x i8> @llvm.vector.reverse.v2i8(<2 x i8>)
+declare <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8>)
+declare <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16>)
+declare <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32>)
+declare <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32>)
+declare <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64>)
+declare <8 x half> @llvm.vector.reverse.v8f16(<8 x half>)
+declare <4 x float> @llvm.vector.reverse.v4f32(<4 x float>)
+declare <16 x float> @llvm.vector.reverse.v16f32(<16 x float>)
+declare <2 x double> @llvm.vector.reverse.v2f64(<2 x double>)
attributes #0 = { nounwind }
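This file only tracks an intrinsic rename: llvm.experimental.vector.reverse.* graduated to llvm.vector.reverse.*. The overloaded signature is unchanged, so every call site and declaration updates one-for-one:

declare <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32>)  ; formerly @llvm.experimental.vector.reverse.v4i32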
diff --git a/llvm/test/MC/AMDGPU/ds-err.s b/llvm/test/MC/AMDGPU/ds-err.s
index 2d25fdf5e302..c31f4c759395 100644
--- a/llvm/test/MC/AMDGPU/ds-err.s
+++ b/llvm/test/MC/AMDGPU/ds-err.s
@@ -18,19 +18,19 @@ ds_write2_b32 v2, v4, v6 offset0:4 offset0:8
ds_write2_b32 v2, v4, v6 offset1:4 offset1:8
// offset0 too big
-// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid offset0 value.
ds_write2_b32 v2, v4, v6 offset0:1000000000
// offset0 too big
-// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid offset0 value.
ds_write2_b32 v2, v4, v6 offset0:0x100
// offset1 too big
-// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid offset1 value.
ds_write2_b32 v2, v4, v6 offset1:1000000000
// offset1 too big
-// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid offset1 value.
ds_write2_b32 v2, v4, v6 offset1:0x100
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_err.s
index 3ec31626be5b..7f99afe01925 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_err.s
@@ -22,13 +22,13 @@ s_delay_alu instid0(VALU_DEP_1) | SALU_CYCLE_1)
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: expected a left parenthesis
lds_direct_load v15 wait_vdst:16
-// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid wait_vdst value.
lds_direct_load v15 wait_vdst
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
v_interp_p10_f32 v0, v1, v2, v3 wait_exp:8
-// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid wait_exp value.
v_interp_p2_f32 v0, -v1, v2, v3 wait_exp
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s
index 31ed577ac0a2..a9dd290ea67d 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s
@@ -103,6 +103,6 @@ v_permlane16_var_b32 v5, v1, v2 op_sel:[0, 0, 1]
// GFX12-NEXT:{{^}} ^
v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:4
-// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid byte_sel value.
// GFX12-NEXT:{{^}}v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:4
// GFX12-NEXT:{{^}} ^
diff --git a/llvm/test/MC/AMDGPU/pal-msgpack.s b/llvm/test/MC/AMDGPU/pal-msgpack.s
index 886cc8b0538b..03c6c547af8a 100644
--- a/llvm/test/MC/AMDGPU/pal-msgpack.s
+++ b/llvm/test/MC/AMDGPU/pal-msgpack.s
@@ -14,10 +14,10 @@ amdpal.pipelines:
- 0x123456789abcdef0
- 0xfedcba9876543210
.registers:
- 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0
- 0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42000000
- 0xa1b3 (SPI_PS_INPUT_ENA): 0x1
- 0xa1b4 (SPI_PS_INPUT_ADDR): 0x1
+ '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0
+ '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42000000
+ '0xa1b3 (SPI_PS_INPUT_ENA)': 0x1
+ '0xa1b4 (SPI_PS_INPUT_ADDR)': 0x1
...
.end_amdgpu_pal_metadata
@@ -34,10 +34,10 @@ amdpal.pipelines:
// ASM: - 0x123456789abcdef0
// ASM: - 0xfedcba9876543210
// ASM: .registers:
-// ASM: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0
-// ASM: 0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42000000
-// ASM: 0xa1b3 (SPI_PS_INPUT_ENA): 0x1
-// ASM: 0xa1b4 (SPI_PS_INPUT_ADDR): 0x1
+// ASM: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0
+// ASM: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42000000
+// ASM: '0xa1b3 (SPI_PS_INPUT_ENA)': 0x1
+// ASM: '0xa1b4 (SPI_PS_INPUT_ADDR)': 0x1
// ASM: ...
// ASM: .end_amdgpu_pal_metadata
diff --git a/llvm/test/MC/RISCV/large-instructions.s b/llvm/test/MC/RISCV/large-instructions.s
new file mode 100644
index 000000000000..b50dbde17d38
--- /dev/null
+++ b/llvm/test/MC/RISCV/large-instructions.s
@@ -0,0 +1,29 @@
+# RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
+# RUN: | llvm-objdump -d - | FileCheck %s
+
+# CHECK: 011f 4523 8967 <unknown>
+.byte 0x1f, 0x01, 0x23, 0x45, 0x67, 0x89
+
+# CHECK: 4523013f cdab8967 <unknown>
+.byte 0x3f, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd
+
+# CHECK: 007f 4523 8967 cdab feef <unknown>
+.byte 0x7f, 0x00, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe
+
+# CHECK: 4523107f cdab8967 badcfeef <unknown>
+.byte 0x7f, 0x10, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba
+
+# CHECK: 207f 4523 8967 cdab feef badc 7698 <unknown>
+.byte 0x7f, 0x20, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76
+
+# CHECK: 4523307f cdab8967 badcfeef 32547698 <unknown>
+.byte 0x7f, 0x30, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32
+
+# CHECK: 407f 4523 8967 cdab feef badc 7698 3254 1210 <unknown>
+.byte 0x7f, 0x40, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, 0x12
+
+# CHECK: 4523507f cdab8967 badcfeef 32547698 56341210 <unknown>
+.byte 0x7f, 0x50, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, 0x12, 0x34, 0x56
+
+# CHECK: 607f 4523 8967 cdab feef badc 7698 3254 1210 5634 9a78 <unknown>
+.byte 0x7f, 0x60, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, 0x12, 0x34, 0x56, 0x78, 0x9a
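The expected lengths in this new test follow the RISC-V variable-length encoding scheme, determined entirely by the low bits of the first 16-bit parcel (informative summary, not part of the test):

# bits [1:0] != 11                           -> 16-bit instruction
# bits [1:0] == 11 and bits [4:2] != 111     -> 32-bit
# low byte 0x1f   (bits [5:0] == 011111)     -> 48-bit
# low byte 0x3f   (bits [6:0] == 0111111)    -> 64-bit
# bits [6:0] == 1111111, nnn = bits [14:12]  -> (80 + 16*nnn)-bit
# e.g. the 0x7f, 0x20 line has nnn = 2, i.e. 112 bits = seven 16-bit parcels.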
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index 51fb93daa4df..ebfed7b687e2 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -224,12 +224,14 @@
; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Running analysis: ShouldNotRunFunctionPassesAnalysis
-; CHECK-O-NEXT: Running pass: CoroSplitPass
+; CHECK-DEFAULT-NEXT: Running pass: CoroSplitPass
+; CHECK-LTO-NOT: Running pass: CoroSplitPass
; CHECK-O-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Invalidating analysis: ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Invalidating analysis: InlineAdvisorAnalysis
; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass
-; CHECK-O-NEXT: Running pass: CoroCleanupPass
+; CHECK-DEFAULT-NEXT: Running pass: CoroCleanupPass
+; CHECK-LTO-NOT: Running pass: CoroCleanupPass
; CHECK-O-NEXT: Running pass: GlobalOptPass
; CHECK-O-NEXT: Running pass: GlobalDCEPass
; CHECK-DEFAULT-NEXT: Running pass: EliminateAvailableExternallyPass
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
index 6486639e07b4..e2fd74306f80 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
@@ -183,12 +183,10 @@
; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Running analysis: ShouldNotRunFunctionPassesAnalysis
-; CHECK-O-NEXT: Running pass: CoroSplitPass
; CHECK-O-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Invalidating analysis: ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Invalidating analysis: InlineAdvisorAnalysis
; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass
-; CHECK-O-NEXT: Running pass: CoroCleanupPass
; CHECK-O-NEXT: Running pass: GlobalOptPass
; CHECK-O-NEXT: Running pass: GlobalDCEPass
; CHECK-EXT: Running pass: {{.*}}::Bye
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
index 09f9f0f48bad..13a63bbe4d9c 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
@@ -182,12 +182,10 @@
; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Running analysis: ShouldNotRunFunctionPassesAnalysis
-; CHECK-O-NEXT: Running pass: CoroSplitPass
; CHECK-O-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Invalidating analysis: ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Invalidating analysis: InlineAdvisorAnalysis
; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass
-; CHECK-O-NEXT: Running pass: CoroCleanupPass
; CHECK-O-NEXT: Running pass: GlobalOptPass
; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis on bar
; CHECK-O-NEXT: Running pass: GlobalDCEPass
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
index 47bdbfd2d357..3130da86fa99 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
@@ -147,12 +147,10 @@
; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Running analysis: ShouldNotRunFunctionPassesAnalysis
-; CHECK-O-NEXT: Running pass: CoroSplitPass
; CHECK-O-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Invalidating analysis: ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Invalidating analysis: InlineAdvisorAnalysis
; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass
-; CHECK-O-NEXT: Running pass: CoroCleanupPass
; CHECK-O-NEXT: Running pass: GlobalOptPass
; CHECK-O-NEXT: Running pass: GlobalDCEPass
; CHECK-O-NEXT: Running pass: AnnotationRemarksPass on foo
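Across these new-pm pipeline tests the pattern is consistent: the default pipeline still runs the coroutine lowering passes, while the (Thin)LTO prelink pipelines no longer do, presumably deferring coroutine splitting until after link-time inlining. new-pm-defaults.ll encodes both sides explicitly:

; CHECK-DEFAULT-NEXT: Running pass: CoroSplitPass
; CHECK-LTO-NOT: Running pass: CoroSplitPass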
diff --git a/llvm/test/TableGen/GlobalISelEmitter-frameindex.td b/llvm/test/TableGen/GlobalISelEmitter-frameindex.td
new file mode 100644
index 000000000000..232691465bb3
--- /dev/null
+++ b/llvm/test/TableGen/GlobalISelEmitter-frameindex.td
@@ -0,0 +1,29 @@
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common %s -o - < %s | FileCheck %s
+
+include "llvm/Target/Target.td"
+include "GlobalISelEmitterCommon.td"
+
+def ADD : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2), []>;
+
+//===- Test a simple pattern with frame index operands. ----------------------===//
+//
+// CHECK: GIM_Try, /*On fail goto*//*Label [[LABEL_NUM:[0-9]+]]*/ GIMT_Encode4([[LABEL:[0-9]+]]),
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_FRAME_INDEX),
+// CHECK-NEXT: // MIs[0] DstI[dst]
+// CHECK-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_p0s32,
+// CHECK-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
+// CHECK-NEXT: // MIs[0] fi
+// CHECK-NEXT: // No operand predicates
+// CHECK-NEXT: // (frameindex:{ *:[i32] }):$fi => (ADD:{ *:[i32] } (tframeindex:{ *:[i32] }):$fi, 0:{ *:[i32] })
+// CHECK-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::ADD),
+// CHECK-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // DstI[dst]
+// CHECK-NEXT: GIR_RootToRootCopy, /*OpIdx*/1, // fi
+// CHECK-NEXT: GIR_AddImm8, /*InsnID*/0, /*Imm*/0,
+// CHECK-NEXT: GIR_RootConstrainSelectedInstOperands,
+// CHECK-NEXT: // GIR_Coverage, 0,
+// CHECK-NEXT: GIR_EraseRootFromParent_Done,
+// CHECK-NEXT: // Label [[LABEL_NUM]]: @[[LABEL]]
+// CHECK-NEXT: GIM_Reject,
+
+def : Pat<(p0 frameindex:$fi), (ADD tframeindex:$fi, 0)>;
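Reading the pattern together with the emitted table: a p0-typed G_FRAME_INDEX is selected as an ADD whose first operand is the copied frame-index operand and whose second is a literal zero, i.e. the GIR_RootToRootCopy /*OpIdx*/1 followed by GIR_AddImm8 /*Imm*/0 steps above (selected-MI sketch for the test's stand-in MyTarget):

// %dst:gpr32 = ADD <frame-index operand>, 0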
diff --git a/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll b/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll
index ff5cef7e781f..25dfb3c53a07 100644
--- a/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll
@@ -211,6 +211,29 @@ else:
ret i32 %l
}
+define i32 @sub10_else_drop_nuw(i32 %a) {
+; CHECK-LABEL: @sub10_else_drop_nuw(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[L:%.*]] = sub i32 [[A:%.*]], 10
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[L]], 0
+; CHECK-NEXT: br i1 [[TMP0]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK: then:
+; CHECK-NEXT: ret i32 0
+; CHECK: else:
+; CHECK-NEXT: ret i32 [[L]]
+;
+entry:
+ %c = icmp eq i32 %a, 10
+ br i1 %c, label %then, label %else
+
+then:
+ ret i32 0
+
+else:
+ %l = sub nuw i32 %a, 10
+ ret i32 %l
+}
+
define i32 @subm10_then(i32 %a) {
; CHECK-LABEL: @subm10_then(
; CHECK-NEXT: entry:
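The new sub10_else_drop_nuw case documents a flag-safety detail of the branch-on-zero rewrite: the subtraction is computed before the branch, so it now also executes when %a is less than 10, where sub nuw would be poison, and the flag must be dropped (informal before/after):

;   %c = icmp eq i32 %a, 10           ; branch, then %l = sub nuw i32 %a, 10 on the else path
; becomes
;   %l = sub i32 %a, 10               ; nuw dropped; may wrap when %a < 10
;   %c = icmp eq i32 %l, 0            ; branch on the reused subtraction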
diff --git a/llvm/test/Transforms/CodeGenPrepare/RISCV/convert-to-eqz.ll b/llvm/test/Transforms/CodeGenPrepare/RISCV/convert-to-eqz.ll
new file mode 100644
index 000000000000..a6909d149134
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/RISCV/convert-to-eqz.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -codegenprepare -S -mtriple=riscv64 < %s | FileCheck %s
+
+define i8 @hoist_add(i8 %x) {
+; CHECK-LABEL: define i8 @hoist_add(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INC:%.*]] = add i8 [[X]], 1
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i8 [[INC]], 0
+; CHECK-NEXT: br i1 [[TMP0]], label [[EXIT:%.*]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i8 [ [[INC]], [[IF_THEN]] ], [ -1, [[ENTRY:%.*]] ]
+; CHECK-NEXT: ret i8 [[RETVAL]]
+;
+entry:
+ %cmp = icmp eq i8 %x, -1
+ br i1 %cmp, label %exit, label %if.then
+
+if.then:
+ %inc = add nuw nsw i8 %x, 1
+ br label %exit
+
+exit:
+ %retval = phi i8 [ %inc, %if.then ], [ -1, %entry ]
+ ret i8 %retval
+}
+
+define i8 @hoist_lshr(i8 %x) {
+; CHECK-LABEL: define i8 @hoist_lshr(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INC:%.*]] = lshr i8 [[X]], 3
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i8 [[INC]], 0
+; CHECK-NEXT: br i1 [[TMP0]], label [[EXIT:%.*]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i8 [ [[INC]], [[IF_THEN]] ], [ -1, [[ENTRY:%.*]] ]
+; CHECK-NEXT: ret i8 [[RETVAL]]
+;
+entry:
+ %cmp = icmp ult i8 %x, 8
+ br i1 %cmp, label %exit, label %if.then
+
+if.then:
+ %inc = lshr exact i8 %x, 3
+ br label %exit
+
+exit:
+ %retval = phi i8 [ %inc, %if.then ], [ -1, %entry ]
+ ret i8 %retval
+}
+
+define i8 @nomove_add(i8 %x) {
+; CHECK-LABEL: define i8 @nomove_add(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INC:%.*]] = add i8 [[X]], 1
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i8 [[INC]], 0
+; CHECK-NEXT: br i1 [[TMP0]], label [[EXIT:%.*]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i8 [ [[INC]], [[IF_THEN]] ], [ -1, [[ENTRY:%.*]] ]
+; CHECK-NEXT: ret i8 [[RETVAL]]
+;
+entry:
+ %inc = add nuw nsw i8 %x, 1
+ %cmp = icmp eq i8 %x, -1
+ br i1 %cmp, label %exit, label %if.then
+
+if.then:
+ br label %exit
+
+exit:
+ %retval = phi i8 [ %inc, %if.then ], [ -1, %entry ]
+ ret i8 %retval
+}
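All three cases in this new RISC-V test rest on small modular identities, which is what lets CodeGenPrepare hoist the arithmetic and branch on a zero result (a single beqz/bnez on RISC-V):

;   x == -1  <=>  x + 1 == 0          (hoist_add, arithmetic mod 2^8)
;   x u< 8   <=>  (x lshr 3) == 0     (hoist_lshr)
; nomove_add starts with the add already above the branch, so only the compare
; needs rewriting into the equivalent test against zero.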
diff --git a/llvm/test/Transforms/GlobalOpt/basictest.ll b/llvm/test/Transforms/GlobalOpt/basictest.ll
index 6d7fcdd96dfd..72d38a1e8845 100644
--- a/llvm/test/Transforms/GlobalOpt/basictest.ll
+++ b/llvm/test/Transforms/GlobalOpt/basictest.ll
@@ -1,9 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt < %s -passes=globalopt -S | FileCheck %s
-; CHECK-NOT: global
@X = internal global i32 4 ; <ptr> [#uses=1]
define i32 @foo() {
- %V = load i32, ptr @X ; <i32> [#uses=1]
- ret i32 %V
+; CHECK-LABEL: define i32 @foo() local_unnamed_addr {
+; CHECK-NEXT: ret i32 4
+;
+ %V = load i32, ptr @X ; <i32> [#uses=1]
+ ret i32 %V
+}
+
+@X_tls = internal thread_local global i32 13
+
+define i32 @bar() {
+; CHECK-LABEL: define i32 @bar() local_unnamed_addr {
+; CHECK-NEXT: ret i32 13
+;
+ %p = call ptr @llvm.threadlocal.address(ptr @X_tls)
+ %v = load i32, ptr %p
+ ret i32 %v
}
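The GlobalOpt updates switch TLS accesses over to the llvm.threadlocal.address intrinsic, which is how IR now takes the current thread's address of a thread_local global; GlobalOpt looks through it, so @X_tls above still constant-folds to ret i32 13. The intrinsic is overloaded on the pointer type:

declare ptr @llvm.threadlocal.address.p0(ptr)
; %p = call ptr @llvm.threadlocal.address(ptr @X_tls)   ; @X_tls's address in the calling thread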
diff --git a/llvm/test/Transforms/GlobalOpt/constantfold-initializers.ll b/llvm/test/Transforms/GlobalOpt/constantfold-initializers.ll
index ca844f63937c..f82942e73d92 100644
--- a/llvm/test/Transforms/GlobalOpt/constantfold-initializers.ll
+++ b/llvm/test/Transforms/GlobalOpt/constantfold-initializers.ll
@@ -72,11 +72,12 @@ entry:
}
@threadlocalptr = global ptr null, align 4
-; CHECK: @threadlocalptr = global ptr null, align 4
+; CHECK: @threadlocalptr = local_unnamed_addr global ptr null, align 4
@threadlocalvar = external thread_local global i32
define internal void @test5() {
entry:
- store ptr @threadlocalvar, ptr @threadlocalptr, align 4
+ %p = call ptr @llvm.threadlocal.address(ptr @threadlocalvar)
+ store ptr %p, ptr @threadlocalptr, align 4
ret void
}
diff --git a/llvm/test/Transforms/GlobalOpt/stored-once-forward-value.ll b/llvm/test/Transforms/GlobalOpt/stored-once-forward-value.ll
index 7b845070bbd0..2b7ceb4169f3 100644
--- a/llvm/test/Transforms/GlobalOpt/stored-once-forward-value.ll
+++ b/llvm/test/Transforms/GlobalOpt/stored-once-forward-value.ll
@@ -39,12 +39,14 @@ define i32 @dom_arg(i32 %a) {
define ptr @dom_thread_local_global() {
; CHECK-LABEL: @dom_thread_local_global(
-; CHECK-NEXT: store ptr @tl, ptr @g3, align 8
+; CHECK-NEXT: [[P:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @tl)
+; CHECK-NEXT: store ptr [[P]], ptr @g3, align 8
; CHECK-NEXT: call void @b()
; CHECK-NEXT: [[R:%.*]] = load ptr, ptr @g3, align 8
; CHECK-NEXT: ret ptr [[R]]
;
- store ptr @tl, ptr @g3
+ %p = call ptr @llvm.threadlocal.address(ptr @tl)
+ store ptr %p, ptr @g3
call void @b()
%r = load ptr, ptr @g3
ret ptr %r
diff --git a/llvm/test/Transforms/GlobalOpt/tls.ll b/llvm/test/Transforms/GlobalOpt/tls.ll
index 6ba003ff30b2..2cc2ea4e366e 100644
--- a/llvm/test/Transforms/GlobalOpt/tls.ll
+++ b/llvm/test/Transforms/GlobalOpt/tls.ll
@@ -15,14 +15,16 @@ declare void @start_thread(ptr)
define i32 @f() {
entry:
; Set @ip to point to x[1] for thread 1.
- store ptr getelementptr inbounds ([100 x i32], ptr @x, i64 0, i64 1), ptr @ip, align 8
+ %p = call ptr @llvm.threadlocal.address(ptr @x)
+ %addr = getelementptr inbounds [100 x i32], ptr %p, i64 0, i64 1
+ store ptr %addr, ptr @ip, align 8
; Run g on a new thread.
tail call void @start_thread(ptr @g) nounwind
tail call void @wait() nounwind
; Reset x[1] for thread 1.
- store i32 0, ptr getelementptr inbounds ([100 x i32], ptr @x, i64 0, i64 1), align 4
+ store i32 0, ptr %addr, align 4
; Read the value of @ip, which now points at x[1] for thread 2.
%0 = load ptr, ptr @ip, align 8
@@ -39,10 +41,12 @@ entry:
define internal void @g() nounwind uwtable {
entry:
; Set @ip to point to x[1] for thread 2.
- store ptr getelementptr inbounds ([100 x i32], ptr @x, i64 0, i64 1), ptr @ip, align 8
+ %p = call ptr @llvm.threadlocal.address(ptr @x)
+ %addr = getelementptr inbounds [100 x i32], ptr %p, i64 0, i64 1
+ store ptr %addr, ptr @ip, align 8
; Store 50 in x[1] for thread 2.
- store i32 50, ptr getelementptr inbounds ([100 x i32], ptr @x, i64 0, i64 1), align 4
+ store i32 50, ptr %addr, align 4
tail call void @signal() nounwind
ret void
diff --git a/llvm/test/Transforms/InstCombine/array.ll b/llvm/test/Transforms/InstCombine/array.ll
index f439d4da6080..4f4ae17bebc5 100644
--- a/llvm/test/Transforms/InstCombine/array.ll
+++ b/llvm/test/Transforms/InstCombine/array.ll
@@ -116,8 +116,8 @@ define ptr @gep_inbounds_add_nsw_nonneg(ptr %ptr, i64 %a, i64 %b) {
; CHECK-NEXT: call void @llvm.assume(i1 [[A_NNEG]])
; CHECK-NEXT: [[B_NNEG:%.*]] = icmp sgt i64 [[B]], -1
; CHECK-NEXT: call void @llvm.assume(i1 [[B_NNEG]])
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[A]]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[TMP1]], i64 [[B]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[A]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[B]]
; CHECK-NEXT: ret ptr [[GEP]]
;
%a.nneg = icmp sgt i64 %a, -1
@@ -207,8 +207,8 @@ define ptr @gep_inbounds_sext_add_nonneg(ptr %ptr, i32 %a) {
; CHECK-NEXT: [[A_NNEG:%.*]] = icmp sgt i32 [[A]], -1
; CHECK-NEXT: call void @llvm.assume(i1 [[A_NNEG]])
; CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[A]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP1]]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[TMP2]], i64 40
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[TMP1]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 40
; CHECK-NEXT: ret ptr [[GEP]]
;
%a.nneg = icmp sgt i32 %a, -1
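The array.ll change strengthens GEP splitting: when both offsets are known non-negative (here via llvm.assume) and the combined add nsw index was inbounds, every prefix of the offset sum also stays within the object, so each decomposed GEP may keep inbounds (informal sketch):

;   getelementptr inbounds i32, ptr %p, i64 (add nsw %a, %b)   ; %a >= 0, %b >= 0
; ->
;   %t = getelementptr inbounds i32, ptr %p, i64 %a
;        getelementptr inbounds i32, ptr %t, i64 %b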
diff --git a/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll b/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll
index 7f616bbb2a83..a61694919ab0 100644
--- a/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll
@@ -268,3 +268,328 @@ define i1 @icmp_trunc_x_zext_y_fail_multiuse(i32 %x, i8 %y) {
%r = icmp ule i16 %x16, %y16
ret i1 %r
}
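;
; The tests added below cover icmp over truncated operands under the trunc
; poison-generating flags: `nuw` asserts the dropped high bits were zero,
; `nsw` asserts they were copies of the narrow value's sign bit. Suffixes
; name the flags present on the two truncs (nuw / nsw / both / either), for
; unsigned, signed, and equality predicates, plus trunc-vs-zext/sext mixes.
;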
+
+define i1 @trunc_unsigned_nuw(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_unsigned_nuw(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nuw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp ult i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i16 %x to i8
+ %yt = trunc nuw i16 %y to i8
+ %c = icmp ult i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_unsigned_nsw(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_unsigned_nsw(
+; CHECK-NEXT: [[XT:%.*]] = trunc nsw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp ult i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nsw i16 %x to i8
+ %yt = trunc nsw i16 %y to i8
+ %c = icmp ult i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_unsigned_both(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_unsigned_both(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw nsw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nuw nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp ult i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw nsw i16 %x to i8
+ %yt = trunc nuw nsw i16 %y to i8
+ %c = icmp ult i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_unsigned_either(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_unsigned_either(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp ult i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i16 %x to i8
+ %yt = trunc nsw i16 %y to i8
+ %c = icmp ult i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_signed_nuw(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_signed_nuw(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nuw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp slt i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i16 %x to i8
+ %yt = trunc nuw i16 %y to i8
+ %c = icmp slt i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_signed_nsw(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_signed_nsw(
+; CHECK-NEXT: [[XT:%.*]] = trunc nsw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp slt i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nsw i16 %x to i8
+ %yt = trunc nsw i16 %y to i8
+ %c = icmp slt i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_signed_both(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_signed_both(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw nsw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nuw nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp slt i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw nsw i16 %x to i8
+ %yt = trunc nuw nsw i16 %y to i8
+ %c = icmp slt i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_signed_either(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_signed_either(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp slt i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i16 %x to i8
+ %yt = trunc nsw i16 %y to i8
+ %c = icmp slt i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_equality_nuw(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_equality_nuw(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nuw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i16 %x to i8
+ %yt = trunc nuw i16 %y to i8
+ %c = icmp eq i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_equality_nsw(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_equality_nsw(
+; CHECK-NEXT: [[XT:%.*]] = trunc nsw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nsw i16 %x to i8
+ %yt = trunc nsw i16 %y to i8
+ %c = icmp eq i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_equality_both(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_equality_both(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw nsw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nuw nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw nsw i16 %x to i8
+ %yt = trunc nuw nsw i16 %y to i8
+ %c = icmp eq i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_equality_either(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_equality_either(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i16 %x to i8
+ %yt = trunc nsw i16 %y to i8
+ %c = icmp eq i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_unsigned_nuw_zext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_unsigned_nuw_zext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = zext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp ult i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i32 %x to i16
+ %ye = zext i8 %y to i16
+ %c = icmp ult i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_unsigned_nuw_sext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_unsigned_nuw_sext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp ult i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i32 %x to i16
+ %ye = sext i8 %y to i16
+ %c = icmp ult i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_unsigned_nsw_zext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_unsigned_nsw_zext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = zext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp ult i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nsw i32 %x to i16
+ %ye = zext i8 %y to i16
+ %c = icmp ult i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_unsigned_nsw_sext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_unsigned_nsw_sext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp ult i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nsw i32 %x to i16
+ %ye = sext i8 %y to i16
+ %c = icmp ult i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_signed_nsw_sext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_signed_nsw_sext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp slt i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nsw i32 %x to i16
+ %ye = sext i8 %y to i16
+ %c = icmp slt i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_signed_nsw_zext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_signed_nsw_zext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = zext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp slt i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nsw i32 %x to i16
+ %ye = zext i8 %y to i16
+ %c = icmp slt i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_signed_nuw_sext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_signed_nuw_sext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp slt i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i32 %x to i16
+ %ye = sext i8 %y to i16
+ %c = icmp slt i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_signed_nuw_zext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_signed_nuw_zext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = zext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp slt i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i32 %x to i16
+ %ye = zext i8 %y to i16
+ %c = icmp slt i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_equality_nuw_zext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_equality_nuw_zext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = zext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp ne i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i32 %x to i16
+ %ye = zext i8 %y to i16
+ %c = icmp ne i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_equality_nuw_sext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_equality_nuw_sext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp ne i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i32 %x to i16
+ %ye = sext i8 %y to i16
+ %c = icmp ne i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_equality_nsw_zext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_equality_nsw_zext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = zext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp ne i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nsw i32 %x to i16
+ %ye = zext i8 %y to i16
+ %c = icmp ne i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_equality_nsw_sext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_equality_nsw_sext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp ne i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nsw i32 %x to i16
+ %ye = sext i8 %y to i16
+ %c = icmp ne i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_equality_both_sext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_equality_both_sext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw nsw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp ne i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw nsw i32 %x to i16
+ %ye = sext i8 %y to i16
+ %c = icmp ne i16 %xt, %ye
+ ret i1 %c
+}
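;
; Note that the CHECK lines above simply pin the input shape; no fold fires
; here yet. The flags are what would make widening sound: `trunc nuw`
; round-trips through zext, so unsigned predicates agree in either width;
; `trunc nsw` round-trips through sext for signed predicates; and equality
; agrees when both truncs carry a matching flag. A hypothetical widened form
; of the first test (an assumption, not asserted by these CHECKs):
;
;   %c = icmp ult i16 %x, %y   ; == icmp ult i8 (trunc nuw %x), (trunc nuw %y)
;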
diff --git a/llvm/test/Transforms/InstCombine/mul.ll b/llvm/test/Transforms/InstCombine/mul.ll
index 4c1ce10171dd..4fb3c0b1ad49 100644
--- a/llvm/test/Transforms/InstCombine/mul.ll
+++ b/llvm/test/Transforms/InstCombine/mul.ll
@@ -2146,7 +2146,7 @@ define i8 @mul_nsw_nonneg(i8 %x, i8 %y) {
; CHECK-NEXT: call void @llvm.assume(i1 [[X_NNEG]])
; CHECK-NEXT: [[Y_NNEG:%.*]] = icmp sgt i8 [[Y:%.*]], -1
; CHECK-NEXT: call void @llvm.assume(i1 [[Y_NNEG]])
-; CHECK-NEXT: [[MUL:%.*]] = mul nsw i8 [[X]], [[Y]]
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i8 [[X]], [[Y]]
; CHECK-NEXT: ret i8 [[MUL]]
;
%x.nneg = icmp sge i8 %x, 0
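;
; The strengthened CHECK line follows from range reasoning: with both
; operands assumed non-negative (icmp sgt %v, -1 feeding llvm.assume) and
; `nsw` already present, the signed product cannot wrap and is itself
; non-negative, so it matches the unsigned product and `nuw` holds as well:
;
;   %mul = mul nuw nsw i8 %x, %y ; valid once %x >= 0 and %y >= 0 under nsw
;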
diff --git a/llvm/test/Transforms/InstCombine/vector-reverse.ll b/llvm/test/Transforms/InstCombine/vector-reverse.ll
index 5e6672658f9a..a1a6ee949a13 100644
--- a/llvm/test/Transforms/InstCombine/vector-reverse.ll
+++ b/llvm/test/Transforms/InstCombine/vector-reverse.ll
@@ -8,11 +8,11 @@
define <vscale x 4 x i32> @binop_reverse(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: @binop_reverse(
; CHECK-NEXT: [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]])
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
%add = add nsw <vscale x 4 x i32> %a.rev, %b.rev
ret <vscale x 4 x i32> %add
}
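;
; Two mechanical updates run through the rest of this file: the intrinsic is
; now spelled llvm.vector.reverse.* rather than
; llvm.experimental.vector.reverse.*, and the CHECK lines exercise the fold
; that applies a binary op to the unreversed operands so that only a single
; reverse of the result remains:
;
;   add (vector.reverse %a), (vector.reverse %b)
;     -> vector.reverse (add %a, %b)
;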
@@ -20,14 +20,14 @@ define <vscale x 4 x i32> @binop_reverse(<vscale x 4 x i32> %a, <vscale x 4 x i3
; %a.rev has multiple uses
define <vscale x 4 x i32> @binop_reverse_1(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: @binop_reverse_1(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
; CHECK-NEXT: [[ADD1:%.*]] = add <vscale x 4 x i32> [[A]], [[B:%.*]]
-; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]])
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
%add = add <vscale x 4 x i32> %a.rev, %b.rev
ret <vscale x 4 x i32> %add
@@ -36,14 +36,14 @@ define <vscale x 4 x i32> @binop_reverse_1(<vscale x 4 x i32> %a, <vscale x 4 x
; %b.rev has multiple uses
define <vscale x 4 x i32> @binop_reverse_2(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: @binop_reverse_2(
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
; CHECK-NEXT: [[ADD1:%.*]] = add <vscale x 4 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]])
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
%add = add <vscale x 4 x i32> %a.rev, %b.rev
ret <vscale x 4 x i32> %add
@@ -52,15 +52,15 @@ define <vscale x 4 x i32> @binop_reverse_2(<vscale x 4 x i32> %a, <vscale x 4 x
; %a.rev and %b.rev have multiple uses
define <vscale x 4 x i32> @binop_reverse_3(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: @binop_reverse_3(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
; CHECK-NEXT: [[ADD:%.*]] = add <vscale x 4 x i32> [[A_REV]], [[B_REV]]
; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
%add = add <vscale x 4 x i32> %a.rev, %b.rev
@@ -71,10 +71,10 @@ define <vscale x 4 x i32> @binop_reverse_3(<vscale x 4 x i32> %a, <vscale x 4 x
define <vscale x 4 x i32> @binop_reverse_4(<vscale x 4 x i32> %a) {
; CHECK-LABEL: @binop_reverse_4(
; CHECK-NEXT: [[MUL1:%.*]] = mul <vscale x 4 x i32> [[A:%.*]], [[A]]
-; CHECK-NEXT: [[MUL:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[MUL1]])
+; CHECK-NEXT: [[MUL:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[MUL1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[MUL]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
%mul = mul <vscale x 4 x i32> %a.rev, %a.rev
ret <vscale x 4 x i32> %mul
}
@@ -82,12 +82,12 @@ define <vscale x 4 x i32> @binop_reverse_4(<vscale x 4 x i32> %a) {
 ; %a.rev is used as both operands, along with a third use
define <vscale x 4 x i32> @binop_reverse_5(<vscale x 4 x i32> %a) {
; CHECK-LABEL: @binop_reverse_5(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
; CHECK-NEXT: [[MUL:%.*]] = mul <vscale x 4 x i32> [[A_REV]], [[A_REV]]
; CHECK-NEXT: ret <vscale x 4 x i32> [[MUL]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
%mul = mul <vscale x 4 x i32> %a.rev, %a.rev
ret <vscale x 4 x i32> %mul
@@ -98,10 +98,10 @@ define <vscale x 4 x i32> @binop_reverse_splat_RHS(<vscale x 4 x i32> %a, i32 %b
; CHECK-NEXT: [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
; CHECK-NEXT: [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[DIV1:%.*]] = udiv <vscale x 4 x i32> [[A:%.*]], [[B_SPLAT]]
-; CHECK-NEXT: [[DIV:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[DIV1]])
+; CHECK-NEXT: [[DIV:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[DIV1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[DIV]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
%b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
%b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
%div = udiv <vscale x 4 x i32> %a.rev, %b.splat
@@ -111,14 +111,14 @@ define <vscale x 4 x i32> @binop_reverse_splat_RHS(<vscale x 4 x i32> %a, i32 %b
; %a.rev has multiple uses
define <vscale x 4 x i32> @binop_reverse_splat_RHS_1(<vscale x 4 x i32> %a, i32 %b) {
; CHECK-LABEL: @binop_reverse_splat_RHS_1(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
; CHECK-NEXT: [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
; CHECK-NEXT: [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
; CHECK-NEXT: [[DIV:%.*]] = udiv <vscale x 4 x i32> [[A_REV]], [[B_SPLAT]]
; CHECK-NEXT: ret <vscale x 4 x i32> [[DIV]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
%b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
%b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
@@ -131,10 +131,10 @@ define <vscale x 4 x i32> @binop_reverse_splat_LHS(<vscale x 4 x i32> %a, i32 %b
; CHECK-NEXT: [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
; CHECK-NEXT: [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[DIV1:%.*]] = udiv <vscale x 4 x i32> [[B_SPLAT]], [[A:%.*]]
-; CHECK-NEXT: [[DIV:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[DIV1]])
+; CHECK-NEXT: [[DIV:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[DIV1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[DIV]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
%b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
%b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
%div = udiv <vscale x 4 x i32> %b.splat, %a.rev
@@ -144,14 +144,14 @@ define <vscale x 4 x i32> @binop_reverse_splat_LHS(<vscale x 4 x i32> %a, i32 %b
; %a.rev has multiple uses
define <vscale x 4 x i32> @binop_reverse_splat_LHS_1(<vscale x 4 x i32> %a, i32 %b) {
; CHECK-LABEL: @binop_reverse_splat_LHS_1(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
; CHECK-NEXT: [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
; CHECK-NEXT: [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
; CHECK-NEXT: [[DIV:%.*]] = udiv <vscale x 4 x i32> [[B_SPLAT]], [[A_REV]]
; CHECK-NEXT: ret <vscale x 4 x i32> [[DIV]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
%b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
%b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
@@ -161,11 +161,11 @@ define <vscale x 4 x i32> @binop_reverse_splat_LHS_1(<vscale x 4 x i32> %a, i32
define <vscale x 4 x float> @unop_reverse(<vscale x 4 x float> %a) {
; CHECK-LABEL: @unop_reverse(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[A:%.*]])
; CHECK-NEXT: [[NEG:%.*]] = fneg fast <vscale x 4 x float> [[A_REV]]
; CHECK-NEXT: ret <vscale x 4 x float> [[NEG]]
;
- %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+ %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
%neg = fneg fast <vscale x 4 x float> %a.rev
ret <vscale x 4 x float> %neg
}
@@ -173,12 +173,12 @@ define <vscale x 4 x float> @unop_reverse(<vscale x 4 x float> %a) {
; %a.rev has multiple uses
define <vscale x 4 x float> @unop_reverse_1(<vscale x 4 x float> %a) {
; CHECK-LABEL: @unop_reverse_1(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[A:%.*]])
; CHECK-NEXT: call void @use_nxv4f32(<vscale x 4 x float> [[A_REV]])
; CHECK-NEXT: [[NEG:%.*]] = fneg fast <vscale x 4 x float> [[A_REV]]
; CHECK-NEXT: ret <vscale x 4 x float> [[NEG]]
;
- %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+ %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
call void @use_nxv4f32(<vscale x 4 x float> %a.rev)
%neg = fneg fast <vscale x 4 x float> %a.rev
ret <vscale x 4 x float> %neg
@@ -187,11 +187,11 @@ define <vscale x 4 x float> @unop_reverse_1(<vscale x 4 x float> %a) {
define <vscale x 4 x i1> @icmp_reverse(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: @icmp_reverse(
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq <vscale x 4 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
+; CHECK-NEXT: [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
; CHECK-NEXT: ret <vscale x 4 x i1> [[CMP]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
%cmp = icmp eq <vscale x 4 x i32> %a.rev, %b.rev
ret <vscale x 4 x i1> %cmp
}
@@ -199,14 +199,14 @@ define <vscale x 4 x i1> @icmp_reverse(<vscale x 4 x i32> %a, <vscale x 4 x i32>
; %a.rev has multiple uses
define <vscale x 4 x i1> @icmp_reverse_1(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: @icmp_reverse_1(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq <vscale x 4 x i32> [[A]], [[B:%.*]]
-; CHECK-NEXT: [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
+; CHECK-NEXT: [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
; CHECK-NEXT: ret <vscale x 4 x i1> [[CMP]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
%cmp = icmp eq <vscale x 4 x i32> %a.rev, %b.rev
ret <vscale x 4 x i1> %cmp
@@ -215,14 +215,14 @@ define <vscale x 4 x i1> @icmp_reverse_1(<vscale x 4 x i32> %a, <vscale x 4 x i3
; %b.rev has multiple uses
define <vscale x 4 x i1> @icmp_reverse_2(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: @icmp_reverse_2(
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq <vscale x 4 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
+; CHECK-NEXT: [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
; CHECK-NEXT: ret <vscale x 4 x i1> [[CMP]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
%cmp = icmp eq <vscale x 4 x i32> %a.rev, %b.rev
ret <vscale x 4 x i1> %cmp
@@ -231,15 +231,15 @@ define <vscale x 4 x i1> @icmp_reverse_2(<vscale x 4 x i32> %a, <vscale x 4 x i3
; %a.rev and %b.rev have multiple uses
define <vscale x 4 x i1> @icmp_reverse_3(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: @icmp_reverse_3(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
; CHECK-NEXT: [[CMP:%.*]] = icmp eq <vscale x 4 x i32> [[A_REV]], [[B_REV]]
; CHECK-NEXT: ret <vscale x 4 x i1> [[CMP]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
%cmp = icmp eq <vscale x 4 x i32> %a.rev, %b.rev
@@ -251,10 +251,10 @@ define <vscale x 4 x i1> @icmp_reverse_splat_RHS(<vscale x 4 x i32> %a, i32 %b)
; CHECK-NEXT: [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
; CHECK-NEXT: [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[CMP1:%.*]] = icmp slt <vscale x 4 x i32> [[B_SPLAT]], [[A:%.*]]
-; CHECK-NEXT: [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
+; CHECK-NEXT: [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
; CHECK-NEXT: ret <vscale x 4 x i1> [[CMP]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
%b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
%b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
%cmp = icmp sgt <vscale x 4 x i32> %a.rev, %b.splat
@@ -264,14 +264,14 @@ define <vscale x 4 x i1> @icmp_reverse_splat_RHS(<vscale x 4 x i32> %a, i32 %b)
; %a.rev has multiple uses
define <vscale x 4 x i1> @icmp_reverse_splat_RHS_1(<vscale x 4 x i32> %a, i32 %b) {
; CHECK-LABEL: @icmp_reverse_splat_RHS_1(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
; CHECK-NEXT: [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
; CHECK-NEXT: [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <vscale x 4 x i32> [[A_REV]], [[B_SPLAT]]
; CHECK-NEXT: ret <vscale x 4 x i1> [[CMP]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
%b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
%b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
@@ -284,10 +284,10 @@ define <vscale x 4 x i1> @icmp_reverse_splat_LHS(<vscale x 4 x i32> %a, i32 %b)
; CHECK-NEXT: [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
; CHECK-NEXT: [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[CMP1:%.*]] = icmp ult <vscale x 4 x i32> [[B_SPLAT]], [[A:%.*]]
-; CHECK-NEXT: [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
+; CHECK-NEXT: [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
; CHECK-NEXT: ret <vscale x 4 x i1> [[CMP]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
%b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
%b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
%cmp = icmp ult <vscale x 4 x i32> %b.splat, %a.rev
@@ -297,14 +297,14 @@ define <vscale x 4 x i1> @icmp_reverse_splat_LHS(<vscale x 4 x i32> %a, i32 %b)
; %a.rev has multiple uses
define <vscale x 4 x i1> @icmp_reverse_splat_LHS_1(<vscale x 4 x i32> %a, i32 %b) {
; CHECK-LABEL: @icmp_reverse_splat_LHS_1(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
; CHECK-NEXT: [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
; CHECK-NEXT: [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
; CHECK-NEXT: [[CMP:%.*]] = icmp ult <vscale x 4 x i32> [[B_SPLAT]], [[A_REV]]
; CHECK-NEXT: ret <vscale x 4 x i1> [[CMP]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
%b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
%b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
@@ -315,12 +315,12 @@ define <vscale x 4 x i1> @icmp_reverse_splat_LHS_1(<vscale x 4 x i32> %a, i32 %b
define <vscale x 4 x i32> @select_reverse(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: @select_reverse(
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
- %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
%select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.rev
ret <vscale x 4 x i32> %select
}
@@ -328,15 +328,15 @@ define <vscale x 4 x i32> @select_reverse(<vscale x 4 x i1> %a, <vscale x 4 x i3
; %a.rev has multiple uses
define <vscale x 4 x i32> @select_reverse_1(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: @select_reverse_1(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
; CHECK-NEXT: call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
- %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
%select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.rev
ret <vscale x 4 x i32> %select
@@ -345,15 +345,15 @@ define <vscale x 4 x i32> @select_reverse_1(<vscale x 4 x i1> %a, <vscale x 4 x
; %b.rev has multiple uses
define <vscale x 4 x i32> @select_reverse_2(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: @select_reverse_2(
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C:%.*]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
- %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
%select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.rev
ret <vscale x 4 x i32> %select
@@ -362,15 +362,15 @@ define <vscale x 4 x i32> @select_reverse_2(<vscale x 4 x i1> %a, <vscale x 4 x
; %c.rev has multiple uses
define <vscale x 4 x i32> @select_reverse_3(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: @select_reverse_3(
-; CHECK-NEXT: [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
+; CHECK-NEXT: [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[C_REV]])
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
- %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
call void @use_nxv4i32(<vscale x 4 x i32> %c.rev)
%select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.rev
ret <vscale x 4 x i32> %select
@@ -379,17 +379,17 @@ define <vscale x 4 x i32> @select_reverse_3(<vscale x 4 x i1> %a, <vscale x 4 x
; %a.rev and %b.rev have multiple uses
define <vscale x 4 x i32> @select_reverse_4(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: @select_reverse_4(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
; CHECK-NEXT: call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C:%.*]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
- %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
%select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.rev
@@ -399,17 +399,17 @@ define <vscale x 4 x i32> @select_reverse_4(<vscale x 4 x i1> %a, <vscale x 4 x
; %a.rev and %c.rev have multiple uses
define <vscale x 4 x i32> @select_reverse_5(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: @select_reverse_5(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
-; CHECK-NEXT: [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT: [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
; CHECK-NEXT: call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[C_REV]])
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
- %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
call void @use_nxv4i32(<vscale x 4 x i32> %c.rev)
%select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.rev
@@ -419,17 +419,17 @@ define <vscale x 4 x i32> @select_reverse_5(<vscale x 4 x i1> %a, <vscale x 4 x
; %b.rev and %c.rev have multiple uses
define <vscale x 4 x i32> @select_reverse_6(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: @select_reverse_6(
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
-; CHECK-NEXT: [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[C_REV]])
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
- %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
call void @use_nxv4i32(<vscale x 4 x i32> %c.rev)
%select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.rev
@@ -439,18 +439,18 @@ define <vscale x 4 x i32> @select_reverse_6(<vscale x 4 x i1> %a, <vscale x 4 x
; %a.rev, %b.rev and %c.rev have multiple uses
define <vscale x 4 x i32> @select_reverse_7(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: @select_reverse_7(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
-; CHECK-NEXT: [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
; CHECK-NEXT: call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[C_REV]])
; CHECK-NEXT: [[SELECT:%.*]] = select <vscale x 4 x i1> [[A_REV]], <vscale x 4 x i32> [[B_REV]], <vscale x 4 x i32> [[C_REV]]
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
- %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
call void @use_nxv4i32(<vscale x 4 x i32> %c.rev)
@@ -463,11 +463,11 @@ define <vscale x 4 x i32> @select_reverse_splat_false(<vscale x 4 x i1> %a, <vsc
; CHECK-NEXT: [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
; CHECK-NEXT: [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C_SPLAT]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
%c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
%c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
%select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.splat
@@ -477,16 +477,16 @@ define <vscale x 4 x i32> @select_reverse_splat_false(<vscale x 4 x i1> %a, <vsc
; %a.rev has multiple uses
define <vscale x 4 x i32> @select_reverse_splat_false_1(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, i32 %c) {
; CHECK-LABEL: @select_reverse_splat_false_1(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
; CHECK-NEXT: [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
; CHECK-NEXT: [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C_SPLAT]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
%c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
%c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
@@ -497,16 +497,16 @@ define <vscale x 4 x i32> @select_reverse_splat_false_1(<vscale x 4 x i1> %a, <v
; %b.rev has multiple uses
define <vscale x 4 x i32> @select_reverse_splat_false_2(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, i32 %c) {
; CHECK-LABEL: @select_reverse_splat_false_2(
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
; CHECK-NEXT: [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
; CHECK-NEXT: [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C_SPLAT]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
%c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
%c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
@@ -517,8 +517,8 @@ define <vscale x 4 x i32> @select_reverse_splat_false_2(<vscale x 4 x i1> %a, <v
; %a.rev and %b.rev have multiple uses
define <vscale x 4 x i32> @select_reverse_splat_false_3(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, i32 %c) {
; CHECK-LABEL: @select_reverse_splat_false_3(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
; CHECK-NEXT: [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
; CHECK-NEXT: [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
@@ -526,8 +526,8 @@ define <vscale x 4 x i32> @select_reverse_splat_false_3(<vscale x 4 x i1> %a, <v
; CHECK-NEXT: [[SELECT:%.*]] = select <vscale x 4 x i1> [[A_REV]], <vscale x 4 x i32> [[B_REV]], <vscale x 4 x i32> [[C_SPLAT]]
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
%c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
%c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
@@ -541,11 +541,11 @@ define <vscale x 4 x i32> @select_reverse_splat_true(<vscale x 4 x i1> %a, <vsca
; CHECK-NEXT: [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
; CHECK-NEXT: [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[C_SPLAT]], <vscale x 4 x i32> [[B:%.*]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
%c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
%c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
%select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %c.splat, <vscale x 4 x i32> %b.rev
@@ -555,16 +555,16 @@ define <vscale x 4 x i32> @select_reverse_splat_true(<vscale x 4 x i1> %a, <vsca
; %a.rev has multiple uses
define <vscale x 4 x i32> @select_reverse_splat_true_1(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, i32 %c) {
; CHECK-LABEL: @select_reverse_splat_true_1(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
; CHECK-NEXT: [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
; CHECK-NEXT: [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A]], <vscale x 4 x i32> [[C_SPLAT]], <vscale x 4 x i32> [[B:%.*]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
%c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
%c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
@@ -575,16 +575,16 @@ define <vscale x 4 x i32> @select_reverse_splat_true_1(<vscale x 4 x i1> %a, <vs
; %b.rev has multiple uses
define <vscale x 4 x i32> @select_reverse_splat_true_2(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, i32 %c) {
; CHECK-LABEL: @select_reverse_splat_true_2(
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
; CHECK-NEXT: [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
; CHECK-NEXT: [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[C_SPLAT]], <vscale x 4 x i32> [[B]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
%c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
%c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
@@ -595,8 +595,8 @@ define <vscale x 4 x i32> @select_reverse_splat_true_2(<vscale x 4 x i1> %a, <vs
; %a.rev and %b.rev have multiple uses
define <vscale x 4 x i32> @select_reverse_splat_true_3(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, i32 %c) {
; CHECK-LABEL: @select_reverse_splat_true_3(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
; CHECK-NEXT: [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
; CHECK-NEXT: [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
@@ -604,8 +604,8 @@ define <vscale x 4 x i32> @select_reverse_splat_true_3(<vscale x 4 x i1> %a, <vs
; CHECK-NEXT: [[SELECT:%.*]] = select <vscale x 4 x i1> [[A_REV]], <vscale x 4 x i32> [[C_SPLAT]], <vscale x 4 x i32> [[B_REV]]
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
%c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
%c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
@@ -622,10 +622,10 @@ define <vscale x 4 x float> @reverse_binop_reverse(<vscale x 4 x float> %a, <vsc
; CHECK-NEXT: [[ADD1:%.*]] = fadd <vscale x 4 x float> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: ret <vscale x 4 x float> [[ADD1]]
;
- %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
- %b.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %b)
+ %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+ %b.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %b)
%add = fadd <vscale x 4 x float> %a.rev, %b.rev
- %add.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %add)
+ %add.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %add)
ret <vscale x 4 x float> %add.rev
}
@@ -636,11 +636,11 @@ define <vscale x 4 x float> @reverse_binop_reverse_splat_RHS(<vscale x 4 x float
; CHECK-NEXT: [[DIV1:%.*]] = fdiv <vscale x 4 x float> [[A:%.*]], [[B_SPLAT]]
; CHECK-NEXT: ret <vscale x 4 x float> [[DIV1]]
;
- %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+ %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
%b.insert = insertelement <vscale x 4 x float> poison, float %b, i32 0
%b.splat = shufflevector <vscale x 4 x float> %b.insert, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
%div = fdiv <vscale x 4 x float> %a.rev, %b.splat
- %div.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %div)
+ %div.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %div)
ret <vscale x 4 x float> %div.rev
}
@@ -651,11 +651,11 @@ define <vscale x 4 x float> @reverse_binop_reverse_splat_LHS(<vscale x 4 x float
; CHECK-NEXT: [[DIV1:%.*]] = fdiv <vscale x 4 x float> [[B_SPLAT]], [[A:%.*]]
; CHECK-NEXT: ret <vscale x 4 x float> [[DIV1]]
;
- %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+ %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
%b.insert = insertelement <vscale x 4 x float> poison, float %b, i32 0
%b.splat = shufflevector <vscale x 4 x float> %b.insert, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
%div = fdiv <vscale x 4 x float> %b.splat, %a.rev
- %div.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %div)
+ %div.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %div)
ret <vscale x 4 x float> %div.rev
}
@@ -664,10 +664,10 @@ define <vscale x 4 x i1> @reverse_fcmp_reverse(<vscale x 4 x float> %a, <vscale
; CHECK-NEXT: [[CMP1:%.*]] = fcmp fast olt <vscale x 4 x float> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: ret <vscale x 4 x i1> [[CMP1]]
;
- %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
- %b.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %b)
+ %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+ %b.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %b)
%cmp = fcmp fast olt <vscale x 4 x float> %a.rev, %b.rev
- %cmp.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %cmp)
+ %cmp.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %cmp)
ret <vscale x 4 x i1> %cmp.rev
}
@@ -676,11 +676,11 @@ define <vscale x 4 x float> @reverse_select_reverse(<vscale x 4 x i1> %a, <vscal
; CHECK-NEXT: [[SELECT1:%.*]] = select fast <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x float> [[B:%.*]], <vscale x 4 x float> [[C:%.*]]
; CHECK-NEXT: ret <vscale x 4 x float> [[SELECT1]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %b)
- %c.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %c)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %b)
+ %c.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %c)
%select = select fast <vscale x 4 x i1> %a.rev, <vscale x 4 x float> %b.rev, <vscale x 4 x float> %c.rev
- %select.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %select)
+ %select.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %select)
ret <vscale x 4 x float> %select.rev
}
@@ -689,9 +689,9 @@ define <vscale x 4 x float> @reverse_unop_reverse(<vscale x 4 x float> %a) {
; CHECK-NEXT: [[NEG1:%.*]] = fneg <vscale x 4 x float> [[A:%.*]]
; CHECK-NEXT: ret <vscale x 4 x float> [[NEG1]]
;
- %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+ %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
%neg = fneg <vscale x 4 x float> %a.rev
- %neg.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %neg)
+ %neg.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %neg)
ret <vscale x 4 x float> %neg.rev
}
@@ -700,6 +700,6 @@ declare void @use_nxv4i1(<vscale x 4 x i1>)
declare void @use_nxv4i32(<vscale x 4 x i32>)
declare void @use_nxv4f32(<vscale x 4 x float>)
-declare <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32>)
-declare <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float>)
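For reference, a minimal standalone sketch of the renamed intrinsic (not part of the patch; @reverse_example is a hypothetical name). The semantics are unchanged by the rename, and the old @llvm.experimental.vector.reverse.* names are expected to be auto-upgraded to these:

define <vscale x 4 x i32> @reverse_example(<vscale x 4 x i32> %v) {
  ; Reverses the element order of a scalable vector; same behavior as before the rename.
  %rev = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %v)
  ret <vscale x 4 x i32> %rev
}
declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)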
diff --git a/llvm/test/Transforms/InstSimplify/named-vector-shuffle-reverse.ll b/llvm/test/Transforms/InstSimplify/named-vector-shuffle-reverse.ll
index a26f0a9d87f8..25e99ff0e715 100644
--- a/llvm/test/Transforms/InstSimplify/named-vector-shuffle-reverse.ll
+++ b/llvm/test/Transforms/InstSimplify/named-vector-shuffle-reverse.ll
@@ -6,8 +6,8 @@ define <vscale x 4 x i32> @shuffle_b2b_reverse(<vscale x 4 x i32> %a) {
; CHECK-LABEL: @shuffle_b2b_reverse(
; CHECK-NEXT: ret <vscale x 4 x i32> [[A:%.*]]
;
- %rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
- %rev.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %rev)
+ %rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %rev.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %rev)
ret <vscale x 4 x i32> %rev.rev
}
@@ -20,8 +20,8 @@ define <vscale x 4 x i32> @splat_reverse(i32 %a) {
;
%splat_insert = insertelement <vscale x 4 x i32> poison, i32 %a, i32 0
%splat = shufflevector <vscale x 4 x i32> %splat_insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
- %rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %splat)
+ %rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %splat)
ret <vscale x 4 x i32> %rev
}
-declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)
diff --git a/llvm/test/Transforms/InstSimplify/select.ll b/llvm/test/Transforms/InstSimplify/select.ll
index 40c1460e3ebc..4eb6491eec5a 100644
--- a/llvm/test/Transforms/InstSimplify/select.ll
+++ b/llvm/test/Transforms/InstSimplify/select.ll
@@ -1105,19 +1105,19 @@ define <2 x i32> @select_ctpop_zero_vec(<2 x i32> %x) {
define <2 x i32> @select_vector_reverse(<2 x i32> %x) {
; CHECK-LABEL: @select_vector_reverse(
; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> [[X:%.*]], zeroinitializer
-; CHECK-NEXT: [[REV:%.*]] = call <2 x i32> @llvm.experimental.vector.reverse.v2i32(<2 x i32> [[X]])
+; CHECK-NEXT: [[REV:%.*]] = call <2 x i32> @llvm.vector.reverse.v2i32(<2 x i32> [[X]])
; CHECK-NEXT: [[SEL:%.*]] = select <2 x i1> [[CMP]], <2 x i32> zeroinitializer, <2 x i32> [[REV]]
; CHECK-NEXT: ret <2 x i32> [[SEL]]
;
%cmp = icmp eq <2 x i32> %x, zeroinitializer
- %rev = call <2 x i32> @llvm.experimental.vector.reverse.v2i32(<2 x i32> %x)
+ %rev = call <2 x i32> @llvm.vector.reverse.v2i32(<2 x i32> %x)
%sel = select <2 x i1> %cmp, <2 x i32> zeroinitializer, <2 x i32> %rev
ret <2 x i32> %sel
}
declare i32 @llvm.ctpop.i32(i32)
declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>)
-declare <2 x i32> @llvm.experimental.vector.reverse.v2i32(<2 x i32>)
+declare <2 x i32> @llvm.vector.reverse.v2i32(<2 x i32>)
define <2 x i32> @vec_select_no_equivalence(<2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: @vec_select_no_equivalence(
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
index 224a0693bf21..54348d1e2a48 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
@@ -15,11 +15,11 @@ define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2(ptr %ptr) {
; SVE-FIXED-LABEL: define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <32 x i8>, ptr [[PTR]], align 1
-; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> [[LOAD]])
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[LOAD]])
; SVE-FIXED-NEXT: ret { <16 x i8>, <16 x i8> } [[DEINTERLEAVE]]
;
%load = load <32 x i8>, ptr %ptr, align 1
- %deinterleave = tail call { <16 x i8>, <16 x i8> } @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %load)
+ %deinterleave = tail call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %load)
ret { <16 x i8>, <16 x i8> } %deinterleave
}
@@ -32,11 +32,11 @@ define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2(ptr %ptr) {
; SVE-FIXED-LABEL: define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <16 x i16>, ptr [[PTR]], align 2
-; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <8 x i16>, <8 x i16> } @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> [[LOAD]])
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <8 x i16>, <8 x i16> } @llvm.vector.deinterleave2.v16i16(<16 x i16> [[LOAD]])
; SVE-FIXED-NEXT: ret { <8 x i16>, <8 x i16> } [[DEINTERLEAVE]]
;
%load = load <16 x i16>, ptr %ptr, align 2
- %deinterleave = tail call { <8 x i16>, <8 x i16> } @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %load)
+ %deinterleave = tail call { <8 x i16>, <8 x i16> } @llvm.vector.deinterleave2.v16i16(<16 x i16> %load)
ret { <8 x i16>, <8 x i16> } %deinterleave
}
@@ -49,11 +49,11 @@ define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2(ptr %ptr) {
; SVE-FIXED-LABEL: define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <8 x i32>, ptr [[PTR]], align 4
-; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <4 x i32>, <4 x i32> } @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> [[LOAD]])
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> [[LOAD]])
; SVE-FIXED-NEXT: ret { <4 x i32>, <4 x i32> } [[DEINTERLEAVE]]
;
%load = load <8 x i32>, ptr %ptr, align 4
- %deinterleave = tail call { <4 x i32>, <4 x i32> } @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %load)
+ %deinterleave = tail call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %load)
ret { <4 x i32>, <4 x i32> } %deinterleave
}
@@ -66,11 +66,11 @@ define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2(ptr %ptr) {
; SVE-FIXED-LABEL: define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <4 x i64>, ptr [[PTR]], align 8
-; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x i64>, <2 x i64> } @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> [[LOAD]])
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x i64>, <2 x i64> } @llvm.vector.deinterleave2.v4i64(<4 x i64> [[LOAD]])
; SVE-FIXED-NEXT: ret { <2 x i64>, <2 x i64> } [[DEINTERLEAVE]]
;
%load = load <4 x i64>, ptr %ptr, align 8
- %deinterleave = tail call { <2 x i64>, <2 x i64> } @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %load)
+ %deinterleave = tail call { <2 x i64>, <2 x i64> } @llvm.vector.deinterleave2.v4i64(<4 x i64> %load)
ret { <2 x i64>, <2 x i64> } %deinterleave
}
@@ -83,11 +83,11 @@ define { <4 x float>, <4 x float> } @deinterleave_float_factor2(ptr %ptr) {
; SVE-FIXED-LABEL: define { <4 x float>, <4 x float> } @deinterleave_float_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <8 x float>, ptr [[PTR]], align 4
-; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <4 x float>, <4 x float> } @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> [[LOAD]])
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <4 x float>, <4 x float> } @llvm.vector.deinterleave2.v8f32(<8 x float> [[LOAD]])
; SVE-FIXED-NEXT: ret { <4 x float>, <4 x float> } [[DEINTERLEAVE]]
;
%load = load <8 x float>, ptr %ptr, align 4
- %deinterleave = tail call { <4 x float>, <4 x float> } @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %load)
+ %deinterleave = tail call { <4 x float>, <4 x float> } @llvm.vector.deinterleave2.v8f32(<8 x float> %load)
ret { <4 x float>, <4 x float> } %deinterleave
}
@@ -100,11 +100,11 @@ define { <2 x double>, <2 x double> } @deinterleave_double_factor2(ptr %ptr) {
; SVE-FIXED-LABEL: define { <2 x double>, <2 x double> } @deinterleave_double_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <4 x double>, ptr [[PTR]], align 8
-; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x double>, <2 x double> } @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> [[LOAD]])
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x double>, <2 x double> } @llvm.vector.deinterleave2.v4f64(<4 x double> [[LOAD]])
; SVE-FIXED-NEXT: ret { <2 x double>, <2 x double> } [[DEINTERLEAVE]]
;
%load = load <4 x double>, ptr %ptr, align 8
- %deinterleave = tail call { <2 x double>, <2 x double> } @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %load)
+ %deinterleave = tail call { <2 x double>, <2 x double> } @llvm.vector.deinterleave2.v4f64(<4 x double> %load)
ret { <2 x double>, <2 x double> } %deinterleave
}
@@ -117,11 +117,11 @@ define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2(ptr %ptr) {
; SVE-FIXED-LABEL: define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <4 x ptr>, ptr [[PTR]], align 8
-; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x ptr>, <2 x ptr> } @llvm.experimental.vector.deinterleave2.v4p0(<4 x ptr> [[LOAD]])
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x ptr>, <2 x ptr> } @llvm.vector.deinterleave2.v4p0(<4 x ptr> [[LOAD]])
; SVE-FIXED-NEXT: ret { <2 x ptr>, <2 x ptr> } [[DEINTERLEAVE]]
;
%load = load <4 x ptr>, ptr %ptr, align 8
- %deinterleave = tail call { <2 x ptr>, <2 x ptr> } @llvm.experimental.vector.deinterleave2.v4p0(<4 x ptr> %load)
+ %deinterleave = tail call { <2 x ptr>, <2 x ptr> } @llvm.vector.deinterleave2.v4p0(<4 x ptr> %load)
ret { <2 x ptr>, <2 x ptr> } %deinterleave
}
@@ -133,11 +133,11 @@ define void @interleave_i8_factor2(ptr %ptr, <16 x i8> %l, <16 x i8> %r) {
;
; SVE-FIXED-LABEL: define void @interleave_i8_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <16 x i8> [[L:%.*]], <16 x i8> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8> [[L]], <16 x i8> [[R]])
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <32 x i8> @llvm.vector.interleave2.v32i8(<16 x i8> [[L]], <16 x i8> [[R]])
; SVE-FIXED-NEXT: store <32 x i8> [[INTERLEAVE]], ptr [[PTR]], align 1
; SVE-FIXED-NEXT: ret void
;
- %interleave = tail call <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8> %l, <16 x i8> %r)
+ %interleave = tail call <32 x i8> @llvm.vector.interleave2.v32i8(<16 x i8> %l, <16 x i8> %r)
store <32 x i8> %interleave, ptr %ptr, align 1
ret void
}
@@ -150,11 +150,11 @@ define void @interleave_i16_factor2(ptr %ptr, <8 x i16> %l, <8 x i16> %r) {
;
; SVE-FIXED-LABEL: define void @interleave_i16_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <8 x i16> [[L:%.*]], <8 x i16> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> [[L]], <8 x i16> [[R]])
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16> [[L]], <8 x i16> [[R]])
; SVE-FIXED-NEXT: store <16 x i16> [[INTERLEAVE]], ptr [[PTR]], align 2
; SVE-FIXED-NEXT: ret void
;
- %interleave = tail call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %l, <8 x i16> %r)
+ %interleave = tail call <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16> %l, <8 x i16> %r)
store <16 x i16> %interleave, ptr %ptr, align 2
ret void
}
@@ -167,11 +167,11 @@ define void @interleave_i32_factor2(ptr %ptr, <4 x i32> %l, <4 x i32> %r) {
;
; SVE-FIXED-LABEL: define void @interleave_i32_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <4 x i32> [[L:%.*]], <4 x i32> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> [[L]], <4 x i32> [[R]])
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> [[L]], <4 x i32> [[R]])
; SVE-FIXED-NEXT: store <8 x i32> [[INTERLEAVE]], ptr [[PTR]], align 4
; SVE-FIXED-NEXT: ret void
;
- %interleave = tail call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %l, <4 x i32> %r)
+ %interleave = tail call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %l, <4 x i32> %r)
store <8 x i32> %interleave, ptr %ptr, align 4
ret void
}
@@ -184,11 +184,11 @@ define void @interleave_i64_factor2(ptr %ptr, <2 x i64> %l, <2 x i64> %r) {
;
; SVE-FIXED-LABEL: define void @interleave_i64_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <2 x i64> [[L:%.*]], <2 x i64> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> [[L]], <2 x i64> [[R]])
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64> [[L]], <2 x i64> [[R]])
; SVE-FIXED-NEXT: store <4 x i64> [[INTERLEAVE]], ptr [[PTR]], align 8
; SVE-FIXED-NEXT: ret void
;
- %interleave = tail call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %l, <2 x i64> %r)
+ %interleave = tail call <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64> %l, <2 x i64> %r)
store <4 x i64> %interleave, ptr %ptr, align 8
ret void
}
@@ -201,11 +201,11 @@ define void @interleave_float_factor2(ptr %ptr, <4 x float> %l, <4 x float> %r)
;
; SVE-FIXED-LABEL: define void @interleave_float_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <4 x float> [[L:%.*]], <4 x float> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> [[L]], <4 x float> [[R]])
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <8 x float> @llvm.vector.interleave2.v8f32(<4 x float> [[L]], <4 x float> [[R]])
; SVE-FIXED-NEXT: store <8 x float> [[INTERLEAVE]], ptr [[PTR]], align 4
; SVE-FIXED-NEXT: ret void
;
- %interleave = tail call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %l, <4 x float> %r)
+ %interleave = tail call <8 x float> @llvm.vector.interleave2.v8f32(<4 x float> %l, <4 x float> %r)
store <8 x float> %interleave, ptr %ptr, align 4
ret void
}
@@ -218,11 +218,11 @@ define void @interleave_double_factor2(ptr %ptr, <2 x double> %l, <2 x double> %
;
; SVE-FIXED-LABEL: define void @interleave_double_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <2 x double> [[L:%.*]], <2 x double> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> [[L]], <2 x double> [[R]])
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <4 x double> @llvm.vector.interleave2.v4f64(<2 x double> [[L]], <2 x double> [[R]])
; SVE-FIXED-NEXT: store <4 x double> [[INTERLEAVE]], ptr [[PTR]], align 4
; SVE-FIXED-NEXT: ret void
;
- %interleave = tail call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %l, <2 x double> %r)
+ %interleave = tail call <4 x double> @llvm.vector.interleave2.v4f64(<2 x double> %l, <2 x double> %r)
store <4 x double> %interleave, ptr %ptr, align 4
ret void
}
@@ -235,11 +235,11 @@ define void @interleave_ptr_factor2(ptr %ptr, <2 x ptr> %l, <2 x ptr> %r) {
;
; SVE-FIXED-LABEL: define void @interleave_ptr_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <2 x ptr> [[L:%.*]], <2 x ptr> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <4 x ptr> @llvm.experimental.vector.interleave2.v4p0(<2 x ptr> [[L]], <2 x ptr> [[R]])
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <4 x ptr> @llvm.vector.interleave2.v4p0(<2 x ptr> [[L]], <2 x ptr> [[R]])
; SVE-FIXED-NEXT: store <4 x ptr> [[INTERLEAVE]], ptr [[PTR]], align 4
; SVE-FIXED-NEXT: ret void
;
- %interleave = tail call <4 x ptr> @llvm.experimental.vector.interleave2.v4p0(<2 x ptr> %l, <2 x ptr> %r)
+ %interleave = tail call <4 x ptr> @llvm.vector.interleave2.v4p0(<2 x ptr> %l, <2 x ptr> %r)
store <4 x ptr> %interleave, ptr %ptr, align 4
ret void
}
@@ -266,11 +266,11 @@ define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2(ptr %ptr) #0 {
; SVE-FIXED-LABEL: define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <32 x i16>, ptr [[PTR]], align 2
-; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <16 x i16>, <16 x i16> } @llvm.experimental.vector.deinterleave2.v32i16(<32 x i16> [[LOAD]])
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <16 x i16>, <16 x i16> } @llvm.vector.deinterleave2.v32i16(<32 x i16> [[LOAD]])
; SVE-FIXED-NEXT: ret { <16 x i16>, <16 x i16> } [[DEINTERLEAVE]]
;
%load = load <32 x i16>, ptr %ptr, align 2
- %deinterleave = tail call { <16 x i16>, <16 x i16> } @llvm.experimental.vector.deinterleave2.v32i16(<32 x i16> %load)
+ %deinterleave = tail call { <16 x i16>, <16 x i16> } @llvm.vector.deinterleave2.v32i16(<32 x i16> %load)
ret { <16 x i16>, <16 x i16> } %deinterleave
}
@@ -297,29 +297,29 @@ define void @interleave_wide_ptr_factor2(ptr %ptr, <8 x ptr> %l, <8 x ptr> %r) {
;
; SVE-FIXED-LABEL: define void @interleave_wide_ptr_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <8 x ptr> [[L:%.*]], <8 x ptr> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <16 x ptr> @llvm.experimental.vector.interleave2.v16p0(<8 x ptr> [[L]], <8 x ptr> [[R]])
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <16 x ptr> @llvm.vector.interleave2.v16p0(<8 x ptr> [[L]], <8 x ptr> [[R]])
; SVE-FIXED-NEXT: store <16 x ptr> [[INTERLEAVE]], ptr [[PTR]], align 4
; SVE-FIXED-NEXT: ret void
;
- %interleave = tail call <16 x ptr> @llvm.experimental.vector.interleave2.v16p0(<8 x ptr> %l, <8 x ptr> %r)
+ %interleave = tail call <16 x ptr> @llvm.vector.interleave2.v16p0(<8 x ptr> %l, <8 x ptr> %r)
store <16 x ptr> %interleave, ptr %ptr, align 4
ret void
}
-declare { <16 x i8>, <16 x i8> } @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>)
-declare { <8 x i16>, <8 x i16> } @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>)
-declare { <4 x i32>, <4 x i32> } @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>)
-declare { <2 x i64>, <2 x i64> } @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>)
-declare { <4 x float>, <4 x float> } @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>)
-declare { <2 x double>, <2 x double> } @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>)
-declare { <2 x ptr>, <2 x ptr> } @llvm.experimental.vector.deinterleave2.v4p0(<4 x ptr>)
-declare { <16 x i16>, <16 x i16> } @llvm.experimental.vector.deinterleave2.v32i16(<32 x i16>)
+declare { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8>)
+declare { <8 x i16>, <8 x i16> } @llvm.vector.deinterleave2.v16i16(<16 x i16>)
+declare { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32>)
+declare { <2 x i64>, <2 x i64> } @llvm.vector.deinterleave2.v4i64(<4 x i64>)
+declare { <4 x float>, <4 x float> } @llvm.vector.deinterleave2.v8f32(<8 x float>)
+declare { <2 x double>, <2 x double> } @llvm.vector.deinterleave2.v4f64(<4 x double>)
+declare { <2 x ptr>, <2 x ptr> } @llvm.vector.deinterleave2.v4p0(<4 x ptr>)
+declare { <16 x i16>, <16 x i16> } @llvm.vector.deinterleave2.v32i16(<32 x i16>)
-declare <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8>, <16 x i8>)
-declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
-declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
-declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
-declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>)
-declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>)
-declare <4 x ptr> @llvm.experimental.vector.interleave2.v4p0(<2 x ptr>, <2 x ptr>)
-declare <16 x ptr> @llvm.experimental.vector.interleave2.v16p0(<8 x ptr>, <8 x ptr>)
+declare <32 x i8> @llvm.vector.interleave2.v32i8(<16 x i8>, <16 x i8>)
+declare <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
+declare <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
+declare <8 x float> @llvm.vector.interleave2.v8f32(<4 x float>, <4 x float>)
+declare <4 x double> @llvm.vector.interleave2.v4f64(<2 x double>, <2 x double>)
+declare <4 x ptr> @llvm.vector.interleave2.v4p0(<2 x ptr>, <2 x ptr>)
+declare <16 x ptr> @llvm.vector.interleave2.v16p0(<8 x ptr>, <8 x ptr>)
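As a sketch of the renamed pair on fixed-width vectors (not part of the patch; @interleave2_roundtrip_example is a hypothetical name): deinterleave2 applied to the result of interleave2 yields the original operands back.

define { <4 x i32>, <4 x i32> } @interleave2_roundtrip_example(<4 x i32> %l, <4 x i32> %r) {
  ; Interleave %l and %r into one 8-element vector, then split it back into two halves.
  %il = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %l, <4 x i32> %r)
  %de = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %il)
  ret { <4 x i32>, <4 x i32> } %de
}
declare <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
declare { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32>)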
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
index 6353bf10d57c..2a05718cc416 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
@@ -11,7 +11,7 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8> } @deinterleave_nxi8_factor2(ptr
; CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]]
;
%load = load <vscale x 32 x i8>, ptr %ptr, align 1
- %deinterleave = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %load)
+ %deinterleave = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %load)
ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleave
}
@@ -22,7 +22,7 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16> } @deinterleave_nxi16_factor2(pt
; CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]]
;
%load = load <vscale x 16 x i16>, ptr %ptr, align 2
- %deinterleave = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %load)
+ %deinterleave = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %load)
ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %deinterleave
}
@@ -33,7 +33,7 @@ define { <vscale x 4 x i32>, <vscale x 4 x i32> } @deinterleave_nx8xi32_factor2(
; CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]]
;
%load = load <vscale x 8 x i32>, ptr %ptr, align 4
- %deinterleave = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %load)
+ %deinterleave = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %load)
ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave
}
@@ -44,7 +44,7 @@ define { <vscale x 2 x i64>, <vscale x 2 x i64> } @deinterleave_nxi64_factor2(pt
; CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]]
;
%load = load <vscale x 4 x i64>, ptr %ptr, align 8
- %deinterleave = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %load)
+ %deinterleave = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %load)
ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %deinterleave
}
@@ -55,7 +55,7 @@ define { <vscale x 4 x float>, <vscale x 4 x float> } @deinterleave_nxfloat_fact
; CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]]
;
%load = load <vscale x 8 x float>, ptr %ptr, align 4
- %deinterleave = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %load)
+ %deinterleave = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %load)
ret { <vscale x 4 x float>, <vscale x 4 x float> } %deinterleave
}
@@ -66,7 +66,7 @@ define { <vscale x 2 x double>, <vscale x 2 x double> } @deinterleave_nxdouble_f
; CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]]
;
%load = load <vscale x 4 x double>, ptr %ptr, align 8
- %deinterleave = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %load)
+ %deinterleave = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %load)
ret { <vscale x 2 x double>, <vscale x 2 x double> } %deinterleave
}
@@ -77,7 +77,7 @@ define { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @deinterleave_nxptr_factor2(pt
; CHECK-NEXT: ret { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[LDN]]
;
%load = load <vscale x 4 x ptr>, ptr %ptr, align 8
- %deinterleave = tail call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.experimental.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr> %load)
+ %deinterleave = tail call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr> %load)
ret { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %deinterleave
}
@@ -87,7 +87,7 @@ define void @interleave_nxi8_factor2(ptr %ptr, <vscale x 16 x i8> %l, <vscale x
; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv16i8(<vscale x 16 x i8> [[L]], <vscale x 16 x i8> [[R]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), ptr [[PTR]])
; CHECK-NEXT: ret void
;
- %interleave = tail call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> %l, <vscale x 16 x i8> %r)
+ %interleave = tail call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %l, <vscale x 16 x i8> %r)
store <vscale x 32 x i8> %interleave, ptr %ptr, align 1
ret void
}
@@ -98,7 +98,7 @@ define void @interleave_nxi16_factor2(ptr %ptr, <vscale x 8 x i16> %l, <vscale x
; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16> [[L]], <vscale x 8 x i16> [[R]], <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), ptr [[PTR]])
; CHECK-NEXT: ret void
;
- %interleave = tail call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %l, <vscale x 8 x i16> %r)
+ %interleave = tail call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %l, <vscale x 8 x i16> %r)
store <vscale x 16 x i16> %interleave, ptr %ptr, align 2
ret void
}
@@ -109,7 +109,7 @@ define void @interleave_nxi32_factor2(ptr %ptr, <vscale x 4 x i32> %l, <vscale x
; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> [[L]], <vscale x 4 x i32> [[R]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
; CHECK-NEXT: ret void
;
- %interleave = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %l, <vscale x 4 x i32> %r)
+ %interleave = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %l, <vscale x 4 x i32> %r)
store <vscale x 8 x i32> %interleave, ptr %ptr, align 4
ret void
}
@@ -120,7 +120,7 @@ define void @interleave_nxi64_factor2(ptr %ptr, <vscale x 2 x i64> %l, <vscale x
; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[L]], <vscale x 2 x i64> [[R]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
; CHECK-NEXT: ret void
;
- %interleave = tail call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %l, <vscale x 2 x i64> %r)
+ %interleave = tail call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %l, <vscale x 2 x i64> %r)
store <vscale x 4 x i64> %interleave, ptr %ptr, align 8
ret void
}
@@ -131,7 +131,7 @@ define void @interleave_nxfloat_factor2(ptr %ptr, <vscale x 4 x float> %l, <vsca
; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv4f32(<vscale x 4 x float> [[L]], <vscale x 4 x float> [[R]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
; CHECK-NEXT: ret void
;
- %interleave = tail call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %l, <vscale x 4 x float> %r)
+ %interleave = tail call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %l, <vscale x 4 x float> %r)
store <vscale x 8 x float> %interleave, ptr %ptr, align 4
ret void
}
@@ -142,7 +142,7 @@ define void @interleave_nxdouble_factor2(ptr %ptr, <vscale x 2 x double> %l, <vs
; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2f64(<vscale x 2 x double> [[L]], <vscale x 2 x double> [[R]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
; CHECK-NEXT: ret void
;
- %interleave = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %l, <vscale x 2 x double> %r)
+ %interleave = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %l, <vscale x 2 x double> %r)
store <vscale x 4 x double> %interleave, ptr %ptr, align 4
ret void
}
@@ -153,7 +153,7 @@ define void @interleave_nxptr_factor2(ptr %ptr, <vscale x 2 x ptr> %l, <vscale x
; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2p0(<vscale x 2 x ptr> [[L]], <vscale x 2 x ptr> [[R]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
; CHECK-NEXT: ret void
;
- %interleave = tail call <vscale x 4 x ptr> @llvm.experimental.vector.interleave2.nxv4p0(<vscale x 2 x ptr> %l, <vscale x 2 x ptr> %r)
+ %interleave = tail call <vscale x 4 x ptr> @llvm.vector.interleave2.nxv4p0(<vscale x 2 x ptr> %l, <vscale x 2 x ptr> %r)
store <vscale x 4 x ptr> %interleave, ptr %ptr, align 4
ret void
}
@@ -192,7 +192,7 @@ define { <vscale x 16 x i32>, <vscale x 16 x i32> } @deinterleave_wide_nxi32_fac
; CHECK-NEXT: ret { <vscale x 16 x i32>, <vscale x 16 x i32> } [[TMP22]]
;
%load = load <vscale x 32 x i32>, ptr %ptr, align 4
- %deinterleave = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.experimental.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %load)
+ %deinterleave = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %load)
ret { <vscale x 16 x i32>, <vscale x 16 x i32> } %deinterleave
}
@@ -216,7 +216,7 @@ define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_wide_nxdou
; CHECK-NEXT: ret { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP12]]
;
%load = load <vscale x 8 x double>, ptr %ptr, align 8
- %deinterleave = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %load)
+ %deinterleave = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %load)
ret { <vscale x 4 x double>, <vscale x 4 x double> } %deinterleave
}
@@ -233,32 +233,32 @@ define void @interleave_wide_nxdouble_factor2(ptr %ptr, <vscale x 4 x double> %l
; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2f64(<vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP4]])
; CHECK-NEXT: ret void
;
- %interleave = tail call <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double> %l, <vscale x 4 x double> %r)
+ %interleave = tail call <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double> %l, <vscale x 4 x double> %r)
store <vscale x 8 x double> %interleave, ptr %ptr, align 4
ret void
}
-declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
-declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
-declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
-declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
-declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
-declare { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.experimental.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr>)
+declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr>)
; Larger deinterleaves to test 'legalization'
-declare { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.experimental.vector.deinterleave2.nxv32i32(<vscale x 32 x i32>)
-declare { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double>)
+declare { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32>)
+declare { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double>)
-declare <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
-declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
-declare <vscale x 4 x ptr> @llvm.experimental.vector.interleave2.nxv4p0(<vscale x 2 x ptr>, <vscale x 2 x ptr>)
+declare <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x ptr> @llvm.vector.interleave2.nxv4p0(<vscale x 2 x ptr>, <vscale x 2 x ptr>)
; Larger interleaves to test 'legalization'
-declare <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
+declare <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
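A sketch of the input pattern these tests exercise (hypothetical function name, assumed +sve target): the InterleavedAccess pass matches an interleave2 feeding a store and, per the CHECK lines above, rewrites the pair into a single predicated @llvm.aarch64.sve.st2.* structured store.

define void @st2_lowering_example(ptr %p, <vscale x 16 x i8> %l, <vscale x 16 x i8> %r) #0 {
  ; Input pattern: interleave two registers, store the double-width result.
  ; Per the CHECK lines above, on +sve this pair becomes a single
  ; call to @llvm.aarch64.sve.st2.nxv16i8 with an all-true predicate.
  %il = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %l, <vscale x 16 x i8> %r)
  store <vscale x 32 x i8> %il, ptr %p, align 1
  ret void
}
declare <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)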
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
index 45e2c36836ff..73f26814f3a4 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
@@ -511,7 +511,7 @@ define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_nxptr_fact
; CHECK-NEXT: ret { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP12]]
;
%wide.vec = load <vscale x 8 x double>, ptr %ptr, align 8
- %ldN = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %wide.vec)
+ %ldN = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %wide.vec)
ret { <vscale x 4 x double>, <vscale x 4 x double> } %ldN
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index a78696cadaaf..14b5ee244080 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -763,9 +763,9 @@ define void @latch_branch_cost(ptr %dst) {
; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
; PRED: vector.body:
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
-; PRED-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], <i64 99, i64 99, i64 99, i64 99>
-; PRED-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
+; PRED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], <i64 99, i64 99, i64 99, i64 99, i64 99, i64 99, i64 99, i64 99>
+; PRED-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
; PRED-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; PRED: pred.store.if:
; PRED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
@@ -773,7 +773,7 @@ define void @latch_branch_cost(ptr %dst) {
; PRED-NEXT: store i8 0, ptr [[TMP3]], align 1
; PRED-NEXT: br label [[PRED_STORE_CONTINUE]]
; PRED: pred.store.continue:
-; PRED-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
+; PRED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1
; PRED-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
; PRED: pred.store.if1:
; PRED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1
@@ -781,7 +781,7 @@ define void @latch_branch_cost(ptr %dst) {
; PRED-NEXT: store i8 0, ptr [[TMP6]], align 1
; PRED-NEXT: br label [[PRED_STORE_CONTINUE2]]
; PRED: pred.store.continue2:
-; PRED-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
+; PRED-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2
; PRED-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
; PRED: pred.store.if3:
; PRED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2
@@ -789,28 +789,60 @@ define void @latch_branch_cost(ptr %dst) {
; PRED-NEXT: store i8 0, ptr [[TMP9]], align 1
; PRED-NEXT: br label [[PRED_STORE_CONTINUE4]]
; PRED: pred.store.continue4:
-; PRED-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
-; PRED-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
+; PRED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3
+; PRED-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
; PRED: pred.store.if5:
; PRED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 3
; PRED-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
; PRED-NEXT: store i8 0, ptr [[TMP12]], align 1
-; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]]
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE7]]
; PRED: pred.store.continue6:
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; PRED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; PRED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; PRED-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4
+; PRED-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
+; PRED: pred.store.if7:
+; PRED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 4
+; PRED-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP14]]
+; PRED-NEXT: store i8 0, ptr [[TMP15]], align 1
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE8]]
+; PRED: pred.store.continue8:
+; PRED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5
+; PRED-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; PRED: pred.store.if9:
+; PRED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 5
+; PRED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
+; PRED-NEXT: store i8 0, ptr [[TMP18]], align 1
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE10]]
+; PRED: pred.store.continue10:
+; PRED-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6
+; PRED-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; PRED: pred.store.if11:
+; PRED-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 6
+; PRED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP20]]
+; PRED-NEXT: store i8 0, ptr [[TMP21]], align 1
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE12]]
+; PRED: pred.store.continue12:
+; PRED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7
+; PRED-NEXT: br i1 [[TMP22]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE6]]
+; PRED: pred.store.if13:
+; PRED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 7
+; PRED-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP23]]
+; PRED-NEXT: store i8 0, ptr [[TMP24]], align 1
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]]
+; PRED: pred.store.continue14:
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+; PRED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 104
+; PRED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; PRED: middle.block:
; PRED-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; PRED: scalar.ph:
-; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 104, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; PRED-NEXT: br label [[FOR_BODY:%.*]]
; PRED: loop:
-; PRED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; PRED-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDVARS_IV]]
-; PRED-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
-; PRED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; PRED-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; PRED-NEXT: store i8 0, ptr [[GEP]], align 1
+; PRED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[IV]], 1
; PRED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
; PRED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; PRED: exit:
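Note: the @latch_branch_cost updates above appear to reflect the predicated plan widening from VF 4 to VF 8: the lane mask becomes <8 x i1>, the induction step becomes 8, and the vector trip count is the original count of 100 rounded up to the next multiple of the VF:

    n.vec = ceil(100 / 8) * 8 = 13 * 8 = 104

hence the compare against 104 and the resume value changing from 100 to 104; the four extra lanes in the last iteration are simply disabled by the mask.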
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
index 2be525a2abc0..2cc0aa2ffca5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
@@ -22,8 +22,8 @@ define i32 @PR33613(ptr %b, double %j, i32 %d) #0 {
; CHECK-VF4UF2-LABEL: @PR33613
; CHECK-VF4UF2: vector.body
; CHECK-VF4UF2: %[[VEC_RECUR:.*]] = phi <vscale x 4 x double> [ {{.*}}, %vector.ph ], [ {{.*}}, %vector.body ]
-; CHECK-VF4UF2: %[[SPLICE1:.*]] = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> %[[VEC_RECUR]], <vscale x 4 x double> {{.*}}, i32 -1)
-; CHECK-VF4UF2-NEXT: %[[SPLICE2:.*]] = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> %{{.*}}, <vscale x 4 x double> %{{.*}}, i32 -1)
+; CHECK-VF4UF2: %[[SPLICE1:.*]] = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> %[[VEC_RECUR]], <vscale x 4 x double> {{.*}}, i32 -1)
+; CHECK-VF4UF2-NEXT: %[[SPLICE2:.*]] = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> %{{.*}}, <vscale x 4 x double> %{{.*}}, i32 -1)
; CHECK-VF4UF2-NOT: insertelement <vscale x 4 x double>
; CHECK-VF4UF2: middle.block
entry:
@@ -71,7 +71,7 @@ define void @PR34711(ptr %a, ptr %b, ptr %c, i64 %n) #0 {
; CHECK-VF4UF1: vector.body
; CHECK-VF4UF1: %[[VEC_RECUR:.*]] = phi <vscale x 4 x i16> [ %vector.recur.init, %vector.ph ], [ %[[MGATHER:.*]], %vector.body ]
; CHECK-VF4UF1: %[[MGATHER]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> {{.*}}, i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison)
-; CHECK-VF4UF1-NEXT: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> %[[VEC_RECUR]], <vscale x 4 x i16> %[[MGATHER]], i32 -1)
+; CHECK-VF4UF1-NEXT: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %[[VEC_RECUR]], <vscale x 4 x i16> %[[MGATHER]], i32 -1)
; CHECK-VF4UF1-NEXT: %[[SXT1:.*]] = sext <vscale x 4 x i16> %[[SPLICE]] to <vscale x 4 x i32>
; CHECK-VF4UF1-NEXT: %[[SXT2:.*]] = sext <vscale x 4 x i16> %[[MGATHER]] to <vscale x 4 x i32>
; CHECK-VF4UF1-NEXT: mul nsw <vscale x 4 x i32> %[[SXT2]], %[[SXT1]]
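Note: the only change in these recurrence tests is the intrinsic moving out of the experimental namespace; the semantics are unchanged. As a reminder of what the checks rely on, a fixed-width sketch (lane values chosen purely for illustration):

    ; %a = <A0, A1, A2, A3>, %b = <B0, B1, B2, B3>
    %r = call <4 x i32> @llvm.vector.splice.v4i32(<4 x i32> %a, <4 x i32> %b, i32 -1)
    ; %r = <A3, B0, B1, B2> -- trailing element of %a, then leading elements of %b

With imm = -1 the splice yields the previous iteration's last element followed by the current vector, which is exactly the shuffle a first-order recurrence needs.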
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index af5decb0d340..c85ae6dba73e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -856,16 +856,67 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) {
; PRED-LABEL: define void @exit_cond_zext_iv(
; PRED-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
; PRED-NEXT: entry:
+; PRED-NEXT: [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; PRED: vector.scevcheck:
+; PRED-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; PRED-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1
+; PRED-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
+; PRED-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP0]] to i32
+; PRED-NEXT: [[TMP3:%.*]] = add i32 1, [[TMP2]]
+; PRED-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 1
+; PRED-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
+; PRED-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
+; PRED-NEXT: br i1 [[TMP6]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED: vector.ph:
+; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX1]], 1
+; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
+; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
+; PRED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX1]], 1
+; PRED-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT3]], <2 x i64> poison, <2 x i32> zeroinitializer
; PRED-NEXT: br label [[LOOP:%.*]]
-; PRED: loop:
-; PRED-NEXT: [[IV_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
-; PRED-NEXT: [[IV_CONV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_EXT:%.*]], [[LOOP]] ]
+; PRED: vector.body:
+; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; PRED-NEXT: [[VEC_IV:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1>
+; PRED-NEXT: [[TMP7:%.*]] = icmp ule <2 x i64> [[VEC_IV]], [[BROADCAST_SPLAT4]]
+; PRED-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; PRED-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; PRED: pred.store.if:
+; PRED-NEXT: [[IV_CONV:%.*]] = add i64 [[INDEX]], 0
; PRED-NEXT: [[GEP:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[IV_CONV]], i32 2
; PRED-NEXT: store i32 0, ptr [[GEP]], align 8
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE]]
+; PRED: pred.store.continue:
+; PRED-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; PRED-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
+; PRED: pred.store.if5:
+; PRED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 1
+; PRED-NEXT: [[TMP13:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP12]], i32 2
+; PRED-NEXT: store i32 0, ptr [[TMP13]], align 8
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]]
+; PRED: pred.store.continue6:
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; PRED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; PRED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
+; PRED: middle.block:
+; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED: scalar.ph:
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; PRED-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; PRED-NEXT: br label [[LOOP1:%.*]]
+; PRED: loop:
+; PRED-NEXT: [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP1]] ]
+; PRED-NEXT: [[IV_CONV1:%.*]] = phi i64 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[IV_EXT:%.*]], [[LOOP1]] ]
+; PRED-NEXT: [[GEP1:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[IV_CONV1]], i32 2
+; PRED-NEXT: store i32 0, ptr [[GEP1]], align 8
; PRED-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1
; PRED-NEXT: [[IV_EXT]] = zext i32 [[IV_1_NEXT]] to i64
; PRED-NEXT: [[C:%.*]] = icmp ult i64 [[IV_EXT]], [[N]]
-; PRED-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT:%.*]]
+; PRED-NEXT: br i1 [[C]], label [[LOOP1]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]]
; PRED: exit:
; PRED-NEXT: ret void
;
@@ -913,4 +964,6 @@ attributes #0 = { "target-features"="+sve" }
; PRED: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
; PRED: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
; PRED: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]}
+; PRED: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; PRED: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]}
;.
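Note: @exit_cond_zext_iv is now vectorized (VF 2, predicated) behind a vector.scevcheck block. The check compares the backedge-taken count (umax(N, 1) - 1) against 4294967295, i.e. 2^32 - 1, falling back to the scalar loop whenever the zext'd i32 induction variable could wrap.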
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
index 75250df79d17..c24c1a38177d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -50,10 +50,10 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
; DEFAULT-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 4
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP18]], i64 0
; DEFAULT-NEXT: [[BROADCAST_SPLAT5]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT4]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP19:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[BROADCAST_SPLAT]], i32 -1)
-; DEFAULT-NEXT: [[TMP20]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x i32> [[BROADCAST_SPLAT5]], i32 -1)
-; DEFAULT-NEXT: [[TMP21:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR2]], <vscale x 4 x i32> [[TMP19]], i32 -1)
-; DEFAULT-NEXT: [[TMP22:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> [[TMP19]], <vscale x 4 x i32> [[TMP20]], i32 -1)
+; DEFAULT-NEXT: [[TMP19:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[BROADCAST_SPLAT]], i32 -1)
+; DEFAULT-NEXT: [[TMP20]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x i32> [[BROADCAST_SPLAT5]], i32 -1)
+; DEFAULT-NEXT: [[TMP21:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR2]], <vscale x 4 x i32> [[TMP19]], i32 -1)
+; DEFAULT-NEXT: [[TMP22:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[TMP19]], <vscale x 4 x i32> [[TMP20]], i32 -1)
; DEFAULT-NEXT: [[TMP23:%.*]] = or <vscale x 4 x i32> [[TMP21]], [[BROADCAST_SPLAT7]]
; DEFAULT-NEXT: [[TMP24:%.*]] = or <vscale x 4 x i32> [[TMP22]], [[BROADCAST_SPLAT7]]
; DEFAULT-NEXT: [[TMP25:%.*]] = lshr <vscale x 4 x i32> [[BROADCAST_SPLAT7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
@@ -197,8 +197,8 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
; PRED-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP21]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; PRED-NEXT: [[TMP22]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[BROADCAST_SPLAT]], i32 -1)
-; PRED-NEXT: [[TMP23:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR2]], <vscale x 4 x i32> [[TMP22]], i32 -1)
+; PRED-NEXT: [[TMP22]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[BROADCAST_SPLAT]], i32 -1)
+; PRED-NEXT: [[TMP23:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR2]], <vscale x 4 x i32> [[TMP22]], i32 -1)
; PRED-NEXT: [[TMP24:%.*]] = or <vscale x 4 x i32> [[TMP23]], [[BROADCAST_SPLAT4]]
; PRED-NEXT: [[TMP25:%.*]] = lshr <vscale x 4 x i32> [[BROADCAST_SPLAT4]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; PRED-NEXT: [[TMP26:%.*]] = shl <vscale x 4 x i32> [[TMP24]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index ad6e8534f318..ddc004657ed5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -588,7 +588,7 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]]
; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0
; CHECK-UNORDERED-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x float>, ptr [[TMP11]], align 4
-; CHECK-UNORDERED-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_VEC]])
+; CHECK-UNORDERED-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_VEC]])
; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
; CHECK-UNORDERED-NEXT: [[TMP14]] = fadd <vscale x 4 x float> [[TMP12]], [[VEC_PHI1]]
@@ -658,7 +658,7 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]]
; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
; CHECK-ORDERED-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x float>, ptr [[TMP9]], align 4
-; CHECK-ORDERED-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_VEC]])
+; CHECK-ORDERED-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_VEC]])
; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
; CHECK-ORDERED-NEXT: [[TMP12]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP11]])
@@ -733,9 +733,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]]
; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
-; CHECK-ORDERED-TF-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.experimental.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-ORDERED-TF-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP15]], i32 4, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_MASKED_VEC]])
+; CHECK-ORDERED-TF-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_MASKED_VEC]])
; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[TMP17]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
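Note: interleave2/deinterleave2 likewise only lose the experimental prefix. Their contract, sketched with assumed operands:

    ; interleave2 alternates lanes: <a0, b0, a1, b1, ...>
    %v = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
    ; deinterleave2 inverts it, returning { even lanes, odd lanes } = { %a, %b }
    %p = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %v)

This is why the interleaved-access checks extract field 0 for the even-indexed stream and field 1 for the odd-indexed one.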
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index c07b3c8d4922..1853e551806b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -38,7 +38,7 @@ define void @test_array_load2_store2(i32 %C, i32 %D) #1 {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP2]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
; CHECK-NEXT: [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
@@ -46,7 +46,7 @@ define void @test_array_load2_store2(i32 %C, i32 %D) #1 {
; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <vscale x 4 x i32> [[TMP4]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP5]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 -4
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
@@ -134,7 +134,7 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 {
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i64> [[TMP7]], i64 0
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP13]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -4
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP12]])
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP12]])
; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP15]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
@@ -216,7 +216,7 @@ define void @test_array_load2_store2_i16(i32 noundef %C, i32 noundef %D) #1 {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP6]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
; CHECK-NEXT: [[TMP9:%.*]] = or disjoint <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
@@ -401,11 +401,11 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP8]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP9]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP10]])
+; CHECK-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP10]])
; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP11]])
+; CHECK-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP11]])
; CHECK-NEXT: [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 1
@@ -414,9 +414,9 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[TMP17:%.*]] = sub nsw i32 1, [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]]
-; CHECK-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP12]])
-; CHECK-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP13]])
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[REVERSE2]], <vscale x 4 x i32> [[REVERSE3]])
+; CHECK-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP12]])
+; CHECK-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP13]])
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[REVERSE2]], <vscale x 4 x i32> [[REVERSE3]])
; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP19]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
@@ -483,7 +483,7 @@ define void @even_load_static_tc(ptr noalias nocapture readonly %A, ptr noalias
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP4]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP6:%.*]] = shl nsw <vscale x 4 x i32> [[TMP5]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[INDEX]], 9223372036854775804
@@ -569,7 +569,7 @@ define void @even_load_dynamic_tc(ptr noalias nocapture readonly %A, ptr noalias
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP12]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP14:%.*]] = shl nsw <vscale x 4 x i32> [[TMP13]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP15:%.*]] = and i64 [[INDEX]], 9223372036854775804
@@ -717,18 +717,18 @@ define void @mixed_load2_store2(ptr noalias nocapture readonly %A, ptr noalias n
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP2]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
; CHECK-NEXT: [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <vscale x 4 x i32> [[TMP4]], [[TMP3]]
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
; CHECK-NEXT: [[TMP9:%.*]] = add nsw <vscale x 4 x i32> [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 -4
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP9]])
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP9]])
; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP11]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
@@ -811,7 +811,7 @@ define void @int_float_struct(ptr nocapture readonly %p) #0 {
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> zeroinitializer, i32 undef, i32 0), [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP2]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <vscale x 4 x i32> [[TMP4]] to <vscale x 4 x float>
@@ -910,7 +910,7 @@ define void @PR27626_0(ptr %p, i32 %z, i64 %n) #1 {
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP12]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x ptr> [[TMP12]], i64 0
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP14]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP15]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
@@ -994,12 +994,12 @@ define i32 @PR27626_1(ptr %p, i64 %n) #1 {
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], <vscale x 4 x i64> [[VEC_IND]], i32 1
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP12]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP14]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x ptr> [[TMP13]], i64 0
; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <vscale x 8 x i32>, ptr [[TMP15]], align 4
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC1]])
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC1]])
; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
; CHECK-NEXT: [[TMP17]] = add <vscale x 4 x i32> [[TMP16]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
@@ -1092,7 +1092,7 @@ define void @PR27626_2(ptr %p, i64 %n, i32 %z) #1 {
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], <vscale x 4 x i64> [[VEC_IND]], i32 1
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP12]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP13]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP15]], <vscale x 4 x ptr> [[TMP14]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
@@ -1181,11 +1181,11 @@ define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) #1 {
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], <vscale x 4 x i64> [[TMP12]], i32 1
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP13]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP16]], <vscale x 4 x ptr> [[TMP15]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <vscale x 8 x i32>, ptr [[TMP14]], align 4
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC1]])
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC1]])
; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
; CHECK-NEXT: [[TMP18]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
@@ -1291,7 +1291,7 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]]
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -4
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[BROADCAST_SPLAT2]], <vscale x 4 x i32> [[BROADCAST_SPLAT4]])
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[BROADCAST_SPLAT2]], <vscale x 4 x i32> [[BROADCAST_SPLAT4]])
; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP15]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
@@ -1497,7 +1497,7 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 {
; CHECK-NEXT: [[TMP21:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[A]], <vscale x 4 x i64> [[TMP19]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER4]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP22]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison), !alias.scope [[META34]]
-; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_MASKED_GATHER4]], i32 -1)
+; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_MASKED_GATHER4]], i32 -1)
; CHECK-NEXT: [[TMP24:%.*]] = sext <vscale x 4 x i16> [[TMP23]] to <vscale x 4 x i32>
; CHECK-NEXT: [[TMP25:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER4]] to <vscale x 4 x i32>
; CHECK-NEXT: [[TMP26:%.*]] = mul nsw <vscale x 4 x i32> [[TMP24]], [[TMP21]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
index 3ba91360850e..726d98f4d37d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
@@ -52,9 +52,9 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 1
; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]]
-; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP10]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
-; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
+; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint i32 [[TMP8]], 1
@@ -63,8 +63,8 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP13]] to i64
; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 -1
-; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]])
-; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]])
+; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP20]]
; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
@@ -134,9 +134,9 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl i32 [[INDEX]], 1
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> [[TMP10]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> [[TMP10]])
; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP9]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint i32 [[TMP7]], 1
@@ -145,8 +145,8 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP13]] to i64
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 -1
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> [[TMP10]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> [[TMP10]])
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP20]]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
index 1dfa7f8fe18b..cf4d65318b7e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
@@ -178,7 +178,7 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-NOTF-NOT: %{{.*}} = phi <vscale x 4 x i1>
; CHECK-NOTF: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-NOTF: %[[LOAD]] = load <vscale x 4 x i32>
-; CHECK-NOTF: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-NOTF: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
; CHECK-NOTF: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
; CHECK-NOTF: store <vscale x 4 x i32> %[[ADD]]
@@ -191,7 +191,7 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-TF-NORED: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
; CHECK-TF-NORED: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-TF-NORED: %[[LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0({{.*}} %[[ACTIVE_LANE_MASK]]
-; CHECK-TF-NORED: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-TF-NORED: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
; CHECK-TF-NORED: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
; CHECK-TF-NORED: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %[[ADD]], {{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]])
@@ -204,7 +204,7 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-TF-NOREC-NOT: %{{.*}} = phi <vscale x 4 x i1>
; CHECK-TF-NOREC: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-TF-NOREC: %[[LOAD]] = load <vscale x 4 x i32>
-; CHECK-TF-NOREC: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-TF-NOREC: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
; CHECK-TF-NOREC: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
; CHECK-TF-NOREC: store <vscale x 4 x i32> %[[ADD]]
@@ -217,7 +217,7 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-TF-NOREV: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
; CHECK-TF-NOREV: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-TF-NOREV: %[[LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0({{.*}} %[[ACTIVE_LANE_MASK]]
-; CHECK-TF-NOREV: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-TF-NOREV: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
; CHECK-TF-NOREV: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
; CHECK-TF-NOREV: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %[[ADD]], {{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]])
@@ -230,7 +230,7 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
; CHECK-TF: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-TF: %[[LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0({{.*}} %[[ACTIVE_LANE_MASK]]
-; CHECK-TF: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-TF: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
; CHECK-TF: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
; CHECK-TF: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %[[ADD]], {{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]])
@@ -243,7 +243,7 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-TF-ONLYRED-NOT: %{{.*}} = phi <vscale x 4 x i1>
; CHECK-TF-ONLYRED: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-TF-ONLYRED: %[[LOAD]] = load <vscale x 4 x i32>
-; CHECK-TF-ONLYRED: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-TF-ONLYRED: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
; CHECK-TF-ONLYRED: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
; CHECK-TF-ONLYRED: store <vscale x 4 x i32> %[[ADD]]
@@ -256,7 +256,7 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-NEOVERSE-V1-NOT: %{{.*}} = phi <vscale x 4 x i1>
; CHECK-NEOVERSE-V1: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-NEOVERSE-V1: %[[LOAD]] = load <vscale x 4 x i32>
-; CHECK-NEOVERSE-V1: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-NEOVERSE-V1: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
; CHECK-NEOVERSE-V1: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
; CHECK-NEOVERSE-V1: store <vscale x 4 x i32> %[[ADD]]
@@ -350,30 +350,30 @@ define void @reverse(ptr noalias %dst, ptr noalias %src) #0 {
; CHECK-NOTF: vector.body:
; CHECK-NOTF-NOT: %{{.*}} = phi <vscale x 4 x i1>
; CHECK-NOTF: %[[LOAD:.*]] = load <vscale x 2 x double>, ptr
-; CHECK-NOTF: %{{.*}} = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %[[LOAD]])
+; CHECK-NOTF: %{{.*}} = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> %[[LOAD]])
; CHECK-TF-NOREV-LABEL: @reverse(
; CHECK-TF-NOREV: vector.body:
; CHECK-TF-NOREV-NOT: %{{.*}} = phi <vscale x 4 x i1>
; CHECK-TF-NOREV: %[[LOAD:.*]] = load <vscale x 2 x double>, ptr
-; CHECK-TF-NOREV: %{{.*}} = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %[[LOAD]])
+; CHECK-TF-NOREV: %{{.*}} = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> %[[LOAD]])
; CHECK-TF-LABEL: @reverse(
; CHECK-TF: vector.body:
; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 2 x i1>
-; CHECK-TF: %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
+; CHECK-TF: %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
; CHECK-TF: %[[MASKED_LOAD:.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0({{.*}} <vscale x 2 x i1> %reverse
; CHECK-TF-NORED-LABEL: @reverse(
; CHECK-TF-NORED: vector.body:
; CHECK-TF-NORED: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 2 x i1>
-; CHECK-TF-NORED: %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
+; CHECK-TF-NORED: %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
; CHECK-TF-NORED: %[[MASKED_LOAD:.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0({{.*}} <vscale x 2 x i1> %reverse
; CHECK-TF-NOREC-LABEL: @reverse(
; CHECK-TF-NOREC: vector.body:
; CHECK-TF-NOREC: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 2 x i1>
-; CHECK-TF-NOREC: %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
+; CHECK-TF-NOREC: %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
; CHECK-TF-NOREC: %[[MASKED_LOAD:.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0({{.*}} <vscale x 2 x i1> %reverse
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
index 70833e44b075..9485d827ced4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
@@ -1,5 +1,5 @@
; This is the loop in C++ being vectorized in this file with
-; experimental.vector.reverse
+; vector.reverse
;#pragma clang loop vectorize_width(4, scalable)
; for (long int i = N - 1; i >= 0; i--)
@@ -18,12 +18,12 @@ target triple = "aarch64-unknown-linux-gnu"
define void @vector_reverse_mask_nxv4i1(ptr %a, ptr %cond, i64 %N) #0 {
; CHECK-LABEL: vector.body:
-; CHECK: %[[REVERSE6:.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
+; CHECK: %[[REVERSE6:.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
; CHECK: %[[WIDEMSKLOAD:.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE6]], <vscale x 4 x double> poison)
-; CHECK: %[[REVERSE7:.*]] = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> %[[WIDEMSKLOAD]])
+; CHECK: %[[REVERSE7:.*]] = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %[[WIDEMSKLOAD]])
; CHECK: %[[FADD:.*]] = fadd <vscale x 4 x double> %[[REVERSE7]]
-; CHECK: %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
-; CHECK: %[[REVERSE8:.*]] = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> %[[FADD]])
+; CHECK: %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
+; CHECK: %[[REVERSE8:.*]] = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %[[FADD]])
; CHECK: call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> %[[REVERSE8]], ptr %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE9]]
entry:
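Note: vector.reverse gets the same rename. For reference:

    %r = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %m)
    ; lane i of %r is lane (vscale*4 - 1 - i) of %m

Reversed loops apply it to both the loaded data and the lane mask, which is why the masked load above takes %[[REVERSE6]] rather than the active-lane mask directly.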
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
index e35a4db36905..e3bba1338e1d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; This is the loop in c++ being vectorized in this file with
-;experimental.vector.reverse
+;vector.reverse
; #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
; for (int i = N-1; i >= 0; --i)
; a[i] = b[i] + 1.0;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
index 126ceac7325a..61105e51cb94 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
@@ -40,10 +40,10 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i64 [[TMP7]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP8]], align 4
; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <vscale x 8 x i32>, ptr [[TMP10]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC2]])
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC2]])
; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC3]], 0
; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC3]], 1
; CHECK-NEXT: [[TMP15:%.*]] = add nsw <vscale x 4 x i32> [[TMP11]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
index 5c1966fa7a2d..0f524561eadc 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
@@ -107,9 +107,10 @@ while.body:
%1 = load i8, ptr %b.addr.07, align 1
%add = add i8 %1, %0
%incdec.ptr4 = getelementptr inbounds i8, ptr %c.addr.08, i32 1
- store i8 %add, ptr %c.addr.08, align 1
%cmp = icmp sgt i32 %N.addr.09, 1
%select = select i1 %cmp, i8 %0, i8 %1
+ %add2 = add i8 %add, %select
+ store i8 %add2, ptr %c.addr.08, align 1
br i1 %cmp, label %while.body, label %while.end.loopexit
while.end.loopexit:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 72d9691b2bb8..c3374fceb1fb 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; This is the loop in c++ being vectorized in this file with
-;experimental.vector.reverse
+;vector.reverse
; #pragma clang loop vectorize_width(4, scalable)
; for (int i = N-1; i >= 0; --i)
; a[i] = b[i] + 1.0;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index f2222e0a1f93..0dee4a9b8585 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -46,9 +46,9 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
; IF-EVL-NEXT: [[TMP16:%.*]] = sub i64 1, [[TMP14]]
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP15]]
; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]]
-; IF-EVL-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; IF-EVL-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 4, <vscale x 4 x i1> [[REVERSE]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]])
+; IF-EVL-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]])
; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
; IF-EVL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4
@@ -56,8 +56,8 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP21]]
; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP22]]
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]]
-; IF-EVL-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
-; IF-EVL-NEXT: [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[REVERSE3]])
+; IF-EVL-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; IF-EVL-NEXT: [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[REVERSE3]])
; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[REVERSE5]], ptr [[TMP25]], i32 4, <vscale x 4 x i1> [[REVERSE4]])
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
; IF-EVL-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
index 3be31c011eaa..d64755999635 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
@@ -21,7 +21,7 @@ define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) {
; CHECK-VF4UF1: %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %[[NEXT_IDX:.*]], %vector.body ]
; CHECK-VF4UF1: %[[VEC_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[VEC_RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-VF4UF1: %[[LOAD]] = load <vscale x 4 x i32>, ptr
-; CHECK-VF4UF1: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VEC_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-VF4UF1: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VEC_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
; CHECK-VF4UF1: middle.block:
; CHECK-VF4UF1: %[[VSCALE2:.*]] = call i32 @llvm.vscale.i32()
; CHECK-VF4UF1: %[[MUL2:.*]] = mul i32 %[[VSCALE2]], 4
@@ -70,7 +70,7 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
; CHECK-VF4UF1: vector.body:
; CHECK-VF4UF1: %[[VEC_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[VEC_RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-VF4UF1: %[[LOAD]] = load <vscale x 4 x i32>, ptr
-; CHECK-VF4UF1: %[[REVERSE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VEC_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-VF4UF1: %[[REVERSE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VEC_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
; CHECK-VF4UF1: middle.block:
; CHECK-VF4UF1: %[[VSCALE2:.*]] = call i32 @llvm.vscale.i32()
; CHECK-VF4UF1: %[[MUL2:.*]] = mul i32 %[[VSCALE2]], 4
@@ -119,7 +119,7 @@ define void @recurrence_3(ptr nocapture readonly %a, ptr nocapture %b, i32 %n, f
; CHECK-VF4UF1: vector.body:
; CHECK-VF4UF1: %vector.recur = phi <vscale x 4 x i16> [ %vector.recur.init, %vector.ph ], [ %[[L1:.*]], %vector.body ]
; CHECK-VF4UF1: %[[L1]] = load <vscale x 4 x i16>, ptr
-; CHECK-VF4UF1: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> %vector.recur, <vscale x 4 x i16> %[[L1]], i32 -1)
+; CHECK-VF4UF1: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %vector.recur, <vscale x 4 x i16> %[[L1]], i32 -1)
; Check also that the casts were not moved needlessly.
; CHECK-VF4UF1: sitofp <vscale x 4 x i16> %[[L1]] to <vscale x 4 x double>
; CHECK-VF4UF1: sitofp <vscale x 4 x i16> %[[SPLICE]] to <vscale x 4 x double>
@@ -169,8 +169,8 @@ define i64 @constant_folded_previous_value() {
; CHECK-VF4UF2-LABEL: @constant_folded_previous_value
; CHECK-VF4UF2: vector.body
; CHECK-VF4UF2: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i64> [ %vector.recur.init, %vector.ph ], [ shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), %vector.body ]
-; CHECK-VF4UF2: %[[SPLICE1:.*]] = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> %vector.recur, <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), i32 -1)
-; CHECK-VF4UF2: %[[SPLICE2:.*]] = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), i32 -1)
+; CHECK-VF4UF2: %[[SPLICE1:.*]] = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> %vector.recur, <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), i32 -1)
+; CHECK-VF4UF2: %[[SPLICE2:.*]] = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), i32 -1)
; CHECK-VF4UF2: br i1 {{.*}}, label %middle.block, label %vector.body
entry:
br label %scalar.body
@@ -242,7 +242,7 @@ define void @sink_after(ptr %a, ptr %b, i64 %n) {
; CHECK-VF4UF1: vector.body
; CHECK-VF4UF1: %[[VEC_RECUR:.*]] = phi <vscale x 4 x i16> [ %vector.recur.init, %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-VF4UF1: %[[LOAD]] = load <vscale x 4 x i16>, ptr
-; CHECK-VF4UF1-NEXT: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> %[[VEC_RECUR]], <vscale x 4 x i16> %[[LOAD]], i32 -1)
+; CHECK-VF4UF1-NEXT: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %[[VEC_RECUR]], <vscale x 4 x i16> %[[LOAD]], i32 -1)
; CHECK-VF4UF1-NEXT: sext <vscale x 4 x i16> %[[SPLICE]] to <vscale x 4 x i32>
; CHECK-VF4UF1-NEXT: sext <vscale x 4 x i16> %[[LOAD]] to <vscale x 4 x i32>
entry:
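The splice calls with index -1 in these recurrence tests are how the vectorizer materializes the value from the previous iteration: the two operands are concatenated and the extraction window starts one lane before the end of the first operand. A fixed-width sketch of the semantics (hypothetical, not from the patch):

; splice(<a0,a1,a2,a3>, <b0,b1,b2,b3>, -1) == <a3, b0, b1, b2>
declare <4 x i32> @llvm.vector.splice.v4i32(<4 x i32>, <4 x i32>, i32)

define <4 x i32> @previous_values(<4 x i32> %recur, <4 x i32> %cur) {
  %p = call <4 x i32> @llvm.vector.splice.v4i32(<4 x i32> %recur, <4 x i32> %cur, i32 -1)
  ret <4 x i32> %p
}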
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
index 690772472975..3771ec4bda88 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
@@ -5,12 +5,7 @@ define void @h() {
; CHECK-LABEL: define void @h() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 0, i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = trunc <8 x i32> [[TMP0]] to <8 x i1>
-; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i1> zeroinitializer, [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i1> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i1> [[TMP4]] to <8 x i16>
-; CHECK-NEXT: store <8 x i16> [[TMP3]], ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr [[ARRAYIDX2]], align 2
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll
index d51ef0bce3a4..76bb882171b1 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll
@@ -5,7 +5,8 @@ define void @h() {
; CHECK-LABEL: define void @h() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
-; CHECK-NEXT: [[TMP0:%.*]] = trunc <8 x i32> zeroinitializer to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 0 to i1
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 poison, i1 false, i1 false, i1 false>, i1 [[TMP6]], i32 4
; CHECK-NEXT: [[TMP1:%.*]] = sub <8 x i1> [[TMP0]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = add <8 x i1> [[TMP0]], zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll
index 6404cf4a2cd1..2ab6e919c23b 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll
@@ -5,7 +5,12 @@ define void @h() {
; CHECK-LABEL: define void @h() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
-; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 0 to i1
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 poison, i1 false, i1 false, i1 false>, i1 [[TMP0]], i32 4
+; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i1> zeroinitializer, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i1> zeroinitializer, [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i1> [[TMP3]] to <8 x i16>
+; CHECK-NEXT: store <8 x i16> [[TMP4]], ptr [[ARRAYIDX2]], align 2
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/minbw-with-and-and-scalar-trunc.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/minbw-with-and-and-scalar-trunc.ll
new file mode 100644
index 000000000000..fc977585614b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/minbw-with-and-and-scalar-trunc.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mattr=+v < %s | FileCheck %s
+@c = global [12 x i64] zeroinitializer
+
+; FIXME: after minbitwidth analysis and the i32 conversion, 65535 is transformed
+; to and <4 x i16>, -1, which must be dropped.
+; FIXME: the cost of the final transformation needs adjusting, since the user is
+; just a trunc to i16 (it should be free).
+define i16 @test() {
+; CHECK-LABEL: define i16 @test(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr align 8 @c, i64 24, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
+; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[TMP0]] to <4 x i16>
+; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i16> [[TMP1]], <i16 -1, i16 -1, i16 -1, i16 -1>
+; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i16> [[TMP2]], <i16 -1, i16 -1, i16 -1, i16 -1>
+; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT: [[T:%.*]] = trunc i32 [[TMP5]] to i16
+; CHECK-NEXT: ret i16 [[T]]
+;
+entry:
+ %0 = load i64, ptr @c, align 8
+ %conv = trunc i64 %0 to i32
+ %conv3 = and i32 %conv, 65535
+ %conv4 = xor i32 %conv3, 65535
+ %1 = load i64, ptr getelementptr inbounds ([12 x i64], ptr @c, i64 0, i64 3), align 8
+ %conv.1 = trunc i64 %1 to i32
+ %conv3.1 = and i32 %conv.1, 65535
+ %conv4.1 = xor i32 %conv3.1, 65535
+ %.conv4.1 = tail call i32 @llvm.umax.i32(i32 %conv4, i32 %conv4.1)
+ %2 = load i64, ptr getelementptr inbounds ([12 x i64], ptr @c, i64 0, i64 6), align 8
+ %conv.2 = trunc i64 %2 to i32
+ %conv3.2 = and i32 %conv.2, 65535
+ %conv4.2 = xor i32 %conv3.2, 65535
+ %.conv4.2 = tail call i32 @llvm.umax.i32(i32 %.conv4.1, i32 %conv4.2)
+ %3 = load i64, ptr getelementptr inbounds ([12 x i64], ptr @c, i64 0, i64 9), align 8
+ %conv.3 = trunc i64 %3 to i32
+ %conv3.3 = and i32 %conv.3, 65535
+ %conv4.3 = xor i32 %conv3.3, 65535
+ %.conv4.3 = tail call i32 @llvm.umax.i32(i32 %.conv4.2, i32 %conv4.3)
+ %t = trunc i32 %.conv4.3 to i16
+ ret i16 %t
+}
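The first FIXME follows from plain bit arithmetic: 65535 is 0xFFFF, the all-ones value in 16 bits, so once minbitwidth analysis narrows the chain to i16 the mask degenerates to an and with -1 and contributes nothing. A reduced sketch of the narrowed form (hypothetical, not from the patch):

define i16 @narrowed(i64 %v) {
  %t = trunc i64 %v to i16
  %m = and i16 %t, -1        ; redundant: -1 is all ones in i16
  %x = xor i16 %m, -1
  ret i16 %x
}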
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll
new file mode 100644
index 000000000000..0dfa45da9d87
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=riscv64-unknown-linux -mattr=+v | FileCheck %s
+
+define void @store_reverse(ptr %p3) {
+; CHECK-LABEL: @store_reverse(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[P3:%.*]], align 8
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 7
+; CHECK-NEXT: store i64 [[SHL]], ptr [[ARRAYIDX2]], align 8
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 9
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[ARRAYIDX4]], align 8
+; CHECK-NEXT: [[SHL5:%.*]] = shl i64 [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 6
+; CHECK-NEXT: store i64 [[SHL5]], ptr [[ARRAYIDX6]], align 8
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[ARRAYIDX7]], align 8
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 10
+; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX8]], align 8
+; CHECK-NEXT: [[SHL9:%.*]] = shl i64 [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 5
+; CHECK-NEXT: store i64 [[SHL9]], ptr [[ARRAYIDX10]], align 8
+; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 3
+; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[ARRAYIDX11]], align 8
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 11
+; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[ARRAYIDX12]], align 8
+; CHECK-NEXT: [[SHL13:%.*]] = shl i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 4
+; CHECK-NEXT: store i64 [[SHL13]], ptr [[ARRAYIDX14]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load i64, ptr %p3, align 8
+ %arrayidx1 = getelementptr inbounds i64, ptr %p3, i64 8
+ %1 = load i64, ptr %arrayidx1, align 8
+ %shl = shl i64 %0, %1
+ %arrayidx2 = getelementptr inbounds i64, ptr %p3, i64 7
+ store i64 %shl, ptr %arrayidx2, align 8
+ %arrayidx3 = getelementptr inbounds i64, ptr %p3, i64 1
+ %2 = load i64, ptr %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds i64, ptr %p3, i64 9
+ %3 = load i64, ptr %arrayidx4, align 8
+ %shl5 = shl i64 %2, %3
+ %arrayidx6 = getelementptr inbounds i64, ptr %p3, i64 6
+ store i64 %shl5, ptr %arrayidx6, align 8
+ %arrayidx7 = getelementptr inbounds i64, ptr %p3, i64 2
+ %4 = load i64, ptr %arrayidx7, align 8
+ %arrayidx8 = getelementptr inbounds i64, ptr %p3, i64 10
+ %5 = load i64, ptr %arrayidx8, align 8
+ %shl9 = shl i64 %4, %5
+ %arrayidx10 = getelementptr inbounds i64, ptr %p3, i64 5
+ store i64 %shl9, ptr %arrayidx10, align 8
+ %arrayidx11 = getelementptr inbounds i64, ptr %p3, i64 3
+ %6 = load i64, ptr %arrayidx11, align 8
+ %arrayidx12 = getelementptr inbounds i64, ptr %p3, i64 11
+ %7 = load i64, ptr %arrayidx12, align 8
+ %shl13 = shl i64 %6, %7
+ %arrayidx14 = getelementptr inbounds i64, ptr %p3, i64 4
+ store i64 %shl13, ptr %arrayidx14, align 8
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll
index 7b4e2b0ce911..1bb87bf6205f 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll
@@ -7,9 +7,9 @@ define void @test(ptr %a, i8 %0, i16 %b.promoted.i) {
; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP0]] to i128
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[B_PROMOTED_I]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i128> poison, i128 [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i128> [[TMP5]], <4 x i128> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = trunc <4 x i128> [[TMP6]] to <4 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP2]] to i16
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i16> poison, i16 [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP4]], [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> [[TMP8]])
; CHECK-NEXT: [[TMP11:%.*]] = zext i16 [[TMP9]] to i64
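The new CHECK sequence reflects a cheaper order of operations: the scalar i128 is truncated to i16 once and the narrow value is then broadcast, instead of splatting the wide value and truncating a whole <4 x i128> vector. Splat and trunc commute here, so the result is identical; a reduced sketch of the preferred shape (hypothetical, not from the patch):

define <4 x i16> @trunc_then_splat(i128 %x) {
  %t = trunc i128 %x to i16
  %v = insertelement <4 x i16> poison, i16 %t, i32 0
  %s = shufflevector <4 x i16> %v, <4 x i16> poison, <4 x i32> zeroinitializer
  ret <4 x i16> %s
}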
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll
new file mode 100644
index 000000000000..d80d7b5ecd4e
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mcpu=x86-64-v4 < %s | FileCheck %s
+
+%struct.rect = type { float, float, float, float }
+
+define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: ptr [[I7:%.*]], i32 [[TMP0:%.*]], i1 [[TOBOOL62_NOT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RC21:%.*]] = alloca [0 x [0 x %struct.rect]], i32 0, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[RC21]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x float>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; CHECK-NEXT: [[X1:%.*]] = getelementptr i8, ptr [[RC21]], i64 4
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr [[X1]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[I7]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i32 3
+; CHECK-NEXT: [[TMP12:%.*]] = fcmp olt <4 x float> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = fcmp olt <4 x float> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP5]], <4 x float> zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP12]], <4 x float> zeroinitializer, <4 x float> [[TMP15]]
+; CHECK-NEXT: store <4 x float> [[TMP16]], ptr [[RC21]], align 4
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: entry.if.end72_crit_edge:
+; CHECK-NEXT: br label [[IF_END72:%.*]]
+; CHECK: if.then63:
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: [[TMP17:%.*]] = phi <4 x float> [ poison, [[IF_THEN63:%.*]] ], [ [[TMP16]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP18:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[TMP17]])
+; CHECK-NEXT: [[TMP19:%.*]] = fptosi <4 x float> [[TMP18]] to <4 x i32>
+; CHECK-NEXT: br label [[IF_END72]]
+; CHECK: if.end72:
+; CHECK-NEXT: [[TMP20:%.*]] = phi <4 x i32> [ poison, [[ENTRY_IF_END72_CRIT_EDGE:%.*]] ], [ [[TMP19]], [[IF_END]] ]
+; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP20]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT: br i1 [[TOBOOL62_NOT]], label [[IF_END75:%.*]], label [[IF_THEN74:%.*]]
+; CHECK: if.then74:
+; CHECK-NEXT: br label [[IF_END75]]
+; CHECK: if.end75:
+; CHECK-NEXT: [[TMP22:%.*]] = phi <4 x i32> [ [[TMP20]], [[IF_THEN74]] ], [ [[TMP21]], [[IF_END72]] ]
+; CHECK-NEXT: [[TMP23:%.*]] = or <4 x i32> [[TMP22]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP25:%.*]] = mul <4 x i32> [[TMP23]], [[TMP24]]
+; CHECK-NEXT: [[TMP26:%.*]] = sitofp <4 x i32> [[TMP25]] to <4 x float>
+; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x float> [[TMP26]], <4 x float> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT: store <4 x float> [[TMP27]], ptr [[RC21]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %rc21 = alloca [0 x [0 x %struct.rect]], i32 0, align 4
+ %1 = load float, ptr %rc21, align 4
+ %cmp = fcmp olt float %1, 0.000000e+00
+ %conv = sitofp i32 %0 to float
+ %cmp2 = fcmp olt float %conv, 0.000000e+00
+ %cond = select i1 %cmp2, float %conv, float 0.000000e+00
+ %cond9 = select i1 %cmp, float 0.000000e+00, float %cond
+ store float %cond9, ptr %rc21, align 4
+ %x1 = getelementptr i8, ptr %rc21, i64 4
+ %2 = load float, ptr %x1, align 4
+ %cmp11 = fcmp olt float %2, 0.000000e+00
+ %conv16 = sitofp i32 %0 to float
+ %cmp17 = fcmp olt float %conv16, 0.000000e+00
+ %cond24 = select i1 %cmp17, float %conv16, float 0.000000e+00
+ %cond26 = select i1 %cmp11, float 0.000000e+00, float %cond24
+ store float %cond26, ptr %x1, align 4
+ %y0 = getelementptr i8, ptr %rc21, i64 8
+ %3 = load float, ptr %y0, align 4
+ %cmp28 = fcmp olt float %3, 0.000000e+00
+ %cmp34 = fcmp olt float %conv, 0.000000e+00
+ %cond41 = select i1 %cmp34, float %conv, float 0.000000e+00
+ %cond43 = select i1 %cmp28, float 0.000000e+00, float %cond41
+ store float %cond43, ptr %y0, align 4
+ %y11 = getelementptr i8, ptr %rc21, i64 12
+ %4 = load float, ptr %i7, align 4
+ %cmp45 = fcmp olt float %4, 0.000000e+00
+ %cmp51 = fcmp olt float %conv16, 0.000000e+00
+ %cond58 = select i1 %cmp51, float %conv16, float 0.000000e+00
+ %cond60 = select i1 %cmp45, float 0.000000e+00, float %cond58
+ store float %cond60, ptr %y11, align 4
+ br label %if.end
+
+entry.if.end72_crit_edge:
+ br label %if.end72
+
+if.then63:
+ br label %if.end
+
+if.end:
+ %5 = phi float [ 0.000000e+00, %if.then63 ], [ %cond60, %entry ]
+ %6 = phi float [ 0.000000e+00, %if.then63 ], [ %cond26, %entry ]
+ %7 = phi float [ 0.000000e+00, %if.then63 ], [ %cond43, %entry ]
+ %8 = phi float [ 0.000000e+00, %if.then63 ], [ %cond9, %entry ]
+ %9 = call float @llvm.round.f32(float %8)
+ %conv65 = fptosi float %9 to i32
+ %10 = call float @llvm.round.f32(float %7)
+ %conv67 = fptosi float %10 to i32
+ %11 = call float @llvm.round.f32(float %6)
+ %conv69 = fptosi float %11 to i32
+ %12 = call float @llvm.round.f32(float %5)
+ %conv71 = fptosi float %12 to i32
+ br label %if.end72
+
+if.end72:
+ %.pre100 = phi i32 [ 0, %entry.if.end72_crit_edge ], [ %conv71, %if.end ]
+ %.pre99 = phi i32 [ 0, %entry.if.end72_crit_edge ], [ %conv67, %if.end ]
+ %.pre98 = phi i32 [ 0, %entry.if.end72_crit_edge ], [ %conv69, %if.end ]
+ %.pre97 = phi i32 [ 0, %entry.if.end72_crit_edge ], [ %conv65, %if.end ]
+ br i1 %tobool62.not, label %if.end75, label %if.then74
+
+if.then74:
+ br label %if.end75
+
+if.end75:
+ %13 = phi i32 [ %.pre99, %if.then74 ], [ %.pre100, %if.end72 ]
+ %14 = phi i32 [ %.pre100, %if.then74 ], [ %.pre99, %if.end72 ]
+ %15 = phi i32 [ %.pre97, %if.then74 ], [ %.pre98, %if.end72 ]
+ %16 = phi i32 [ %.pre98, %if.then74 ], [ %.pre97, %if.end72 ]
+ %sub = or i32 %16, 1
+ %mul = mul i32 %sub, %0
+ %conv77 = sitofp i32 %mul to float
+ store float %conv77, ptr %rc21, align 4
+ %x178 = getelementptr i8, ptr %rc21, i64 4
+ %sub79 = or i32 %15, 1
+ %mul80 = mul i32 %sub79, %0
+ %conv81 = sitofp i32 %mul80 to float
+ store float %conv81, ptr %x178, align 4
+ %y082 = getelementptr i8, ptr %rc21, i64 8
+ %sub83 = or i32 %14, 1
+ %mul84 = mul i32 %sub83, %0
+ %conv85 = sitofp i32 %mul84 to float
+ store float %conv85, ptr %y082, align 4
+ %y186 = getelementptr i8, ptr %rc21, i64 12
+ %sub87 = or i32 %13, 1
+ %mul88 = mul i32 %sub87, %0
+ %conv89 = sitofp i32 %mul88 to float
+ store float %conv89, ptr %y186, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
index 668d3c3c8c82..0ab56279fe47 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
@@ -16,8 +16,7 @@ define void @test() {
; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[TMP8]] to <4 x i1>
; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i1> zeroinitializer, [[TMP15]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i1> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP16]] to <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP15]] to <4 x i32>
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP13]])
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll
index 867a49dbaed2..7258ffca1278 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll
@@ -1,5 +1,9 @@
-; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-profile.prof -pass-remarks=sample-profile -pass-remarks-output=%t.opt.yaml -sample-profile-use-profi=0 -S | FileCheck %s
-; RUN: FileCheck %s -check-prefix=YAML < %t.opt.yaml
+; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-profile.prof -pass-remarks=sample-profile -pass-remarks-output=%t.opt.yaml -sample-profile-use-profi=0 -S -o %t
+; RUN: FileCheck %s --input-file %t
+; RUN: FileCheck %s -check-prefix=YAML --input-file %t.opt.yaml
+; RUN: opt < %t -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-profile.prof -sample-profile-remove-probe -S | FileCheck %s -check-prefix=REMOVE-PROBE
+
+; REMOVE-PROBE-NOT: call void @llvm.pseudoprobe
define dso_local i32 @foo(i32 %x, ptr %f) #0 !dbg !4 {
entry:
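For reference, the calls that REMOVE-PROBE-NOT must not find look like the following in IR; the GUID, index and factor operands below are made up, since the real values are derived from the instrumented function (a hypothetical snippet, not from the test):

declare void @llvm.pseudoprobe(i64, i64, i32, i64)

define void @probe_example() {
  ; With -sample-profile-remove-probe this call is stripped once the profile
  ; has been matched, so the REMOVE-PROBE prefix sees no llvm.pseudoprobe.
  call void @llvm.pseudoprobe(i64 1234, i64 1, i32 0, i64 -1)
  ret void
}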
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/endless-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/endless-unswitch.ll
index 0d3aa8b24310..e70bea2d2f7a 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/endless-unswitch.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/endless-unswitch.ll
@@ -106,3 +106,96 @@ for.inc: ; preds = %for.cond5
store i8 0, ptr @b, align 1
br label %for.cond5
}
+
+define void @e(ptr %p) {
+; CHECK-LABEL: @e(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_COND:%.*]]
+; CHECK: for.cond:
+; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[FOR_COND]]
+; CHECK: for.end:
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[P:%.*]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i16 [[TMP0]] to i1
+; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_END_SPLIT:%.*]], label [[FOR_END_SPLIT_US:%.*]]
+; CHECK: for.end.split.us:
+; CHECK-NEXT: br label [[G_US:%.*]]
+; CHECK: g.us:
+; CHECK-NEXT: br label [[G_SPLIT_US6:%.*]]
+; CHECK: for.cond1.us1:
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[P]], align 2
+; CHECK-NEXT: [[TOBOOL4_NOT_US:%.*]] = trunc i16 [[TMP2]] to i1
+; CHECK-NEXT: br i1 [[TOBOOL4_NOT_US]], label [[FOR_COND5_PREHEADER_US4:%.*]], label [[G_LOOPEXIT_US:%.*]]
+; CHECK: for.cond5.us2:
+; CHECK-NEXT: br i1 false, label [[FOR_COND1_LOOPEXIT_US5:%.*]], label [[FOR_INC_US3:%.*]]
+; CHECK: for.inc.us3:
+; CHECK-NEXT: store i8 0, ptr @b, align 1
+; CHECK-NEXT: br label [[FOR_COND5_US2:%.*]]
+; CHECK: for.cond5.preheader.us4:
+; CHECK-NEXT: br label [[FOR_COND5_US2]]
+; CHECK: for.cond1.loopexit.us5:
+; CHECK-NEXT: br label [[FOR_COND1_US1:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: g.loopexit.us:
+; CHECK-NEXT: br label [[G_US]]
+; CHECK: g.split.us6:
+; CHECK-NEXT: br label [[FOR_COND1_US1]]
+; CHECK: for.end.split:
+; CHECK-NEXT: br label [[G:%.*]]
+; CHECK: g.loopexit:
+; CHECK-NEXT: br label [[G]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: g:
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[P]], align 2
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i16 [[TMP3]] to i1
+; CHECK-NEXT: br i1 [[TMP4]], label [[G_SPLIT_US:%.*]], label [[G_SPLIT:%.*]]
+; CHECK: g.split.us:
+; CHECK-NEXT: br label [[FOR_COND1_US:%.*]]
+; CHECK: for.cond1.us:
+; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_US:%.*]]
+; CHECK: for.cond5.us:
+; CHECK-NEXT: br i1 false, label [[FOR_COND1_LOOPEXIT_US:%.*]], label [[FOR_INC_US:%.*]]
+; CHECK: for.inc.us:
+; CHECK-NEXT: store i8 0, ptr @b, align 1
+; CHECK-NEXT: br label [[FOR_COND5_US:%.*]]
+; CHECK: for.cond5.preheader.us:
+; CHECK-NEXT: br label [[FOR_COND5_US]]
+; CHECK: for.cond1.loopexit.us:
+; CHECK-NEXT: br label [[FOR_COND1_US]]
+; CHECK: g.split:
+; CHECK-NEXT: br label [[FOR_COND1:%.*]]
+; CHECK: for.cond1.loopexit:
+; CHECK-NEXT: br label [[FOR_COND1]], !llvm.loop [[LOOP3]]
+; CHECK: for.cond1:
+; CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[P]], align 2
+; CHECK-NEXT: [[TOBOOL4_NOT:%.*]] = trunc i16 [[TMP5]] to i1
+; CHECK-NEXT: br i1 [[TOBOOL4_NOT]], label [[FOR_COND5_PREHEADER:%.*]], label [[G_LOOPEXIT:%.*]]
+; CHECK: for.cond5.preheader:
+; CHECK-NEXT: br label [[FOR_COND5:%.*]]
+; CHECK: for.cond5:
+; CHECK-NEXT: br i1 false, label [[FOR_COND1_LOOPEXIT:%.*]], label [[FOR_INC:%.*]]
+; CHECK: for.inc:
+; CHECK-NEXT: store i8 0, ptr @b, align 1
+; CHECK-NEXT: br label [[FOR_COND5]]
+;
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.cond, %entry
+ br i1 false, label %for.end, label %for.cond
+
+for.end: ; preds = %for.cond
+ br label %g
+
+g: ; preds = %for.cond1, %for.end
+ br label %for.cond1
+
+for.cond1: ; preds = %for.cond5, %g
+ %0 = load i16, ptr %p, align 2
+ %tobool4.not = trunc i16 %0 to i1
+ br i1 %tobool4.not, label %for.cond5, label %g
+
+for.cond5: ; preds = %for.inc, %for.cond1
+ br i1 false, label %for.cond1, label %for.inc
+
+for.inc: ; preds = %for.cond5
+ store i8 0, ptr @b, align 1
+ br label %for.cond5
+}
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll
index f97e5c3eec9d..1d8942079ffd 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll
@@ -1326,6 +1326,136 @@ exit:
ret i32 10
}
+define i32 @partial_unswitch_true_successor_trunc(ptr %ptr, i32 %N) {
+; CHECK-LABEL: @partial_unswitch_true_successor_trunc(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i1
+; CHECK-NEXT: br i1 [[TMP1]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
+; CHECK: entry.split.us:
+; CHECK-NEXT: br label [[LOOP_HEADER_US:%.*]]
+; CHECK: loop.header.us:
+; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[LOOP_LATCH_US:%.*]] ]
+; CHECK-NEXT: br label [[NOCLOBBER_US:%.*]]
+; CHECK: noclobber.us:
+; CHECK-NEXT: br label [[LOOP_LATCH_US]]
+; CHECK: loop.latch.us:
+; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]]
+; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1
+; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK: exit.split.us:
+; CHECK-NEXT: br label [[EXIT:%.*]]
+; CHECK: entry.split:
+; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
+; CHECK: loop.header:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[PTR]], align 4
+; CHECK-NEXT: [[SC:%.*]] = trunc i32 [[LV]] to i1
+; CHECK-NEXT: br i1 [[SC]], label [[NOCLOBBER:%.*]], label [[CLOBBER:%.*]]
+; CHECK: noclobber:
+; CHECK-NEXT: br label [[LOOP_LATCH]]
+; CHECK: clobber:
+; CHECK-NEXT: call void @clobber()
+; CHECK-NEXT: br label [[LOOP_LATCH]]
+; CHECK: loop.latch:
+; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]]
+; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK: exit.split:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: ret i32 10
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %lv = load i32, ptr %ptr
+ %sc = trunc i32 %lv to i1
+ br i1 %sc, label %noclobber, label %clobber
+
+noclobber:
+ br label %loop.latch
+
+clobber:
+ call void @clobber()
+ br label %loop.latch
+
+loop.latch:
+ %c = icmp ult i32 %iv, %N
+ %iv.next = add i32 %iv, 1
+ br i1 %c, label %loop.header, label %exit
+
+exit:
+ ret i32 10
+}
+
+define i32 @partial_unswitch_false_successor_trunc(ptr %ptr, i32 %N) {
+; CHECK-LABEL: @partial_unswitch_false_successor_trunc(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i1
+; CHECK-NEXT: br i1 [[TMP1]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_US:%.*]]
+; CHECK: entry.split.us:
+; CHECK-NEXT: br label [[LOOP_HEADER_US:%.*]]
+; CHECK: loop.header.us:
+; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[LOOP_LATCH_US:%.*]] ]
+; CHECK-NEXT: br label [[NOCLOBBER_US:%.*]]
+; CHECK: noclobber.us:
+; CHECK-NEXT: br label [[LOOP_LATCH_US]]
+; CHECK: loop.latch.us:
+; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]]
+; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1
+; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK: exit.split.us:
+; CHECK-NEXT: br label [[EXIT:%.*]]
+; CHECK: entry.split:
+; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
+; CHECK: loop.header:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[PTR]], align 4
+; CHECK-NEXT: [[SC:%.*]] = trunc i32 [[LV]] to i1
+; CHECK-NEXT: br i1 [[SC]], label [[CLOBBER:%.*]], label [[NOCLOBBER:%.*]]
+; CHECK: clobber:
+; CHECK-NEXT: call void @clobber()
+; CHECK-NEXT: br label [[LOOP_LATCH]]
+; CHECK: noclobber:
+; CHECK-NEXT: br label [[LOOP_LATCH]]
+; CHECK: loop.latch:
+; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]]
+; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK: exit.split:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: ret i32 10
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %lv = load i32, ptr %ptr
+ %sc = trunc i32 %lv to i1
+ br i1 %sc, label %clobber, label %noclobber
+
+clobber:
+ call void @clobber()
+ br label %loop.latch
+
+noclobber:
+ br label %loop.latch
+
+loop.latch:
+ %c = icmp ult i32 %iv, %N
+ %iv.next = add i32 %iv, 1
+ br i1 %c, label %loop.header, label %exit
+
+exit:
+ ret i32 10
+}
+
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[UNSWITCH_PARTIAL_DISABLE:![0-9]+]]}
; CHECK: [[UNSWITCH_PARTIAL_DISABLE]] = !{!"llvm.loop.unswitch.partial.disable"}
; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[UNSWITCH_PARTIAL_DISABLE]]}
diff --git a/llvm/test/Verifier/invalid-splice.ll b/llvm/test/Verifier/invalid-splice.ll
index d5096bdf17ca..2239386df562 100644
--- a/llvm/test/Verifier/invalid-splice.ll
+++ b/llvm/test/Verifier/invalid-splice.ll
@@ -2,36 +2,36 @@
; CHECK: The splice index exceeds the range [-VL, VL-1] where VL is the known minimum number of elements in the vector
define <2 x double> @splice_v2f64_idx_neg3(<2 x double> %a, <2 x double> %b) #0 {
- %res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 -3)
+ %res = call <2 x double> @llvm.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 -3)
ret <2 x double> %res
}
; CHECK: The splice index exceeds the range [-VL, VL-1] where VL is the known minimum number of elements in the vector
define <vscale x 2 x double> @splice_nxv2f64_idx_neg3_vscale_min1(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
- %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -3)
+ %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -3)
ret <vscale x 2 x double> %res
}
; CHECK: The splice index exceeds the range [-VL, VL-1] where VL is the known minimum number of elements in the vector
define <vscale x 2 x double> @splice_nxv2f64_idx_neg5_vscale_min2(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #1 {
- %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -5)
+ %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -5)
ret <vscale x 2 x double> %res
}
; CHECK: The splice index exceeds the range [-VL, VL-1] where VL is the known minimum number of elements in the vector
define <2 x double> @splice_v2f64_idx2(<2 x double> %a, <2 x double> %b) #0 {
- %res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 2)
+ %res = call <2 x double> @llvm.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 2)
ret <2 x double> %res
}
; CHECK: The splice index exceeds the range [-VL, VL-1] where VL is the known minimum number of elements in the vector
define <2 x double> @splice_v2f64_idx3(<2 x double> %a, <2 x double> %b) #1 {
- %res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 4)
+ %res = call <2 x double> @llvm.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 4)
ret <2 x double> %res
}
attributes #0 = { vscale_range(1,16) }
attributes #1 = { vscale_range(2,16) }
-declare <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double>, <2 x double>, i32)
-declare <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
+declare <2 x double> @llvm.vector.splice.v2f64(<2 x double>, <2 x double>, i32)
+declare <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
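The rule the verifier enforces here: the splice index must lie in [-VL, VL-1], where VL is the known minimum number of elements. For <2 x double> that range is [-2, 1], so -3, 2 and 4 are all rejected; for <vscale x 2 x double> under vscale_range(2,16) the minimum VL is 2 x 2 = 4, putting -5 just outside [-4, 3]. A well-formed counterpart that the verifier accepts (hypothetical, not from the test):

define <2 x double> @splice_v2f64_idx_ok(<2 x double> %a, <2 x double> %b) {
  %res = call <2 x double> @llvm.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 -2)
  ret <2 x double> %res
}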
diff --git a/llvm/test/tools/llvm-driver/passthrough-lld.test b/llvm/test/tools/llvm-driver/passthrough-lld.test
index acd5f3878481..b31fa4e483b9 100644
--- a/llvm/test/tools/llvm-driver/passthrough-lld.test
+++ b/llvm/test/tools/llvm-driver/passthrough-lld.test
@@ -1,6 +1,8 @@
# REQUIRES: llvm-driver, lld
# RUN: %llvm ld.lld --help | FileCheck %s
+# RUN: %llvm ld --help | FileCheck %s
# RUN: %llvm lld -flavor ld.lld --help | FileCheck %s
+# RUN: %llvm ld -flavor ld.lld --help | FileCheck %s
# CHECK: supported targets: elf
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/skip-unsupported-instructions-none-remain.s b/llvm/test/tools/llvm-mca/X86/BtVer2/skip-unsupported-instructions-none-remain.s
new file mode 100644
index 000000000000..0d67f53e12f1
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/skip-unsupported-instructions-none-remain.s
@@ -0,0 +1,14 @@
+# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -skip-unsupported-instructions %s 2>&1 | FileCheck --check-prefixes=CHECK-ALL,CHECK-SKIP %s
+# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 %s 2>&1 | FileCheck --check-prefixes=CHECK-ALL,CHECK-ERROR %s
+
+# Test checks that if all instructions are skipped, leaving an empty input, an error is printed.
+
+bzhi %eax, %ebx, %ecx
+
+# CHECK-ALL-NOT: error
+
+# CHECK-ERROR: error: found an unsupported instruction in the input assembly sequence, use -skip-unsupported-instructions to ignore.
+
+# CHECK-SKIP: warning: found an unsupported instruction in the input assembly sequence, skipping with -skip-unsupported-instructions, note accuracy will be impacted:
+# CHECK-SKIP: note: instruction: bzhil %eax, %ebx, %ecx
+# CHECK-SKIP: error: no assembly instructions found.
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/unsupported-instruction.s b/llvm/test/tools/llvm-mca/X86/BtVer2/unsupported-instruction.s
index bb88e951c129..3690a1101be9 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/unsupported-instruction.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/unsupported-instruction.s
@@ -1,6 +1,55 @@
-# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 %s 2>&1 | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -skip-unsupported-instructions -timeline %s 2>&1 | FileCheck --check-prefix=CHECK-SKIP %s
+# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
+
+# Test checks that unsupported instructions cause llvm-mca to exit with an error, unless -skip-unsupported-instructions is passed, in which case the remaining instructions are analysed.
+
+# CHECK-SKIP: warning: found an unsupported instruction in the input assembly sequence, skipping with -skip-unsupported-instructions, note accuracy will be impacted:
+# CHECK-ERROR: error: found an unsupported instruction in the input assembly sequence, use -skip-unsupported-instructions to ignore.
bzhi %eax, %ebx, %ecx
-# CHECK: error: found an unsupported instruction in the input assembly sequence.
-# CHECK-NEXT: note: instruction: bzhil %eax, %ebx, %ecx
+# Supported instruction that may be analysed.
+add %eax, %eax
+
+# CHECK-SKIP: Iterations: 100
+# CHECK-SKIP: Instructions: 100
+# CHECK-SKIP: Total Cycles: 103
+# CHECK-SKIP: Total uOps: 100
+
+# CHECK-SKIP: Dispatch Width: 2
+# CHECK-SKIP: uOps Per Cycle: 0.97
+# CHECK-SKIP: IPC: 0.97
+# CHECK-SKIP: Block RThroughput: 0.5
+
+# CHECK-SKIP: Instruction Info:
+# CHECK-SKIP: [1]: #uOps
+# CHECK-SKIP: [2]: Latency
+# CHECK-SKIP: [3]: RThroughput
+# CHECK-SKIP: [4]: MayLoad
+# CHECK-SKIP: [5]: MayStore
+# CHECK-SKIP: [6]: HasSideEffects (U)
+
+# CHECK-SKIP: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-SKIP: 1 1 0.50 addl %eax, %eax
+
+# CHECK-SKIP: Timeline view:
+
+# CHECK-SKIP: [0,0] DeER . . . addl %eax, %eax
+# CHECK-SKIP: [1,0] D=eER. . . addl %eax, %eax
+# CHECK-SKIP: [2,0] .D=eER . . addl %eax, %eax
+# CHECK-SKIP: [3,0] .D==eER . . addl %eax, %eax
+# CHECK-SKIP: [4,0] . D==eER . . addl %eax, %eax
+# CHECK-SKIP: [5,0] . D===eER . . addl %eax, %eax
+# CHECK-SKIP: [6,0] . D===eER. . addl %eax, %eax
+# CHECK-SKIP: [7,0] . D====eER . addl %eax, %eax
+# CHECK-SKIP: [8,0] . D====eER. addl %eax, %eax
+# CHECK-SKIP: [9,0] . D=====eER addl %eax, %eax
+
+# CHECK-SKIP: Average Wait times (based on the timeline view):
+# CHECK-SKIP: [0]: Executions
+# CHECK-SKIP: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-SKIP: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-SKIP: [3]: Average time elapsed from WB until retire stage
+
+# CHECK-SKIP: [0] [1] [2] [3]
+# CHECK-SKIP: 0. 10 3.5 0.1 0.0 addl %eax, %eax
diff --git a/llvm/test/tools/llvm-rc/Inputs/dialog-with-menu.rc b/llvm/test/tools/llvm-rc/Inputs/dialog-with-menu.rc
new file mode 100644
index 000000000000..bb79dca399c2
--- /dev/null
+++ b/llvm/test/tools/llvm-rc/Inputs/dialog-with-menu.rc
@@ -0,0 +1,16 @@
+101 DIALOG 0, 0, 362, 246
+STYLE 0x40l | 0x0004l | 0x0008l | 0x0800l | 0x00020000l |
+ 0x00010000l | 0x80000000l | 0x10000000l | 0x02000000l | 0x00C00000l |
+ 0x00080000l | 0x00040000l
+CAPTION "MakeNSISW"
+MENU 104
+FONT 8, "MS Shell Dlg"
+BEGIN
+ CONTROL "",202,"RichEdit20A",0x0004l | 0x0040l |
+ 0x0100l | 0x0800l | 0x00008000 |
+ 0x00010000l | 0x00800000l | 0x00200000l,7,22,348,190
+ CONTROL "",-1,"Static",0x00000010l,7,220,346,1
+ LTEXT "",200,7,230,200,12,0x08000000l
+ DEFPUSHBUTTON "Test &Installer",203,230,226,60,15,0x08000000l | 0x00010000l
+ PUSHBUTTON "&Close",2,296,226,49,15,0x00010000l
+END
diff --git a/llvm/test/tools/llvm-rc/dialog-with-menu.test b/llvm/test/tools/llvm-rc/dialog-with-menu.test
new file mode 100644
index 000000000000..2529e9c1722b
--- /dev/null
+++ b/llvm/test/tools/llvm-rc/dialog-with-menu.test
@@ -0,0 +1,32 @@
+; RUN: llvm-rc -no-preprocess /FO %t -- %p/Inputs/dialog-with-menu.rc
+; RUN: llvm-readobj %t | FileCheck %s
+
+CHECK: Resource type (int): DIALOG (ID 5)
+CHECK-NEXT: Resource name (int): 101
+CHECK-NEXT: Data version: 0
+CHECK-NEXT: Memory flags: 0x1030
+CHECK-NEXT: Language ID: 1033
+CHECK-NEXT: Version (major): 0
+CHECK-NEXT: Version (minor): 0
+CHECK-NEXT: Characteristics: 0
+CHECK-NEXT: Data size: 278
+CHECK-NEXT: Data: (
+CHECK-NEXT: 0000: 4C08CF92 00000000 05000000 00006A01 |L.............j.|
+CHECK-NEXT: 0010: F600FFFF 68000000 4D006100 6B006500 |....h...M.a.k.e.|
+CHECK-NEXT: 0020: 4E005300 49005300 57000000 08004D00 |N.S.I.S.W.....M.|
+CHECK-NEXT: 0030: 53002000 53006800 65006C00 6C002000 |S. .S.h.e.l.l. .|
+CHECK-NEXT: 0040: 44006C00 67000000 4489A150 00000000 |D.l.g...D..P....|
+CHECK-NEXT: 0050: 07001600 5C01BE00 CA005200 69006300 |....\.....R.i.c.|
+CHECK-NEXT: 0060: 68004500 64006900 74003200 30004100 |h.E.d.i.t.2.0.A.|
+CHECK-NEXT: 0070: 00000000 00000000 10000050 00000000 |...........P....|
+CHECK-NEXT: 0080: 0700DC00 5A010100 FFFF5300 74006100 |....Z.....S.t.a.|
+CHECK-NEXT: 0090: 74006900 63000000 00000000 00000258 |t.i.c..........X|
+CHECK-NEXT: 00A0: 00000000 0700E600 C8000C00 C800FFFF |................|
+CHECK-NEXT: 00B0: 82000000 00000000 01000158 00000000 |...........X....|
+CHECK-NEXT: 00C0: E600E200 3C000F00 CB00FFFF 80005400 |....<.........T.|
+CHECK-NEXT: 00D0: 65007300 74002000 26004900 6E007300 |e.s.t. .&.I.n.s.|
+CHECK-NEXT: 00E0: 74006100 6C006C00 65007200 00000000 |t.a.l.l.e.r.....|
+CHECK-NEXT: 00F0: 00000150 00000000 2801E200 31000F00 |...P....(...1...|
+CHECK-NEXT: 0100: 0200FFFF 80002600 43006C00 6F007300 |......&.C.l.o.s.|
+CHECK-NEXT: 0110: 65000000 0000 |e.....|
+CHECK-NEXT: )
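Decoding the start of the dump confirms the new MENU handling (manual annotation assuming the standard DLGTEMPLATE layout; not part of the test):

  4C08CF92   style    = 0x92CF084C
  00000000   exstyle  = 0
  0500       cdit     = 5 controls
  0000 0000  x, y     = 0, 0
  6A01 F600  cx, cy   = 362 x 246
  FFFF 6800  menu     = ordinal 104, from the MENU 104 statement

Previously the writer emitted a single zero WORD in the menu position, meaning "no menu attached".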
diff --git a/llvm/tools/llvm-driver/CMakeLists.txt b/llvm/tools/llvm-driver/CMakeLists.txt
index 83e084069b96..82d85c723010 100644
--- a/llvm/tools/llvm-driver/CMakeLists.txt
+++ b/llvm/tools/llvm-driver/CMakeLists.txt
@@ -13,6 +13,11 @@ foreach(tool ${LLVM_DRIVER_TOOLS})
string(REPLACE "llvm-" "" alias ${alias})
set(def_decl "${def_decl}LLVM_DRIVER_TOOL(\"${alias}\", ${tool_entry})\n")
endforeach()
+ get_property(hidden_tool_aliases GLOBAL PROPERTY LLVM_DRIVER_HIDDEN_TOOL_ALIASES_${tool})
+ foreach(alias ${hidden_tool_aliases})
+ string(REPLACE "llvm-" "" alias ${alias})
+ set(def_decl "${def_decl}LLVM_DRIVER_TOOL(\"${alias}\", ${tool_entry})\n")
+ endforeach()
endforeach()
file(WRITE
diff --git a/llvm/tools/llvm-mca/CodeRegion.h b/llvm/tools/llvm-mca/CodeRegion.h
index ce107fd8f3b6..5a2e8baa1f3e 100644
--- a/llvm/tools/llvm-mca/CodeRegion.h
+++ b/llvm/tools/llvm-mca/CodeRegion.h
@@ -59,6 +59,7 @@
#define LLVM_TOOLS_LLVM_MCA_CODEREGION_H
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
@@ -97,6 +98,20 @@ public:
Instructions.emplace_back(Instruction);
}
+ // Remove the given instructions from the region; used when unsupported
+ // instructions are skipped. Returns an ArrayRef over the updated vector of
+ // Instructions.
+ [[nodiscard]] llvm::ArrayRef<llvm::MCInst>
+ dropInstructions(const llvm::SmallPtrSetImpl<const llvm::MCInst *> &Insts) {
+ if (Insts.empty())
+ return Instructions;
+ Instructions.erase(std::remove_if(Instructions.begin(), Instructions.end(),
+ [&Insts](const llvm::MCInst &Inst) {
+ return Insts.contains(&Inst);
+ }),
+ Instructions.end());
+ return Instructions;
+ }
+
llvm::SMLoc startLoc() const { return RangeStart; }
llvm::SMLoc endLoc() const { return RangeEnd; }
diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp
index eb71cffba6dd..e037c06b12a3 100644
--- a/llvm/tools/llvm-mca/llvm-mca.cpp
+++ b/llvm/tools/llvm-mca/llvm-mca.cpp
@@ -237,6 +237,11 @@ static cl::opt<bool> DisableInstrumentManager(
"ignores instruments.)."),
cl::cat(ViewOptions), cl::init(false));
+static cl::opt<bool> SkipUnsupportedInstructions(
+ "skip-unsupported-instructions",
+ cl::desc("Make unsupported instruction errors into warnings."),
+ cl::cat(ViewOptions), cl::init(false));
+
namespace {
const Target *getTarget(const char *ProgName) {
@@ -558,6 +563,7 @@ int main(int argc, char **argv) {
assert(MAB && "Unable to create asm backend!");
json::Object JSONOutput;
+ int NonEmptyRegions = 0;
for (const std::unique_ptr<mca::AnalysisRegion> &Region : Regions) {
// Skip empty code regions.
if (Region->empty())
@@ -571,14 +577,13 @@ int main(int argc, char **argv) {
IPP->resetState();
- DenseMap<const MCInst *, SmallVector<mca::Instrument *>>
- InstToInstruments;
+ DenseMap<const MCInst *, SmallVector<mca::Instrument *>> InstToInstruments;
SmallVector<std::unique_ptr<mca::Instruction>> LoweredSequence;
+ SmallPtrSet<const MCInst *, 16> DroppedInsts;
for (const MCInst &MCI : Insts) {
SMLoc Loc = MCI.getLoc();
const SmallVector<mca::Instrument *> Instruments =
InstrumentRegions.getActiveInstruments(Loc);
- InstToInstruments.insert({&MCI, Instruments});
Expected<std::unique_ptr<mca::Instruction>> Inst =
IB.createInstruction(MCI, Instruments);
@@ -588,7 +593,15 @@ int main(int argc, char **argv) {
[&IP, &STI](const mca::InstructionError<MCInst> &IE) {
std::string InstructionStr;
raw_string_ostream SS(InstructionStr);
- WithColor::error() << IE.Message << '\n';
+ if (SkipUnsupportedInstructions)
+ WithColor::warning()
+ << IE.Message
+ << ", skipping with -skip-unsupported-instructions, "
+ "note accuracy will be impacted:\n";
+ else
+ WithColor::error()
+ << IE.Message
+ << ", use -skip-unsupported-instructions to ignore.\n";
IP->printInst(&IE.Inst, 0, "", *STI, SS);
SS.flush();
WithColor::note()
@@ -597,14 +610,25 @@ int main(int argc, char **argv) {
// Default case.
WithColor::error() << toString(std::move(NewE));
}
+ if (SkipUnsupportedInstructions) {
+ DroppedInsts.insert(&MCI);
+ continue;
+ }
return 1;
}
IPP->postProcessInstruction(Inst.get(), MCI);
-
+ InstToInstruments.insert({&MCI, Instruments});
LoweredSequence.emplace_back(std::move(Inst.get()));
}
+ Insts = Region->dropInstructions(DroppedInsts);
+
+ // Skip empty regions.
+ if (Insts.empty())
+ continue;
+ NonEmptyRegions++;
+
mca::CircularSourceMgr S(LoweredSequence,
PrintInstructionTables ? 1 : Iterations);
@@ -759,6 +783,11 @@ int main(int argc, char **argv) {
++RegionIdx;
}
+ if (NonEmptyRegions == 0) {
+ WithColor::error() << "no assembly instructions found.\n";
+ return 1;
+ }
+
if (PrintJson)
TOF->os() << formatv("{0:2}", json::Value(std::move(JSONOutput))) << "\n";
diff --git a/llvm/tools/llvm-rc/ResourceFileWriter.cpp b/llvm/tools/llvm-rc/ResourceFileWriter.cpp
index d507525970ec..85b59532bb83 100644
--- a/llvm/tools/llvm-rc/ResourceFileWriter.cpp
+++ b/llvm/tools/llvm-rc/ResourceFileWriter.cpp
@@ -550,6 +550,11 @@ Error ResourceFileWriter::visitVersionStmt(const VersionStmt *Stmt) {
return Error::success();
}
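+// Record the MENU statement's value; writeDialogBody emits it as the dialog's
+// menu name or ordinal.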
+Error ResourceFileWriter::visitMenuStmt(const MenuStmt *Stmt) {
+ ObjectData.Menu = Stmt->Value;
+ return Error::success();
+}
+
Error ResourceFileWriter::writeResource(
const RCResource *Res,
Error (ResourceFileWriter::*BodyWriter)(const RCResource *)) {
@@ -1132,9 +1137,8 @@ Error ResourceFileWriter::writeDialogBody(const RCResource *Base) {
ulittle16_t(Res->Height)};
writeObject(Middle);
- // MENU field. As of now, we don't keep them in the state and can peacefully
- // think there is no menu attached to the dialog.
- writeInt<uint16_t>(0);
+ // MENU field.
+ RETURN_IF_ERROR(writeIntOrString(ObjectData.Menu));
// Window CLASS field.
RETURN_IF_ERROR(writeIntOrString(ObjectData.Class));
diff --git a/llvm/tools/llvm-rc/ResourceFileWriter.h b/llvm/tools/llvm-rc/ResourceFileWriter.h
index 9413a0eecdac..82d3e3b9e9e8 100644
--- a/llvm/tools/llvm-rc/ResourceFileWriter.h
+++ b/llvm/tools/llvm-rc/ResourceFileWriter.h
@@ -16,6 +16,7 @@
#include "ResourceScriptStmt.h"
#include "ResourceVisitor.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Endian.h"
namespace llvm {
@@ -68,6 +69,7 @@ public:
Error visitLanguageStmt(const LanguageResource *) override;
Error visitStyleStmt(const StyleStmt *) override;
Error visitVersionStmt(const VersionStmt *) override;
+ Error visitMenuStmt(const MenuStmt *) override;
// Stringtables are output at the end of .res file. We need a separate
// function to do it.
@@ -92,10 +94,11 @@ public:
};
std::optional<FontInfo> Font;
IntOrString Class;
+ IntOrString Menu;
ObjectInfo()
: LanguageInfo(0), Characteristics(0), VersionInfo(0),
- Class(StringRef()) {}
+ Class(StringRef()), Menu(StringRef()) {}
} ObjectData;
struct StringTableInfo {
diff --git a/llvm/tools/llvm-rc/ResourceScriptParser.cpp b/llvm/tools/llvm-rc/ResourceScriptParser.cpp
index 4f02fa502d24..69798152c1f2 100644
--- a/llvm/tools/llvm-rc/ResourceScriptParser.cpp
+++ b/llvm/tools/llvm-rc/ResourceScriptParser.cpp
@@ -430,6 +430,8 @@ RCParser::parseSingleOptionalStatement(OptStmtType StmtsType) {
return parseFontStmt(StmtsType);
if (TypeToken->equals_insensitive("STYLE"))
return parseStyleStmt();
+ if (TypeToken->equals_insensitive("MENU"))
+ return parseMenuStmt();
}
return getExpectedError("optional statement type, BEGIN or '{'",
@@ -965,6 +967,11 @@ RCParser::ParseOptionType RCParser::parseExStyleStmt() {
return std::make_unique<ExStyleStmt>(*Arg);
}
+RCParser::ParseOptionType RCParser::parseMenuStmt() {
+ ASSIGN_OR_RETURN(Arg, readIntOrString());
+ return std::make_unique<MenuStmt>(*Arg);
+}
+
Error RCParser::getExpectedError(const Twine &Message, bool IsAlreadyRead) {
return make_error<ParserError>(
Message, IsAlreadyRead ? std::prev(CurLoc) : CurLoc, End);
diff --git a/llvm/tools/llvm-rc/ResourceScriptParser.h b/llvm/tools/llvm-rc/ResourceScriptParser.h
index 603afd8d73fb..aa7f847187c4 100644
--- a/llvm/tools/llvm-rc/ResourceScriptParser.h
+++ b/llvm/tools/llvm-rc/ResourceScriptParser.h
@@ -176,6 +176,7 @@ private:
ParseOptionType parseExStyleStmt();
ParseOptionType parseFontStmt(OptStmtType DialogType);
ParseOptionType parseStyleStmt();
+ ParseOptionType parseMenuStmt();
// Raises an error. If IsAlreadyRead = false (default), this complains about
// the token that couldn't be parsed. If the flag is on, this complains about
diff --git a/llvm/tools/llvm-rc/ResourceScriptStmt.cpp b/llvm/tools/llvm-rc/ResourceScriptStmt.cpp
index 62df7999252f..a7f3df0863e7 100644
--- a/llvm/tools/llvm-rc/ResourceScriptStmt.cpp
+++ b/llvm/tools/llvm-rc/ResourceScriptStmt.cpp
@@ -309,5 +309,9 @@ raw_ostream &ExStyleStmt::log(raw_ostream &OS) const {
return OS << "ExStyle: " << Value << "\n";
}
+raw_ostream &MenuStmt::log(raw_ostream &OS) const {
+ return OS << "Menu: " << Value << "\n";
+}
+
} // namespace rc
} // namespace llvm
diff --git a/llvm/tools/llvm-rc/ResourceScriptStmt.h b/llvm/tools/llvm-rc/ResourceScriptStmt.h
index 70e7cec9cb84..05865e582859 100644
--- a/llvm/tools/llvm-rc/ResourceScriptStmt.h
+++ b/llvm/tools/llvm-rc/ResourceScriptStmt.h
@@ -993,6 +993,19 @@ public:
Error visit(Visitor *V) const override { return V->visitExStyleStmt(this); }
};
+// MENU optional statement.
+//
+// Ref: https://learn.microsoft.com/en-us/windows/win32/menurc/menu-statement
+class MenuStmt : public OptionalStmt {
+public:
+ IntOrString Value;
+
+ MenuStmt(IntOrString NameOrId) : Value(NameOrId) {}
+ raw_ostream &log(raw_ostream &) const override;
+ Twine getResourceTypeName() const override { return "MENU"; }
+ Error visit(Visitor *V) const override { return V->visitMenuStmt(this); }
+};
+
// CLASS optional statement.
//
// Ref: msdn.microsoft.com/en-us/library/windows/desktop/aa380883(v=vs.85).aspx
diff --git a/llvm/tools/llvm-rc/ResourceVisitor.h b/llvm/tools/llvm-rc/ResourceVisitor.h
index a950cd7555ec..a121a0a507c2 100644
--- a/llvm/tools/llvm-rc/ResourceVisitor.h
+++ b/llvm/tools/llvm-rc/ResourceVisitor.h
@@ -28,6 +28,7 @@ class FontStmt;
class LanguageResource;
class StyleStmt;
class VersionStmt;
+class MenuStmt;
class Visitor {
public:
@@ -52,6 +53,7 @@ public:
virtual Error visitLanguageStmt(const LanguageResource *) = 0;
virtual Error visitStyleStmt(const StyleStmt *) = 0;
virtual Error visitVersionStmt(const VersionStmt *) = 0;
+ virtual Error visitMenuStmt(const MenuStmt *) = 0;
virtual ~Visitor() {}
};
diff --git a/llvm/unittests/BinaryFormat/CMakeLists.txt b/llvm/unittests/BinaryFormat/CMakeLists.txt
index f0c42a0dd02b..40d3bc4dca0b 100644
--- a/llvm/unittests/BinaryFormat/CMakeLists.txt
+++ b/llvm/unittests/BinaryFormat/CMakeLists.txt
@@ -5,6 +5,7 @@ set(LLVM_LINK_COMPONENTS
add_llvm_unittest(BinaryFormatTests
DwarfTest.cpp
+ ELFTest.cpp
MachOTest.cpp
MsgPackDocumentTest.cpp
MsgPackReaderTest.cpp
diff --git a/llvm/unittests/BinaryFormat/ELFTest.cpp b/llvm/unittests/BinaryFormat/ELFTest.cpp
new file mode 100644
index 000000000000..5dbf6ff8d5c9
--- /dev/null
+++ b/llvm/unittests/BinaryFormat/ELFTest.cpp
@@ -0,0 +1,32 @@
+//===- ELFTest.cpp --------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/BinaryFormat/ELF.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::ELF;
+
+namespace {
+TEST(ELFTest, OSABI) {
+ EXPECT_EQ(ELFOSABI_GNU, convertNameToOSABI("gnu"));
+ EXPECT_EQ(ELFOSABI_FREEBSD, convertNameToOSABI("freebsd"));
+ EXPECT_EQ(ELFOSABI_STANDALONE, convertNameToOSABI("standalone"));
+ EXPECT_EQ(ELFOSABI_NONE, convertNameToOSABI("none"));
+ // Test unrecognized strings.
+ EXPECT_EQ(ELFOSABI_NONE, convertNameToOSABI(""));
+ EXPECT_EQ(ELFOSABI_NONE, convertNameToOSABI("linux"));
+
+ EXPECT_EQ("gnu", convertOSABIToName(ELFOSABI_GNU));
+ EXPECT_EQ("freebsd", convertOSABIToName(ELFOSABI_FREEBSD));
+ EXPECT_EQ("standalone", convertOSABIToName(ELFOSABI_STANDALONE));
+ EXPECT_EQ("none", convertOSABIToName(ELFOSABI_NONE));
+ // Test unrecognized values.
+ EXPECT_EQ("none", convertOSABIToName(0xfe));
+}
+} // namespace
diff --git a/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp b/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
index ef80eed8d180..34a36ba68d7c 100644
--- a/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
@@ -745,6 +745,120 @@ TEST_F(AArch64GISelMITest, TestNumSignBitsConstant) {
EXPECT_EQ(3u, Info.computeNumSignBits(CopyRegNeg32));
}
+TEST_F(AArch64GISelMITest, TestNumSignBitsXOR) {
+ StringRef MIRString = " %c1:_(s8) = G_CONSTANT i8 1\n"
+ " %cn1:_(s8) = G_CONSTANT i8 -1\n"
+ " %c127:_(s8) = G_CONSTANT i8 127\n"
+ " %c32:_(s8) = G_CONSTANT i8 32\n"
+ " %cn32:_(s8) = G_CONSTANT i8 -32\n"
+
+ " %xor1:_(s8) = G_XOR %c1, %cn1\n"
+ " %Copy1:_(s8) = COPY %xor1\n"
+
+ " %xor2:_(s8) = G_XOR %c1, %c32\n"
+ " %Copy2:_(s8) = COPY %xor2\n"
+
+ " %xor3:_(s8) = G_XOR %c32, %c127\n"
+ " %Copy3:_(s8) = COPY %xor3\n"
+
+ " %xor4:_(s8) = G_XOR %cn32, %c127\n"
+ " %Copy4:_(s8) = COPY %xor4\n"
+
+ " %xor5:_(s8) = G_XOR %c127, %cn32\n"
+ " %Copy5:_(s8) = COPY %xor5\n";
+ setUp(MIRString);
+ if (!TM)
+ GTEST_SKIP();
+ Register Copy1 = Copies[Copies.size() - 5];
+ Register Copy2 = Copies[Copies.size() - 4];
+ Register Copy3 = Copies[Copies.size() - 3];
+ Register Copy4 = Copies[Copies.size() - 2];
+ Register Copy5 = Copies[Copies.size() - 1];
+
+ GISelKnownBits Info(*MF);
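+  // e.g. 1 ^ -1 = -2 (0b11111110), which has seven known sign bits.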
+ EXPECT_EQ(7u, Info.computeNumSignBits(Copy1));
+ EXPECT_EQ(2u, Info.computeNumSignBits(Copy2));
+ EXPECT_EQ(1u, Info.computeNumSignBits(Copy3));
+ EXPECT_EQ(1u, Info.computeNumSignBits(Copy4));
+ EXPECT_EQ(1u, Info.computeNumSignBits(Copy5));
+}
+
+TEST_F(AArch64GISelMITest, TestNumSignBitsOR) {
+ StringRef MIRString = " %c1:_(s8) = G_CONSTANT i8 1\n"
+ " %cn1:_(s8) = G_CONSTANT i8 -1\n"
+ " %c127:_(s8) = G_CONSTANT i8 127\n"
+ " %c32:_(s8) = G_CONSTANT i8 32\n"
+ " %cn32:_(s8) = G_CONSTANT i8 -32\n"
+
+ " %or1:_(s8) = G_OR %c1, %cn1\n"
+ " %Copy1:_(s8) = COPY %or1\n"
+
+ " %or2:_(s8) = G_OR %c1, %c32\n"
+ " %Copy2:_(s8) = COPY %or2\n"
+
+ " %or3:_(s8) = G_OR %c32, %c127\n"
+ " %Copy3:_(s8) = COPY %or3\n"
+
+ " %or4:_(s8) = G_OR %cn32, %c127\n"
+ " %Copy4:_(s8) = COPY %or4\n"
+
+ " %or5:_(s8) = G_OR %c127, %cn32\n"
+ " %Copy5:_(s8) = COPY %or5\n";
+ setUp(MIRString);
+ if (!TM)
+ GTEST_SKIP();
+ Register Copy1 = Copies[Copies.size() - 5];
+ Register Copy2 = Copies[Copies.size() - 4];
+ Register Copy3 = Copies[Copies.size() - 3];
+ Register Copy4 = Copies[Copies.size() - 2];
+ Register Copy5 = Copies[Copies.size() - 1];
+
+ GISelKnownBits Info(*MF);
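+  // e.g. 1 | -1 = -1 (0b11111111), where all eight bits are sign bits.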
+ EXPECT_EQ(8u, Info.computeNumSignBits(Copy1));
+ EXPECT_EQ(2u, Info.computeNumSignBits(Copy2));
+ EXPECT_EQ(1u, Info.computeNumSignBits(Copy3));
+ EXPECT_EQ(8u, Info.computeNumSignBits(Copy4));
+ EXPECT_EQ(8u, Info.computeNumSignBits(Copy5));
+}
+
+TEST_F(AArch64GISelMITest, TestNumSignBitsAND) {
+ StringRef MIRString = " %c1:_(s8) = G_CONSTANT i8 1\n"
+ " %cn1:_(s8) = G_CONSTANT i8 -1\n"
+ " %c127:_(s8) = G_CONSTANT i8 127\n"
+ " %c32:_(s8) = G_CONSTANT i8 32\n"
+ " %cn32:_(s8) = G_CONSTANT i8 -32\n"
+
+ " %and1:_(s8) = G_AND %c1, %cn1\n"
+ " %Copy1:_(s8) = COPY %and1\n"
+
+ " %and2:_(s8) = G_AND %c1, %c32\n"
+ " %Copy2:_(s8) = COPY %and2\n"
+
+ " %and3:_(s8) = G_AND %c32, %c127\n"
+ " %Copy3:_(s8) = COPY %and3\n"
+
+ " %and4:_(s8) = G_AND %cn32, %c127\n"
+ " %Copy4:_(s8) = COPY %and4\n"
+
+ " %and5:_(s8) = G_AND %c127, %cn32\n"
+ " %Copy5:_(s8) = COPY %and5\n";
+ setUp(MIRString);
+ if (!TM)
+ GTEST_SKIP();
+ Register Copy1 = Copies[Copies.size() - 5];
+ Register Copy2 = Copies[Copies.size() - 4];
+ Register Copy3 = Copies[Copies.size() - 3];
+ Register Copy4 = Copies[Copies.size() - 2];
+ Register Copy5 = Copies[Copies.size() - 1];
+
+ GISelKnownBits Info(*MF);
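+  // e.g. 1 & -1 = 1 (0b00000001), which has seven known sign bits.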
+ EXPECT_EQ(7u, Info.computeNumSignBits(Copy1));
+ EXPECT_EQ(8u, Info.computeNumSignBits(Copy2));
+ EXPECT_EQ(2u, Info.computeNumSignBits(Copy3));
+ EXPECT_EQ(1u, Info.computeNumSignBits(Copy4));
+ EXPECT_EQ(1u, Info.computeNumSignBits(Copy5));
+}
+
TEST_F(AArch64GISelMITest, TestNumSignBitsSext) {
StringRef MIRString = " %3:_(p0) = G_IMPLICIT_DEF\n"
" %4:_(s8) = G_LOAD %3 :: (load (s8))\n"
diff --git a/llvm/unittests/Object/ELFObjectFileTest.cpp b/llvm/unittests/Object/ELFObjectFileTest.cpp
index c4d2b4ae8b9a..c13dc0e3fab8 100644
--- a/llvm/unittests/Object/ELFObjectFileTest.cpp
+++ b/llvm/unittests/Object/ELFObjectFileTest.cpp
@@ -1504,3 +1504,46 @@ Sections:
"SHT_RELA section with index 1: failed to get a "
"relocated section: invalid section index: 255");
}
+
+TEST(ELFObjectFileTest, ELFSymbolRefLess) {
+ SmallString<0> Storage;
+ Expected<ELFObjectFile<ELF64LE>> ElfOrErr = toBinary<ELF64LE>(Storage, R"(
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_DYN
+ Machine: EM_X86_64
+)");
+
+ ASSERT_THAT_EXPECTED(ElfOrErr, Succeeded());
+ const ELFObjectFile<ELF64LE> &Obj = *ElfOrErr;
+
+ const uint32_t ValLow = 0x00000001;
+ const uint32_t ValHigh = 0x00000100;
+
+ auto MakeSymbol = [&Obj](size_t SymtabIndex, size_t SymbolIndex) {
+ DataRefImpl Data;
+ Data.d.a = SymtabIndex;
+ Data.d.b = SymbolIndex;
+ SymbolRef Sym(Data, &Obj);
+ return ELFSymbolRef(Sym);
+ };
+
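+  // ELFSymbolRef's operator< compares (symbol table index, symbol index)
+  // lexicographically; the checks below cover every ordering combination.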
+ ELFSymbolRef ELFSymLowLow = MakeSymbol(ValLow, ValLow);
+ ELFSymbolRef ELFSymLowHigh = MakeSymbol(ValLow, ValHigh);
+ ELFSymbolRef ELFSymHighLow = MakeSymbol(ValHigh, ValLow);
+ ELFSymbolRef ELFSymHighHigh = MakeSymbol(ValHigh, ValHigh);
+
+ EXPECT_TRUE(ELFSymLowLow < ELFSymLowHigh);
+ EXPECT_FALSE(ELFSymLowHigh < ELFSymLowLow);
+ EXPECT_FALSE(ELFSymLowLow < ELFSymLowLow);
+
+ EXPECT_TRUE(ELFSymLowLow < ELFSymHighHigh);
+ EXPECT_TRUE(ELFSymLowHigh < ELFSymHighLow);
+ EXPECT_TRUE(ELFSymLowLow < ELFSymHighLow);
+
+ EXPECT_FALSE(ELFSymHighLow < ELFSymLowHigh);
+ EXPECT_FALSE(ELFSymHighHigh < ELFSymLowLow);
+ EXPECT_FALSE(ELFSymHighLow < ELFSymLowLow);
+}
diff --git a/llvm/unittests/Support/YAMLIOTest.cpp b/llvm/unittests/Support/YAMLIOTest.cpp
index 401981f3841e..6ac0d1b412f0 100644
--- a/llvm/unittests/Support/YAMLIOTest.cpp
+++ b/llvm/unittests/Support/YAMLIOTest.cpp
@@ -2906,6 +2906,87 @@ TEST(YAMLIO, Numeric) {
}
//===----------------------------------------------------------------------===//
+// Test writing and reading escaped keys
+//===----------------------------------------------------------------------===//
+
+// Struct whose mapping keys exercise YAML quoting and escaping.
+struct QuotedKeyStruct {
+ int unquoted_bool;
+ int unquoted_null;
+ int unquoted_numeric;
+ int unquoted_str;
+ int colon;
+ int just_space;
+ int unprintable;
+};
+
+namespace llvm {
+namespace yaml {
+template <> struct MappingTraits<QuotedKeyStruct> {
+ static void mapping(IO &io, QuotedKeyStruct &map) {
+ io.mapRequired("true", map.unquoted_bool);
+ io.mapRequired("null", map.unquoted_null);
+ io.mapRequired("42", map.unquoted_numeric);
+ io.mapRequired("unquoted", map.unquoted_str);
+ io.mapRequired(":", map.colon);
+ io.mapRequired(" ", map.just_space);
+ char unprintableKey[] = {/* \f, form-feed */ 0xC, 0};
+ io.mapRequired(unprintableKey, map.unprintable);
+ }
+};
+} // namespace yaml
+} // namespace llvm
+
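+// Round-trip keys that YAML would otherwise misinterpret: plain scalars such
+// as "true", "null", and "42" stay unquoted, while ":", " ", and "\f" need
+// quoting or escaping on output.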
+TEST(YAMLIO, TestQuotedKeyRead) {
+ QuotedKeyStruct map = {};
+ Input yin("---\ntrue: 1\nnull: 2\n42: 3\nunquoted: 4\n':': 5\n' ': "
+ "6\n\"\\f\": 7\n...\n");
+ yin >> map;
+
+ EXPECT_FALSE(yin.error());
+ EXPECT_EQ(map.unquoted_bool, 1);
+ EXPECT_EQ(map.unquoted_null, 2);
+ EXPECT_EQ(map.unquoted_numeric, 3);
+ EXPECT_EQ(map.unquoted_str, 4);
+ EXPECT_EQ(map.colon, 5);
+ EXPECT_EQ(map.just_space, 6);
+ EXPECT_EQ(map.unprintable, 7);
+}
+
+TEST(YAMLIO, TestQuotedKeyWriteRead) {
+ std::string intermediate;
+ {
+ QuotedKeyStruct map = {1, 2, 3, 4, 5, 6, 7};
+ llvm::raw_string_ostream ostr(intermediate);
+ Output yout(ostr);
+ yout << map;
+ }
+
+ EXPECT_NE(std::string::npos, intermediate.find("true:"));
+ EXPECT_NE(std::string::npos, intermediate.find("null:"));
+ EXPECT_NE(std::string::npos, intermediate.find("42:"));
+ EXPECT_NE(std::string::npos, intermediate.find("unquoted:"));
+ EXPECT_NE(std::string::npos, intermediate.find("':':"));
+ EXPECT_NE(std::string::npos, intermediate.find("' '"));
+ EXPECT_NE(std::string::npos, intermediate.find("\"\\f\":"));
+
+ {
+ Input yin(intermediate);
+ QuotedKeyStruct map;
+ yin >> map;
+
+ EXPECT_FALSE(yin.error());
+ EXPECT_EQ(map.unquoted_bool, 1);
+ EXPECT_EQ(map.unquoted_null, 2);
+ EXPECT_EQ(map.unquoted_numeric, 3);
+ EXPECT_EQ(map.unquoted_str, 4);
+ EXPECT_EQ(map.colon, 5);
+ EXPECT_EQ(map.just_space, 6);
+ EXPECT_EQ(map.unprintable, 7);
+ }
+}
+
+//===----------------------------------------------------------------------===//
// Test PolymorphicTraits and TaggedScalarTraits
//===----------------------------------------------------------------------===//
diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
index c945c4fbcf63..9f23000d733d 100644
--- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
@@ -120,10 +120,14 @@ TEST(ParseArchString, RejectsInvalidBaseISA) {
EXPECT_EQ(toString(RISCVISAInfo::parseArchString(Input, true).takeError()),
"string must begin with rv32{i,e,g} or rv64{i,e,g}");
}
- for (StringRef Input : {"rv32j", "rv64k", "rv32_i"}) {
+
+ for (StringRef Input : {"rv32j", "rv32_i"}) {
EXPECT_EQ(toString(RISCVISAInfo::parseArchString(Input, true).takeError()),
- "first letter should be 'e', 'i' or 'g'");
+ "first letter after 'rv32' should be 'e', 'i' or 'g'");
}
+
+ EXPECT_EQ(toString(RISCVISAInfo::parseArchString("rv64k", true).takeError()),
+ "first letter after 'rv64' should be 'e', 'i' or 'g'");
}
TEST(ParseArchString, RejectsUnsupportedBaseISA) {
@@ -395,7 +399,7 @@ TEST(ParseArchString, AcceptsAmbiguousFromRelaxExtensions) {
TEST(ParseArchString, RejectsRelaxExtensionsNotStartWithEorIorG) {
EXPECT_EQ(
toString(RISCVISAInfo::parseArchString("rv32zba_im", true).takeError()),
- "first letter should be 'e', 'i' or 'g'");
+ "first letter after 'rv32' should be 'e', 'i' or 'g'");
}
TEST(ParseArchString,
diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp
index 78abf80e7aec..cf7e4398741c 100644
--- a/llvm/utils/TableGen/GlobalISelEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp
@@ -45,12 +45,10 @@
#include "llvm/Support/CodeGenCoverage.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
-#include "llvm/Support/SaveAndRestore.h"
#include "llvm/Support/ScopedPrinter.h"
#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
#include "llvm/TableGen/TableGenBackend.h"
-#include <numeric>
#include <string>
using namespace llvm;
@@ -792,8 +790,8 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
"nested predicate that uses operands");
TreePattern *TP = Predicate.getOrigPatFragRecord();
WaitingForNamedOperands = TP->getNumArgs();
- for (unsigned i = 0; i < WaitingForNamedOperands; ++i)
- StoreIdxForName[getScopedName(Call.Scope, TP->getArgName(i))] = i;
+ for (unsigned I = 0; I < WaitingForNamedOperands; ++I)
+ StoreIdxForName[getScopedName(Call.Scope, TP->getArgName(I))] = I;
}
InsnMatcher.addPredicate<GenericInstructionPredicateMatcher>(Predicate);
continue;
@@ -834,6 +832,11 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
return InsnMatcher;
}
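+  // G_FRAME_INDEX takes the frame index itself as its only source operand, so
+  // there are no child nodes left to import once the operand is added.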
+ if (SrcGIOrNull->TheDef->getName() == "G_FRAME_INDEX") {
+ InsnMatcher.addOperand(OpIdx++, Src.getName(), TempOpIdx);
+ return InsnMatcher;
+ }
+
// Special case because the operand order is changed from setcc. The
// predicate operand needs to be swapped from the last operand to the first
// source.
@@ -873,8 +876,8 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
if (IsIntrinsic && !II)
return failedImport("Expected IntInit containing intrinsic ID)");
- for (unsigned i = 0; i != NumChildren; ++i) {
- const TreePatternNode &SrcChild = Src.getChild(i);
+ for (unsigned I = 0; I != NumChildren; ++I) {
+ const TreePatternNode &SrcChild = Src.getChild(I);
// We need to determine the meaning of a literal integer based on the
// context. If this is a field required to be an immediate (such as an
@@ -883,19 +886,19 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
// argument that is required to be an immediate, we should not emit an LLT
// type check, and should not be looking for a G_CONSTANT defined
// register.
- bool OperandIsImmArg = SrcGIOrNull->isInOperandImmArg(i);
+ bool OperandIsImmArg = SrcGIOrNull->isInOperandImmArg(I);
// SelectionDAG allows pointers to be represented with iN since it doesn't
// distinguish between pointers and integers but they are different types
// in GlobalISel. Coerce integers to pointers to address space 0 if the
// context indicates a pointer.
//
- bool OperandIsAPointer = SrcGIOrNull->isInOperandAPointer(i);
+ bool OperandIsAPointer = SrcGIOrNull->isInOperandAPointer(I);
if (IsIntrinsic) {
// For G_INTRINSIC/G_INTRINSIC_W_SIDE_EFFECTS, the operand immediately
// following the defs is an intrinsic ID.
- if (i == 0) {
+ if (I == 0) {
OperandMatcher &OM =
InsnMatcher.addOperand(OpIdx++, SrcChild.getName(), TempOpIdx);
OM.addPredicate<IntrinsicIDOperandMatcher>(II);
@@ -906,8 +909,8 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
//
// Note that we have to look at the i-1th parameter, because we don't
// have the intrinsic ID in the intrinsic's parameter list.
- OperandIsAPointer |= II->isParamAPointer(i - 1);
- OperandIsImmArg |= II->isParamImmArg(i - 1);
+ OperandIsAPointer |= II->isParamAPointer(I - 1);
+ OperandIsImmArg |= II->isParamImmArg(I - 1);
}
if (auto Error =
@@ -962,9 +965,9 @@ Error GlobalISelEmitter::importChildMatcher(
// The "name" of a non-leaf complex pattern (MY_PAT $op1, $op2) is
// "MY_PAT:op1:op2" and the ones with same "name" represent same operand.
std::string PatternName = std::string(SrcChild.getOperator()->getName());
- for (unsigned i = 0; i < SrcChild.getNumChildren(); ++i) {
+ for (unsigned I = 0; I < SrcChild.getNumChildren(); ++I) {
PatternName += ":";
- PatternName += SrcChild.getChild(i).getName();
+ PatternName += SrcChild.getChild(I).getName();
}
SrcChildName = PatternName;
}
@@ -1037,11 +1040,11 @@ Error GlobalISelEmitter::importChildMatcher(
OM, SrcChild.getOperator(), TempOpIdx))
return Error;
- for (unsigned i = 0, e = SrcChild.getNumChildren(); i != e; ++i) {
- auto &SubOperand = SrcChild.getChild(i);
+ for (unsigned I = 0, E = SrcChild.getNumChildren(); I != E; ++I) {
+ auto &SubOperand = SrcChild.getChild(I);
if (!SubOperand.getName().empty()) {
if (auto Error = Rule.defineComplexSubOperand(
- SubOperand.getName(), SrcChild.getOperator(), RendererID, i,
+ SubOperand.getName(), SrcChild.getOperator(), RendererID, I,
SrcChildName))
return Error;
}
@@ -1223,10 +1226,16 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderer(
if (DstChild.getOperator()->getName() == "timm") {
DstMIBuilder.addRenderer<CopyRenderer>(DstChild.getName());
return InsertPt;
- } else if (DstChild.getOperator()->getName() == "imm") {
+ }
+ if (DstChild.getOperator()->getName() == "tframeindex") {
+ DstMIBuilder.addRenderer<CopyRenderer>(DstChild.getName());
+ return InsertPt;
+ }
+ if (DstChild.getOperator()->getName() == "imm") {
DstMIBuilder.addRenderer<CopyConstantAsImmRenderer>(DstChild.getName());
return InsertPt;
- } else if (DstChild.getOperator()->getName() == "fpimm") {
+ }
+ if (DstChild.getOperator()->getName() == "fpimm") {
DstMIBuilder.addRenderer<CopyFConstantAsFPImmRenderer>(
DstChild.getName());
return InsertPt;
@@ -1739,7 +1748,7 @@ Error GlobalISelEmitter::importDefaultOperandRenderers(
if (const DefInit *DefaultDefOp = dyn_cast<DefInit>(DefaultOp)) {
std::optional<LLTCodeGen> OpTyOrNone = MVTToLLT(N.getSimpleType(0));
- auto Def = DefaultDefOp->getDef();
+ auto *Def = DefaultDefOp->getDef();
if (Def->getName() == "undef_tied_input") {
unsigned TempRegID = M.allocateTempRegID();
M.insertAction<MakeTempRegisterAction>(InsertPt, *OpTyOrNone,
@@ -2440,13 +2449,13 @@ void GlobalISelEmitter::run(raw_ostream &OS) {
}
// Comparison function to order records by name.
- auto orderByName = [](const Record *A, const Record *B) {
+ auto OrderByName = [](const Record *A, const Record *B) {
return A->getName() < B->getName();
};
std::vector<Record *> ComplexPredicates =
RK.getAllDerivedDefinitions("GIComplexOperandMatcher");
- llvm::sort(ComplexPredicates, orderByName);
+ llvm::sort(ComplexPredicates, OrderByName);
std::vector<StringRef> CustomRendererFns;
transform(RK.getAllDerivedDefinitions("GICustomOperandRenderer"),
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
index a52cca3c95de..759cbe6c1564 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
@@ -1060,8 +1060,8 @@ def LLVM_vector_extract
}];
}
-def LLVM_experimental_vector_interleave2
- : LLVM_OneResultIntrOp<"experimental.vector.interleave2",
+def LLVM_vector_interleave2
+ : LLVM_OneResultIntrOp<"vector.interleave2",
/*overloadedResults=*/[0], /*overloadedOperands=*/[],
/*traits=*/[
Pure, AllTypesMatch<["vec1", "vec2"]>,
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgEnums.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgEnums.td
index 7a350d2c0142..6b4b073fc672 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgEnums.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgEnums.td
@@ -28,7 +28,8 @@ def UnaryFn : I32EnumAttr<"UnaryFn", "", [
I32EnumAttrCase<"sqrt", 8>,
I32EnumAttrCase<"rsqrt", 9>,
I32EnumAttrCase<"square", 10>,
- I32EnumAttrCase<"tanh", 11>
+ I32EnumAttrCase<"tanh", 11>,
+ I32EnumAttrCase<"erf", 12>
]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::linalg";
@@ -42,7 +43,8 @@ def BinaryFn : I32EnumAttr<"BinaryFn", "", [
I32EnumAttrCase<"max_signed", 5>,
I32EnumAttrCase<"min_signed", 6>,
I32EnumAttrCase<"max_unsigned", 7>,
- I32EnumAttrCase<"min_unsigned", 8>
+ I32EnumAttrCase<"min_unsigned", 8>,
+ I32EnumAttrCase<"powf", 9>
]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::linalg";
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
index b75675773475..584bfcd8b59d 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
@@ -515,6 +515,41 @@ structured_op: !LinalgStructuredOpConfig
scalar_arg: I
--- !LinalgOpConfig
metadata: !LinalgOpMetadata
+ name: erf
+  cpp_class_name: ErfOp
+ doc: |-
+ Applies erf(x) elementwise.
+
+ No numeric casting is performed on the input operand.
+structured_op: !LinalgStructuredOpConfig
+ args:
+ - !LinalgOperandDefConfig
+ name: I
+ kind: input_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ - !LinalgOperandDefConfig
+ name: O
+ kind: output_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ indexing_maps: !LinalgIndexingMapsConfig
+ static_indexing_maps:
+ - affine_map<() -> ()>
+ - affine_map<() -> ()>
+ iterator_types: []
+ assignments:
+ - !ScalarAssign
+ arg: O
+ value: !ScalarExpression
+ scalar_fn:
+ kind: unary
+ fn_name: erf
+ operands:
+ - !ScalarExpression
+ scalar_arg: I
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
name: elemwise_binary
cpp_class_name: ElemwiseBinaryOp
doc: |-
@@ -923,6 +958,57 @@ structured_op: !LinalgStructuredOpConfig
scalar_arg: rhs
--- !LinalgOpConfig
metadata: !LinalgOpMetadata
+ name: powf
+ cpp_class_name: PowFOp
+ doc: |-
+    Computes powf(lhs, rhs) elementwise. For powf(arg, 2) use `linalg.square`.
+
+ Only applies to floating point values.
+
+    The shapes and element types must be identical. The appropriate casts,
+    broadcasts and reductions should be done prior to calling this op.
+
+    This means reduction/broadcast/element cast semantics are explicit. Further
+ passes can take that into account when lowering this code. For example,
+ a `linalg.broadcast` + `linalg.powf` sequence can be lowered to a
+ `linalg.generic` with different affine maps for the two operands.
+structured_op: !LinalgStructuredOpConfig
+ args:
+ - !LinalgOperandDefConfig
+ name: lhs
+ kind: input_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ - !LinalgOperandDefConfig
+ name: rhs
+ kind: input_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ - !LinalgOperandDefConfig
+ name: O
+ kind: output_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ indexing_maps: !LinalgIndexingMapsConfig
+ static_indexing_maps:
+ - affine_map<() -> ()>
+ - affine_map<() -> ()>
+ - affine_map<() -> ()>
+ iterator_types: []
+ assignments:
+ - !ScalarAssign
+ arg: O
+ value: !ScalarExpression
+ scalar_fn:
+ kind: binary
+ fn_name: powf
+ operands:
+ - !ScalarExpression
+ scalar_arg: lhs
+ - !ScalarExpression
+ scalar_arg: rhs
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
name: matmul
cpp_class_name: MatmulOp
doc: |-
diff --git a/mlir/include/mlir/Tools/lsp-server-support/Transport.h b/mlir/include/mlir/Tools/lsp-server-support/Transport.h
index ce742be7a941..047d174234df 100644
--- a/mlir/include/mlir/Tools/lsp-server-support/Transport.h
+++ b/mlir/include/mlir/Tools/lsp-server-support/Transport.h
@@ -15,6 +15,7 @@
#ifndef MLIR_TOOLS_LSPSERVERSUPPORT_TRANSPORT_H
#define MLIR_TOOLS_LSPSERVERSUPPORT_TRANSPORT_H
+#include "mlir/Support/DebugStringHelper.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Support/LogicalResult.h"
#include "mlir/Tools/lsp-server-support/Logging.h"
@@ -100,6 +101,18 @@ using Callback = llvm::unique_function<void(llvm::Expected<T>)>;
template <typename T>
using OutgoingNotification = llvm::unique_function<void(const T &)>;
+/// An OutgoingRequest<T> is a function used to send an outgoing request with
+/// parameters of type T to the client.
+template <typename T>
+using OutgoingRequest =
+ llvm::unique_function<void(const T &, llvm::json::Value id)>;
+
+/// An `OutgoingRequestCallback` is invoked when an outgoing request to the
+/// client receives a response in turn. It is passed the original request's ID,
+/// as well as the result JSON.
+using OutgoingRequestCallback =
+ std::function<void(llvm::json::Value, llvm::Expected<llvm::json::Value>)>;
+
/// A handler used to process the incoming transport messages.
class MessageHandler {
public:
@@ -147,9 +160,15 @@ public:
void (ThisT::*handler)(const Param &)) {
notificationHandlers[method] = [method, handler,
thisPtr](llvm::json::Value rawParams) {
- llvm::Expected<Param> param = parse<Param>(rawParams, method, "request");
- if (!param)
- return llvm::consumeError(param.takeError());
+ llvm::Expected<Param> param =
+ parse<Param>(rawParams, method, "notification");
+ if (!param) {
+ return llvm::consumeError(
+ llvm::handleErrors(param.takeError(), [](const LSPError &lspError) {
+ Logger::error("JSON parsing error: {0}",
+ lspError.message.c_str());
+ }));
+ }
(thisPtr->*handler)(*param);
};
}
@@ -164,6 +183,26 @@ public:
};
}
+ /// Create an OutgoingRequest function that, when called, sends a request with
+ /// the given method via the transport. Should the outgoing request be
+ /// met with a response, the response callback is invoked to handle that
+ /// response.
+ template <typename T>
+ OutgoingRequest<T> outgoingRequest(llvm::StringLiteral method,
+ OutgoingRequestCallback callback) {
+ return [&, method, callback](const T &params, llvm::json::Value id) {
+ {
+ std::lock_guard<std::mutex> lock(responseHandlersMutex);
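+        // Key the pending callback by the request ID's string form; onReply
+        // performs the same debugString(id) lookup when the response arrives.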
+ responseHandlers.insert(
+ {debugString(id), std::make_pair(method.str(), callback)});
+ }
+
+ std::lock_guard<std::mutex> transportLock(transportOutputMutex);
+ Logger::info("--> {0}({1})", method, id);
+ transport.call(method, llvm::json::Value(params), id);
+ };
+ }
+
private:
template <typename HandlerT>
using HandlerMap = llvm::StringMap<llvm::unique_function<HandlerT>>;
@@ -172,6 +211,14 @@ private:
HandlerMap<void(llvm::json::Value, Callback<llvm::json::Value>)>
methodHandlers;
+ /// A pair of (1) the original request's method name, and (2) the callback
+ /// function to be invoked for responses.
+ using ResponseHandlerTy = std::pair<std::string, OutgoingRequestCallback>;
+ /// A mapping from request/response ID to response handler.
+ llvm::StringMap<ResponseHandlerTy> responseHandlers;
+ /// Mutex to guard insertion into the response handler map.
+ std::mutex responseHandlersMutex;
+
JSONTransport &transport;
/// Mutex to guard sending output messages to the transport.
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index 1b9975237c69..fe6bcc1c8b66 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -1738,7 +1738,7 @@ struct VectorInterleaveOpLowering
"InterleaveOp not rank 1");
// If the result is rank 1, then this directly maps to LLVM.
if (resultType.isScalable()) {
- rewriter.replaceOpWithNewOp<LLVM::experimental_vector_interleave2>(
+ rewriter.replaceOpWithNewOp<LLVM::vector_interleave2>(
interleaveOp, typeConverter->convertType(resultType),
adaptor.getLhs(), adaptor.getRhs());
return success();
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/OuterProductFusion.cpp b/mlir/lib/Dialect/ArmSME/Transforms/OuterProductFusion.cpp
index d3751d4ba7e7..39292c4533d6 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/OuterProductFusion.cpp
+++ b/mlir/lib/Dialect/ArmSME/Transforms/OuterProductFusion.cpp
@@ -86,8 +86,7 @@ static Value createInterleave2Intrinsic(RewriterBase &rewriter, Location loc,
auto inputType = cast<VectorType>(lhs.getType());
VectorType inputTypeX2 =
VectorType::Builder(inputType).setDim(0, inputType.getShape()[0] * 2);
- return rewriter.create<LLVM::experimental_vector_interleave2>(
- loc, inputTypeX2, lhs, rhs);
+ return rewriter.create<LLVM::vector_interleave2>(loc, inputTypeX2, lhs, rhs);
}
// Fuse two 'arm_sme.outerproduct' operations that are chained via the
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
index 531016130d1d..2d329a1f3d88 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
@@ -1382,14 +1382,27 @@ LogicalResult
bufferization::runOneShotBufferize(Operation *op,
const OneShotBufferizationOptions &options,
BufferizationStatistics *statistics) {
+ // copy-before-write deactivates the analysis. It cannot be used together with
+ // test-analysis-only.
assert(!(options.copyBeforeWrite && options.testAnalysisOnly) &&
"invalid combination of bufferization flags");
- if (!options.copyBeforeWrite) {
- // If a buffer is copied before every write, no analysis is needed.
+
+ if (options.copyBeforeWrite) {
+ // Copy buffer before each write. No analysis is needed.
+ } else {
+ // Run One-Shot Analysis and insert buffer copies (on the tensor level)
+ // only where needed. This is the default and much more efficient than
+ // copy-before-write.
if (failed(insertTensorCopies(op, options, statistics)))
return failure();
+
+ // If test-analysis-only is set, the IR was annotated with RaW conflict
+ // markers (attributes) during One-Shot Analysis.
+ if (options.testAnalysisOnly)
+ return success();
}
- if (options.testAnalysisOnly)
- return success();
+
+ // Bufferize the op and its nested ops. If options.copyBeforeWrite is set,
+ // a new buffer copy is allocated every time a buffer is written to.
return bufferizeOp(op, options, statistics);
}
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index 2436113dc423..f5e80553ae72 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -241,24 +241,26 @@ static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
map.map(operand.value(), entryBlock.getArgument(operand.index()));
// Clone the region of the gpu.launch operation into the gpu.func operation.
- // TODO: If cloneInto can be modified such that if a mapping for
- // a block exists, that block will be used to clone operations into (at the
- // end of the block), instead of creating a new block, this would be much
- // cleaner.
launchOpBody.cloneInto(&outlinedFuncBody, map);
- // Branch from entry of the gpu.func operation to the block that is cloned
- // from the entry block of the gpu.launch operation.
- Block &launchOpEntry = launchOpBody.front();
- Block *clonedLaunchOpEntry = map.lookup(&launchOpEntry);
- builder.setInsertionPointToEnd(&entryBlock);
- builder.create<cf::BranchOp>(loc, clonedLaunchOpEntry);
-
- outlinedFunc.walk([](gpu::TerminatorOp op) {
- OpBuilder replacer(op);
- replacer.create<gpu::ReturnOp>(op.getLoc());
- op.erase();
- });
+ // Replace the terminator op with returns.
+ for (Block &block : launchOpBody) {
+ Block *clonedBlock = map.lookup(&block);
+ auto terminator = dyn_cast<gpu::TerminatorOp>(clonedBlock->getTerminator());
+ if (!terminator)
+ continue;
+ OpBuilder replacer(terminator);
+ replacer.create<gpu::ReturnOp>(terminator->getLoc());
+ terminator->erase();
+ }
+
+  // Now splice the entry block of the gpu.launch operation onto the end of
+  // the gpu.func entry block, and erase the now-redundant block.
+ Block *clonedLaunchOpEntry = map.lookup(&launchOpBody.front());
+ entryBlock.getOperations().splice(entryBlock.getOperations().end(),
+ clonedLaunchOpEntry->getOperations());
+ clonedLaunchOpEntry->erase();
+
return outlinedFunc;
}
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index 5d10b59373ad..036005ce9d92 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -411,6 +411,8 @@ public:
return builder.create<arith::MulFOp>(arg.getLoc(), arg, arg);
case UnaryFn::tanh:
return builder.create<math::TanhOp>(arg.getLoc(), arg);
+ case UnaryFn::erf:
+ return builder.create<math::ErfOp>(arg.getLoc(), arg);
}
llvm_unreachable("unsupported unary function");
}
@@ -483,6 +485,9 @@ public:
if (allFloatingPoint)
return builder.create<arith::MinimumFOp>(arg0.getLoc(), arg0, arg1);
return builder.create<arith::MinUIOp>(arg0.getLoc(), arg0, arg1);
+ case BinaryFn::powf:
+ assert(allFloatingPoint);
+ return builder.create<math::PowFOp>(arg0.getLoc(), arg0, arg1);
}
llvm_unreachable("unsupported binary function");
}
diff --git a/mlir/lib/Dialect/Polynomial/IR/CMakeLists.txt b/mlir/lib/Dialect/Polynomial/IR/CMakeLists.txt
index 7f5b3255d5d9..d6e703b8b359 100644
--- a/mlir/lib/Dialect/Polynomial/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/Polynomial/IR/CMakeLists.txt
@@ -16,4 +16,5 @@ add_mlir_dialect_library(MLIRPolynomialDialect
MLIRSupport
MLIRDialect
MLIRIR
+ MLIRInferTypeOpInterface
)
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
index 59c3e49264db..34312df91299 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
@@ -222,7 +222,7 @@ public:
///
SmallVector<Value> getValPosits(TensorId tid) const {
SmallVector<Value> batchCrds = iters[tid].back().back()->getBatchCrds();
- Value lastLvlPos = iters[tid].back().back()->getCurPosition().first;
+ Value lastLvlPos = iters[tid].back().back()->getCurPosition().front();
batchCrds.push_back(lastLvlPos);
return batchCrds;
};
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp
index 60dca3c55dec..745c081247de 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp
@@ -94,8 +94,10 @@ public:
ValueRange getLvlBuffers() const override { return {}; }
- ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange, Value p,
- Value max) const override {
+ ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange batchPrefix,
+ ValueRange parentPos) const override {
+ assert(parentPos.size() == 1 && "Dense level can not be non-unique.");
+ Value p = parentPos.front();
Value posLo = MULI(p, lvlSize);
return {posLo, lvlSize};
}
@@ -112,9 +114,9 @@ public:
ValueRange getLvlBuffers() const override { return {}; }
- ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange, Value p,
- Value max) const override {
- assert(max == nullptr && "Dense level can not be non-unique.");
+ ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange,
+ ValueRange parentPos) const override {
+ assert(parentPos.size() == 1 && "Dense level can not be non-unique.");
// No need to linearize the position for non-annotated tensors.
return {C_IDX(0), lvlSize};
}
@@ -127,9 +129,11 @@ public:
: SparseLevel(tid, lvl, lt, lvlSize, {posBuffer, crdBuffer}) {}
ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange batchPrefix,
- Value p, Value max) const override {
- assert(max == nullptr &&
+ ValueRange parentPos) const override {
+ assert(parentPos.size() == 1 &&
"compressed level must be the first non-unique level.");
+ Value p = parentPos.front();
SmallVector<Value> memCrd(batchPrefix);
memCrd.push_back(p);
@@ -147,11 +151,11 @@ public:
: SparseLevel(tid, lvl, lt, lvlSize, {posBuffer, crdBuffer}) {}
ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange batchPrefix,
- Value p, Value max) const override {
- assert(max == nullptr &&
+ ValueRange parentPos) const override {
+ assert(parentPos.size() == 1 &&
"loose-compressed level must be the first non-unique level.");
SmallVector<Value> memCrd(batchPrefix);
-
+ Value p = parentPos.front();
p = MULI(p, C_IDX(2));
memCrd.push_back(p);
Value pLo = genIndexLoad(b, l, getPosBuf(), memCrd);
@@ -168,10 +172,13 @@ public:
: SparseLevel(tid, lvl, lt, lvlSize, {crdBuffer}) {}
ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange batchPrefix,
- Value p, Value segHi) const override {
+ ValueRange parentPos) const override {
+ assert(parentPos.size() == 1 || parentPos.size() == 2);
+ Value p = parentPos.front();
+ Value segHi = parentPos.size() == 2 ? parentPos.back() : nullptr;
+
if (segHi == nullptr)
return {p, ADDI(p, C_IDX(1))};
-
// Use the segHi as the loop upper bound.
return {p, segHi};
}
@@ -184,11 +191,12 @@ public:
: SparseLevel(tid, lvl, lt, lvlSize, {crdBuffer}) {}
ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange batchPrefix,
- Value p, Value max) const override {
- assert(max == nullptr && isUnique() && "n:m level can not be non-unique.");
+ ValueRange parentPos) const override {
+ assert(parentPos.size() == 1 && isUnique() &&
+ "n:m level can not be non-unique.");
// Each n:m blk has exactly n specified elements.
auto n = getN(lt);
- Value posLo = MULI(p, C_IDX(n));
+ Value posLo = MULI(parentPos.front(), C_IDX(n));
return {posLo, ADDI(posLo, C_IDX(n))};
}
};
@@ -316,23 +324,21 @@ public:
posHi = vs.back();
};
- ValuePair getCurPosition() const override { return {getItPos(), nullptr}; }
-
void genInitImpl(OpBuilder &b, Location l,
const SparseIterator *parent) override {
if (isBatchIterator() && batchCrds.size() <= stl.lvl)
batchCrds.resize(stl.lvl + 1, nullptr);
- Value pos = C_IDX(0);
- Value hi = nullptr;
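+    // Default to position zero; a non-batch parent iterator overrides this
+    // with its current position below.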
+ Value c0 = C_IDX(0);
+ ValueRange pPos = c0;
// If the parent iterator is a batch iterator, we also start from 0 (but
// on a different batch).
if (parent && !parent->isBatchIterator())
- std::tie(pos, hi) = parent->getCurPosition();
+ pPos = parent->getCurPosition();
ValueRange batchPrefix = parent ? parent->getBatchCrds() : ValueRange{};
- std::tie(posLo, posHi) = stl.peekRangeAt(b, l, batchPrefix, pos, hi);
+ std::tie(posLo, posHi) = stl.peekRangeAt(b, l, batchPrefix, pPos);
// Seek to the lowest position.
seek(posLo);
}
@@ -406,21 +412,19 @@ public:
return {b.getIndexType(), b.getIndexType()};
}
- ValuePair getCurPosition() const override { return {getPos(), getSegHi()}; }
-
void genInitImpl(OpBuilder &b, Location l,
const SparseIterator *parent) override {
+ Value c0 = C_IDX(0);
+ ValueRange pPos = c0;
- Value pos = C_IDX(0);
- Value hi = nullptr;
// If the parent iterator is a batch iterator, we also start from 0 (but
// on a different batch).
if (parent && !parent->isBatchIterator())
- std::tie(pos, hi) = parent->getCurPosition();
+ pPos = parent->getCurPosition();
Value posLo;
ValueRange batchPrefix = parent ? parent->getBatchCrds() : ValueRange{};
- std::tie(posLo, posHi) = stl.peekRangeAt(b, l, batchPrefix, pos, hi);
+ std::tie(posLo, posHi) = stl.peekRangeAt(b, l, batchPrefix, pPos);
seek({posLo, genSegmentHigh(b, l, posLo)});
}
@@ -505,7 +509,7 @@ public:
SmallVector<Value> serialize() const override { return wrap->serialize(); };
void deserialize(ValueRange vs) override { wrap->deserialize(vs); };
- ValuePair getCurPosition() const override { return wrap->getCurPosition(); }
+ ValueRange getCurPosition() const override { return wrap->getCurPosition(); }
void genInitImpl(OpBuilder &b, Location l,
const SparseIterator *parent) override {
@@ -756,9 +760,8 @@ public:
Value upperBound(OpBuilder &b, Location l) const override {
return subSect.subSectSz;
}
- std::pair<Value, Value> getCurPosition() const override {
- return wrap->getCurPosition();
- };
+
+ ValueRange getCurPosition() const override { return wrap->getCurPosition(); };
Value getNxLvlTupleId(OpBuilder &b, Location l) const {
if (randomAccessible()) {
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h
index 46b923250dd8..b692848ec67b 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h
@@ -36,8 +36,9 @@ public:
Value iv) const = 0;
/// Peeks the lower and upper bound to *fully* traverse the level with
- /// the given position `p` that the immediate parent level is current at.
- /// Returns a pair of values for *posLo* and *loopHi* respectively.
+  /// the given position `parentPos` (see SparseTensorIterator::getCurPosition())
+  /// at which the immediate parent level is currently positioned. Returns a
+  /// pair of values for *posLo* and *loopHi* respectively.
///
/// For a dense level, the *posLo* is the linearized position at beginning,
/// while *loopHi* is the largest *coordinate*, it also implies that the
@@ -45,12 +46,9 @@ public:
///
/// For a sparse level, [posLo, loopHi) specifies the range of index pointer
/// to load coordinate from the coordinate buffer.
- ///
- /// `bound` is only used when the level is `non-unique` and deduplication is
- /// required. It specifies the max upper bound of the non-unique segment.
virtual std::pair<Value, Value> peekRangeAt(OpBuilder &b, Location l,
- ValueRange batchPrefix, Value p,
- Value segHi = Value()) const = 0;
+ ValueRange batchPrefix,
+ ValueRange parentPos) const = 0;
Level getLevel() const { return lvl; }
LevelType getLT() const { return lt; }
@@ -199,18 +197,17 @@ public:
}
virtual Value genNotEndImpl(OpBuilder &b, Location l) = 0;
virtual Value derefImpl(OpBuilder &b, Location l) = 0;
- // Gets the current position and the optional *position high* (for
- // non-unique iterators), the value is essentially the number of sparse
- // coordinate that the iterator is current visiting. It should be able to
- // uniquely identify the sparse range for the next level. See
- // SparseTensorLevel::peekRangeAt();
+  // Gets the ValueRange whose values together specify the current position of
+  // the iterator. For a unique level, the position can be a single index
+  // pointing to the current coordinate being visited. For a non-unique level,
+  // an extra index for the `segment high` is needed to specify the range of
+  // duplicated coordinates. The ValueRange should be able to uniquely identify
+  // the sparse range for the next level. See SparseTensorLevel::peekRangeAt().
//
// Not every type of iterator supports the operation, e.g., non-empty
  // subsection iterator does not because it represents a range of coordinates
// instead of just one.
- virtual std::pair<Value, Value> getCurPosition() const {
- llvm_unreachable("unsupported");
- };
+ virtual ValueRange getCurPosition() const { return getCursor(); };
// Returns a pair of values for *upper*, *lower* bound respectively.
virtual std::pair<Value, Value> genForCond(OpBuilder &b, Location l) {
diff --git a/mlir/lib/Tools/lsp-server-support/Transport.cpp b/mlir/lib/Tools/lsp-server-support/Transport.cpp
index 64dea35614c0..1e90ab32281f 100644
--- a/mlir/lib/Tools/lsp-server-support/Transport.cpp
+++ b/mlir/lib/Tools/lsp-server-support/Transport.cpp
@@ -51,12 +51,12 @@ private:
Reply::Reply(const llvm::json::Value &id, llvm::StringRef method,
JSONTransport &transport, std::mutex &transportOutputMutex)
- : id(id), transport(&transport),
+ : method(method), id(id), transport(&transport),
transportOutputMutex(transportOutputMutex) {}
Reply::Reply(Reply &&other)
- : replied(other.replied.load()), id(std::move(other.id)),
- transport(other.transport),
+ : method(other.method), replied(other.replied.load()),
+ id(std::move(other.id)), transport(other.transport),
transportOutputMutex(other.transportOutputMutex) {
other.transport = nullptr;
}
@@ -117,21 +117,29 @@ bool MessageHandler::onCall(llvm::StringRef method, llvm::json::Value params,
bool MessageHandler::onReply(llvm::json::Value id,
llvm::Expected<llvm::json::Value> result) {
- // TODO: Add support for reply callbacks when support for outgoing messages is
- // added. For now, we just log an error on any replies received.
- Callback<llvm::json::Value> replyHandler =
- [&id](llvm::Expected<llvm::json::Value> result) {
- Logger::error(
- "received a reply with ID {0}, but there was no such call", id);
- if (!result)
- llvm::consumeError(result.takeError());
- };
-
- // Log and run the reply handler.
- if (result)
- replyHandler(std::move(result));
- else
- replyHandler(result.takeError());
+ // Find the response handler in the mapping. If it exists, move it out of the
+ // mapping and erase it.
+ ResponseHandlerTy responseHandler;
+ {
+ std::lock_guard<std::mutex> responseHandlersLock(responseHandlersMutex);
+ auto it = responseHandlers.find(debugString(id));
+ if (it != responseHandlers.end()) {
+ responseHandler = std::move(it->second);
+ responseHandlers.erase(it);
+ }
+ }
+
+ // If we found a response handler, invoke it. Otherwise, log an error.
+ if (responseHandler.second) {
+ Logger::info("--> reply:{0}({1})", responseHandler.first, id);
+ responseHandler.second(std::move(id), std::move(result));
+ } else {
+ Logger::error(
+ "received a reply with ID {0}, but there was no such outgoing request",
+ id);
+ if (!result)
+ llvm::consumeError(result.takeError());
+ }
return true;
}
diff --git a/mlir/python/mlir/dialects/linalg/opdsl/lang/comprehension.py b/mlir/python/mlir/dialects/linalg/opdsl/lang/comprehension.py
index f7bc81bd2f68..bb43ebf2b692 100644
--- a/mlir/python/mlir/dialects/linalg/opdsl/lang/comprehension.py
+++ b/mlir/python/mlir/dialects/linalg/opdsl/lang/comprehension.py
@@ -296,6 +296,7 @@ class UnaryFn:
rsqrt = UnaryFnType("rsqrt")
square = UnaryFnType("square")
tanh = UnaryFnType("tanh")
+ erf = UnaryFnType("erf")
class BinaryFnType:
@@ -335,6 +336,7 @@ class BinaryFn:
min_signed = BinaryFnType("min_signed")
max_unsigned = BinaryFnType("max_unsigned")
min_unsigned = BinaryFnType("min_unsigned")
+ powf = BinaryFnType("powf")
class TypeFnType:
diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
index 2c8864be1107..ca2bb0c5f7f8 100644
--- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
+++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
@@ -169,6 +169,18 @@ def tanh(
@linalg_structured_op
+def erf(
+ I=TensorDef(T1),
+ O=TensorDef(T1, output=True),
+):
+ """Applies erf(x) elementwise.
+
+ No numeric casting is performed on the input operand.
+ """
+ O[None] = UnaryFn.erf(I[None])
+
+
+@linalg_structured_op
def elemwise_binary(
lhs=TensorDef(T1),
rhs=TensorDef(T2),
@@ -319,6 +331,27 @@ def min(
@linalg_structured_op
+def powf(
+ lhs=TensorDef(T1),
+ rhs=TensorDef(T1),
+ O=TensorDef(T1, output=True),
+):
+    """Computes powf(lhs, rhs) elementwise. For powf(arg, 2) use `linalg.square`.
+
+ Only applies to floating point values.
+
+    The shapes and element types must be identical. The appropriate casts,
+    broadcasts and reductions should be done prior to calling this op.
+
+    This means reduction/broadcast/element cast semantics are explicit. Further
+ passes can take that into account when lowering this code. For example,
+ a `linalg.broadcast` + `linalg.powf` sequence can be lowered to a
+ `linalg.generic` with different affine maps for the two operands.
+ """
+ O[None] = BinaryFn.powf(lhs[None], rhs[None])
+
+
+@linalg_structured_op
def matmul(
A=TensorDef(T1, S.M, S.K),
B=TensorDef(T2, S.K, S.N),
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
index 1712d3d745b7..439f1e920e39 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
@@ -2517,7 +2517,7 @@ func.func @vector_interleave_1d(%a: vector<8xf32>, %b: vector<8xf32>) -> vector<
// CHECK-SAME: %[[LHS:.*]]: vector<[4]xi32>, %[[RHS:.*]]: vector<[4]xi32>)
func.func @vector_interleave_1d_scalable(%a: vector<[4]xi32>, %b: vector<[4]xi32>) -> vector<[8]xi32>
{
- // CHECK: %[[ZIP:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[LHS]], %[[RHS]]) : (vector<[4]xi32>, vector<[4]xi32>) -> vector<[8]xi32>
+ // CHECK: %[[ZIP:.*]] = "llvm.intr.vector.interleave2"(%[[LHS]], %[[RHS]]) : (vector<[4]xi32>, vector<[4]xi32>) -> vector<[8]xi32>
// CHECK: return %[[ZIP]]
%0 = vector.interleave %a, %b : vector<[4]xi32>
return %0 : vector<[8]xi32>
@@ -2541,7 +2541,7 @@ func.func @vector_interleave_2d(%a: vector<2x3xi8>, %b: vector<2x3xi8>) -> vecto
// CHECK-SAME: %[[LHS:.*]]: vector<2x[8]xi16>, %[[RHS:.*]]: vector<2x[8]xi16>)
func.func @vector_interleave_2d_scalable(%a: vector<2x[8]xi16>, %b: vector<2x[8]xi16>) -> vector<2x[16]xi16>
{
- // CHECK: llvm.intr.experimental.vector.interleave2
+ // CHECK: llvm.intr.vector.interleave2
// CHECK-NOT: vector.interleave {{.*}} : vector<2x[8]xi16>
%0 = vector.interleave %a, %b : vector<2x[8]xi16>
return %0 : vector<2x[16]xi16>
diff --git a/mlir/test/Dialect/ArmSME/outer-product-fusion.mlir b/mlir/test/Dialect/ArmSME/outer-product-fusion.mlir
index de9de86003e6..01f54a4cf186 100644
--- a/mlir/test/Dialect/ArmSME/outer-product-fusion.mlir
+++ b/mlir/test/Dialect/ArmSME/outer-product-fusion.mlir
@@ -4,10 +4,10 @@
// CHECK-SAME: %[[A0:.*]]: vector<[4]xf16>, %[[B0:.*]]: vector<[4]xf16>, %[[A1:.*]]: vector<[4]xf16>, %[[B1:.*]]: vector<[4]xf16>,
// CHECK-SAME: %[[A0_MASK:.*]]: vector<[4]xi1>, %[[B0_MASK:.*]]: vector<[4]xi1>, %[[A1_MASK:.*]]: vector<[4]xi1>, %[[B1_MASK:.*]]: vector<[4]xi1>
// CHECK-DAG: %[[ACC:.*]] = arith.constant dense<0.000000e+00> : vector<[4]x[4]xf32>
-// CHECK-DAG: %[[LHS:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[A0]], %[[A1]]) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
-// CHECK-DAG: %[[RHS:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[B0]], %[[B1]]) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
-// CHECK-DAG: %[[LHS_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[A0_MASK]], %[[A1_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
-// CHECK-DAG: %[[RHS_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[B0_MASK]], %[[B1_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
+// CHECK-DAG: %[[LHS:.*]] = "llvm.intr.vector.interleave2"(%[[A0]], %[[A1]]) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
+// CHECK-DAG: %[[RHS:.*]] = "llvm.intr.vector.interleave2"(%[[B0]], %[[B1]]) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
+// CHECK-DAG: %[[LHS_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[A0_MASK]], %[[A1_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
+// CHECK-DAG: %[[RHS_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[B0_MASK]], %[[B1_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
// CHECK-DAG: arm_sme.fmopa_2way %[[LHS]], %[[RHS]] acc(%[[ACC]]) masks(%[[LHS_MASK]], %[[RHS_MASK]]) : vector<[8]xf16>, vector<[8]xf16> into vector<[4]x[4]xf32>
func.func @outerproduct_add_widening_2way_f16f16f32(
%a0 : vector<[4]xf16>, %b0 : vector<[4]xf16>,
@@ -225,18 +225,18 @@ func.func @outerproduct_sub_widening_2way_unsigned_i16i16i32(
// CHECK-SAME: %[[A2_MASK:[a-z0-9]+]]: vector<[4]xi1>, %[[B2_MASK:[a-z0-9]+]]: vector<[4]xi1>,
// CHECK-SAME: %[[A3_MASK:[a-z0-9]+]]: vector<[4]xi1>, %[[B3_MASK:[a-z0-9]+]]: vector<[4]xi1>
// CHECK-DAG: %[[ACC:.*]] = arith.constant dense<0> : vector<[4]x[4]xi32>
-// CHECK-DAG: %[[LHS0:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[A0]], %[[A2]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
-// CHECK-DAG: %[[LHS1:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[A1]], %[[A3]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
-// CHECK-DAG: %[[RHS0:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[B0]], %[[B2]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
-// CHECK-DAG: %[[RHS1:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[B1]], %[[B3]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
-// CHECK-DAG: %[[LHS:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[LHS0]], %[[LHS1]]) : (vector<[8]xi8>, vector<[8]xi8>) -> vector<[16]xi8>
-// CHECK-DAG: %[[RHS:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[RHS0]], %[[RHS1]]) : (vector<[8]xi8>, vector<[8]xi8>) -> vector<[16]xi8>
-// CHECK-DAG: %[[LHS0_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[A0_MASK]], %[[A2_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
-// CHECK-DAG: %[[LHS1_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[A1_MASK]], %[[A3_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
-// CHECK-DAG: %[[RHS0_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[B0_MASK]], %[[B2_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
-// CHECK-DAG: %[[RHS1_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[B1_MASK]], %[[B3_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
-// CHECK-DAG: %[[LHS_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[LHS0_MASK]], %[[LHS1_MASK]]) : (vector<[8]xi1>, vector<[8]xi1>) -> vector<[16]xi1>
-// CHECK-DAG: %[[RHS_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[RHS0_MASK]], %[[RHS1_MASK]]) : (vector<[8]xi1>, vector<[8]xi1>) -> vector<[16]xi1>
+// CHECK-DAG: %[[LHS0:.*]] = "llvm.intr.vector.interleave2"(%[[A0]], %[[A2]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
+// CHECK-DAG: %[[LHS1:.*]] = "llvm.intr.vector.interleave2"(%[[A1]], %[[A3]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
+// CHECK-DAG: %[[RHS0:.*]] = "llvm.intr.vector.interleave2"(%[[B0]], %[[B2]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
+// CHECK-DAG: %[[RHS1:.*]] = "llvm.intr.vector.interleave2"(%[[B1]], %[[B3]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
+// CHECK-DAG: %[[LHS:.*]] = "llvm.intr.vector.interleave2"(%[[LHS0]], %[[LHS1]]) : (vector<[8]xi8>, vector<[8]xi8>) -> vector<[16]xi8>
+// CHECK-DAG: %[[RHS:.*]] = "llvm.intr.vector.interleave2"(%[[RHS0]], %[[RHS1]]) : (vector<[8]xi8>, vector<[8]xi8>) -> vector<[16]xi8>
+// CHECK-DAG: %[[LHS0_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[A0_MASK]], %[[A2_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
+// CHECK-DAG: %[[LHS1_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[A1_MASK]], %[[A3_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
+// CHECK-DAG: %[[RHS0_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[B0_MASK]], %[[B2_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
+// CHECK-DAG: %[[RHS1_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[B1_MASK]], %[[B3_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
+// CHECK-DAG: %[[LHS_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[LHS0_MASK]], %[[LHS1_MASK]]) : (vector<[8]xi1>, vector<[8]xi1>) -> vector<[16]xi1>
+// CHECK-DAG: %[[RHS_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[RHS0_MASK]], %[[RHS1_MASK]]) : (vector<[8]xi1>, vector<[8]xi1>) -> vector<[16]xi1>
// CHECK-DAG: arm_sme.smopa_4way %[[LHS]], %[[RHS]] acc(%[[ACC]]) masks(%[[LHS_MASK]], %[[RHS_MASK]]) : vector<[16]xi8>, vector<[16]xi8> into vector<[4]x[4]xi32>
func.func @outerproduct_add_widening_4way_signed_i8i8i32(
%a0 : vector<[4]xi8>, %b0 : vector<[4]xi8>,
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index 601add9a9f91..5e4724c9d309 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -54,14 +54,43 @@ func.func @launch() {
// CHECK-NEXT: %[[BDIM:.*]] = gpu.block_dim x
// CHECK-NEXT: = gpu.block_dim y
// CHECK-NEXT: = gpu.block_dim z
-// CHECK-NEXT: cf.br ^[[BLOCK:.*]]
-// CHECK-NEXT: ^[[BLOCK]]:
// CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> ()
// CHECK-NEXT: "some_op"(%[[BID]], %[[BDIM]]) : (index, index) -> ()
// CHECK-NEXT: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>
// -----
+// Verify that we can outline a CFG
+// CHECK-LABEL: gpu.func @launchCFG_kernel(
+// CHECK: cf.br
+// CHECK: gpu.return
+func.func @launchCFG() {
+ %0 = "op"() : () -> (f32)
+ %1 = "op"() : () -> (memref<?xf32, 1>)
+ %gDimX = arith.constant 8 : index
+ %gDimY = arith.constant 12 : index
+ %gDimZ = arith.constant 16 : index
+ %bDimX = arith.constant 20 : index
+ %bDimY = arith.constant 24 : index
+ %bDimZ = arith.constant 28 : index
+
+ gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY,
+ %grid_z = %gDimZ)
+ threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY,
+ %block_z = %bDimZ) {
+ "use"(%0): (f32) -> ()
+ cf.br ^bb1
+ ^bb1:
+ "some_op"(%bx, %block_x) : (index, index) -> ()
+ %42 = memref.load %1[%tx] : memref<?xf32, 1>
+ gpu.terminator
+ }
+ return
+}
+
+// -----
+
// This test checks that gpu-outlining can handle a gpu.launch kernel from an llvm.func
// CHECK-LABEL: @launch_from_llvm_func
llvm.func @launch_from_llvm_func() {
@@ -475,8 +504,6 @@ func.func @launch_cluster() {
// CHECK-NEXT: %[[CDIM:.*]] = gpu.cluster_dim x
// CHECK-NEXT: = gpu.cluster_dim y
// CHECK-NEXT: = gpu.cluster_dim z
-// CHECK-NEXT: cf.br ^[[BLOCK:.*]]
-// CHECK-NEXT: ^[[BLOCK]]:
// CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> ()
// CHECK-NEXT: "some_op"(%[[CID]], %[[BID]], %[[BDIM]]) : (index, index, index) -> ()
// CHECK-NEXT: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>
diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir
index de1ab9db8e8d..0914f0023210 100644
--- a/mlir/test/Dialect/LLVMIR/invalid.mlir
+++ b/mlir/test/Dialect/LLVMIR/invalid.mlir
@@ -1221,17 +1221,17 @@ func.func @extract_scalable_from_fixed_length_vector(%arg0 : vector<16xf32>) {
// -----
-func.func @experimental_vector_interleave2_bad_type0(%vec1: vector<[2]xf16>, %vec2 : vector<[4]xf16>) {
+func.func @vector_interleave2_bad_type0(%vec1: vector<[2]xf16>, %vec2 : vector<[4]xf16>) {
// expected-error@+1 {{op failed to verify that all of {vec1, vec2} have same type}}
- %0 = "llvm.intr.experimental.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
+ %0 = "llvm.intr.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
return
}
// -----
-func.func @experimental_vector_interleave2_bad_type1(%vec1: vector<[2]xf16>, %vec2 : vector<[2]xf16>) {
+func.func @vector_interleave2_bad_type1(%vec1: vector<[2]xf16>, %vec2 : vector<[2]xf16>) {
// expected-error@+1 {{op failed to verify that result has twice as many elements as 'vec1'}}
- %0 = "llvm.intr.experimental.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[2]xf16>) -> vector<[8]xf16>
+ %0 = "llvm.intr.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[2]xf16>) -> vector<[8]xf16>
return
}
@@ -1239,9 +1239,9 @@ func.func @experimental_vector_interleave2_bad_type1(%vec1: vector<[2]xf16>, %ve
/// result vector type is not scalable.
-func.func @experimental_vector_interleave2_bad_type2(%vec1: vector<[2]xf16>, %vec2 : vector<[2]xf16>) {
+func.func @vector_interleave2_bad_type2(%vec1: vector<[2]xf16>, %vec2 : vector<[2]xf16>) {
// expected-error@+1 {{op failed to verify that result has twice as many elements as 'vec1'}}
- %0 = "llvm.intr.experimental.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[2]xf16>) -> vector<4xf16>
+ %0 = "llvm.intr.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[2]xf16>) -> vector<4xf16>
return
}
@@ -1250,9 +1250,9 @@ func.func @experimental_vector_interleave2_bad_type2(%vec1: vector<[2]xf16>, %ve
/// element type doesn't match.
-func.func @experimental_vector_interleave2_bad_type3(%vec1: vector<[2]xf16>, %vec2 : vector<[2]xf16>) {
+func.func @vector_interleave2_bad_type3(%vec1: vector<[2]xf16>, %vec2 : vector<[2]xf16>) {
// expected-error@+1 {{op failed to verify that result has twice as many elements as 'vec1'}}
- %0 = "llvm.intr.experimental.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[2]xf16>) -> vector<[4]xf32>
+ %0 = "llvm.intr.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[2]xf16>) -> vector<[4]xf32>
return
}
diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
index 31acf2b95e46..3b94db389f54 100644
--- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir
+++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
@@ -342,10 +342,10 @@ func.func @mixed_vect(%arg0: vector<8xf32>, %arg1: vector<4xf32>, %arg2: vector<
return
}
-// CHECK-LABEL: @experimental_vector_interleave2
-func.func @experimental_vector_interleave2(%vec1: vector<[4]xf16>, %vec2 : vector<[4]xf16>) {
- // CHECK: = "llvm.intr.experimental.vector.interleave2"({{.*}}) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
- %0 = "llvm.intr.experimental.vector.interleave2"(%vec1, %vec2) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
+// CHECK-LABEL: @vector_interleave2
+func.func @vector_interleave2(%vec1: vector<[4]xf16>, %vec2 : vector<[4]xf16>) {
+ // CHECK: = "llvm.intr.vector.interleave2"({{.*}}) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
+ %0 = "llvm.intr.vector.interleave2"(%vec1, %vec2) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
return
}
diff --git a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
index add34412b92f..667ea3c18c8a 100644
--- a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
@@ -693,6 +693,27 @@ func.func @generalize_tanh(%arg: memref<7x14x21xf32>, %out: memref<7x14x21xf32>)
// -----
+func.func @generalize_erf(%arg: memref<7x14x21xf32>, %out: memref<7x14x21xf32>) {
+ linalg.erf ins(%arg : memref<7x14x21xf32>) outs(%out : memref<7x14x21xf32>)
+ return
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+// CHECK: func @generalize_erf
+// CHECK-SAME: (%[[ARG:.+]]: memref<7x14x21xf32>, %[[OUT:.+]]: memref<7x14x21xf32>)
+
+// CHECK: linalg.generic
+// CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"]}
+// CHECK-SAME: ins(%[[ARG]] : memref<7x14x21xf32>) outs(%[[OUT]] : memref<7x14x21xf32>)
+
+// CHECK: ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32)
+// CHECK-NEXT: %[[erf:.+]] = math.erf %[[BBARG0]] : f32
+// CHECK-NEXT: linalg.yield %[[erf]] : f32
+
+// -----
+
func.func @generalize_max(%lhs: memref<7x14x21xf32>, %rhs: memref<7x14x21xf32>,
%out: memref<7x14x21xf32>) {
linalg.max ins(%lhs, %rhs : memref<7x14x21xf32>, memref<7x14x21xf32>)
@@ -741,6 +762,33 @@ func.func @generalize_min(%lhs: memref<7x14x21xf32>, %rhs: memref<7x14x21xf32>,
// CHECK-NEXT: %[[min:.+]] = arith.minimumf %[[BBARG0]], %[[BBARG1]] : f32
// CHECK-NEXT: linalg.yield %[[min]] : f32
+
+// -----
+
+func.func @generalize_powf(%lhs: memref<7x14x21xf32>, %rhs: memref<7x14x21xf32>,
+ %out: memref<7x14x21xf32>) {
+ linalg.powf ins(%lhs, %rhs : memref<7x14x21xf32>, memref<7x14x21xf32>)
+ outs(%out : memref<7x14x21xf32>)
+ return
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+// CHECK: func @generalize_powf
+// CHECK-SAME: (%[[LHS:.+]]: memref<7x14x21xf32>, %[[RHS:.+]]: memref<7x14x21xf32>,
+// CHECK-SAME: %[[OUT:.+]]: memref<7x14x21xf32>)
+
+// CHECK: linalg.generic
+// CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP]], #[[MAP]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"]}
+// CHECK-SAME: ins(%[[LHS]], %[[RHS]] : memref<7x14x21xf32>, memref<7x14x21xf32>)
+// CHECK-SAME: outs(%[[OUT]] : memref<7x14x21xf32>)
+
+// CHECK: ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32, %[[BBARG2:.+]]: f32)
+// CHECK-NEXT: %[[powf:.+]] = math.powf %[[BBARG0]], %[[BBARG1]] : f32
+// CHECK-NEXT: linalg.yield %[[powf]] : f32
+
// -----
diff --git a/mlir/test/Dialect/Linalg/named-ops-fail.mlir b/mlir/test/Dialect/Linalg/named-ops-fail.mlir
index f66608e71ffc..e92a77aa7ad0 100644
--- a/mlir/test/Dialect/Linalg/named-ops-fail.mlir
+++ b/mlir/test/Dialect/Linalg/named-ops-fail.mlir
@@ -272,6 +272,22 @@ func.func @tanh_broadcast(%arg: memref<8x16xf32>, %out: memref<4x8x16xf32>) {
// -----
+func.func @erf_type_cast(%arg: memref<4x8x16xf16>, %out: memref<4x8x16xf32>) {
+ // CHECK: operand 1 ('f16') doesn't match the element type of the enclosing linalg.generic op ('f32')
+ linalg.erf ins(%arg : memref<4x8x16xf16>) outs(%out: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+func.func @erf_broadcast(%arg: memref<8x16xf32>, %out: memref<4x8x16xf32>) {
+ // CHECK: op expected operand rank (2) to match the result rank of indexing_map #0 (3)
+ linalg.erf ins(%arg : memref<8x16xf32>) outs(%out: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
func.func @max_type_cast(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf16>, %arg2: memref<4x8x16xf32>) {
// CHECK: op requires the same type for all operands and results
linalg.max ins(%arg0, %arg1 : memref<4x8x16xf32>, memref<4x8x16xf16>) outs(%arg2: memref<4x8x16xf32>)
@@ -301,3 +317,20 @@ func.func @min_broadcast(%arg0: memref<8x16xf32>, %arg1: memref<4x8x16xf32>, %ar
linalg.min ins(%arg0, %arg1 : memref<8x16xf32>, memref<4x8x16xf32>) outs(%arg2: memref<4x8x16xf32>)
return
}
+
+// -----
+
+func.func @powf_type_cast(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf16>, %arg2: memref<4x8x16xf32>) {
+ // CHECK: op requires the same type for all operands and results
+ linalg.powf ins(%arg0, %arg1 : memref<4x8x16xf32>, memref<4x8x16xf16>) outs(%arg2: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+func.func @powf_broadcast(%arg0: memref<8x16xf32>, %arg1: memref<4x8x16xf32>, %arg2: memref<4x8x16xf32>) {
+ // CHECK: op expected operand rank (2) to match the result rank of indexing_map #0 (3)
+ linalg.powf ins(%arg0, %arg1 : memref<8x16xf32>, memref<4x8x16xf32>) outs(%arg2: memref<4x8x16xf32>)
+ return
+}
+
diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir
index cf59f6736100..fefe5578947f 100644
--- a/mlir/test/Dialect/Linalg/named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/named-ops.mlir
@@ -1783,6 +1783,37 @@ func.func @tanh_tensor(%arg0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> {
// -----
+// CHECK-LABEL: func @erf_dynamic
+func.func @erf_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>) {
+ // CHECK: linalg.erf
+ // CHECK-SAME: ins(%{{.+}} : memref<?x?x?xf32>) outs(%{{.+}} : memref<?x?x?xf32>)
+ linalg.erf ins(%arg0 : memref<?x?x?xf32>) outs(%arg1: memref<?x?x?xf32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func @erf_static
+func.func @erf_static(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf32>) {
+ // CHECK: linalg.erf
+ // CHECK-SAME: ins(%{{.+}} : memref<4x8x16xf32>) outs(%{{.+}} : memref<4x8x16xf32>)
+ linalg.erf ins(%arg0 : memref<4x8x16xf32>) outs(%arg1: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func @erf_tensor
+func.func @erf_tensor(%arg0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> {
+ %0 = tensor.empty() : tensor<4x8x16xf32>
+ // CHECK: linalg.erf
+ // CHECK-SAME: ins(%{{.+}} : tensor<4x8x16xf32>) outs(%{{.+}} : tensor<4x8x16xf32>)
+ %1 = linalg.erf ins(%arg0 : tensor<4x8x16xf32>) outs(%0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32>
+ return %1 : tensor<4x8x16xf32>
+}
+
+// -----
+
// CHECK-LABEL: func @max_dynamic
func.func @max_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>, %arg2: memref<?x?x?xf32>) {
// CHECK: linalg.max
@@ -1851,6 +1882,40 @@ func.func @min_tensor(%arg0: tensor<4x8x16xf32>, %arg1: tensor<4x8x16xf32>) -> t
// -----
+// CHECK-LABEL: func @powf_dynamic
+func.func @powf_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>, %arg2: memref<?x?x?xf32>) {
+ // CHECK: linalg.powf
+ // CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<?x?x?xf32>, memref<?x?x?xf32>)
+ // CHECK-SAME: outs(%{{.+}} : memref<?x?x?xf32>)
+ linalg.powf ins(%arg0, %arg1 : memref<?x?x?xf32>, memref<?x?x?xf32>) outs(%arg2: memref<?x?x?xf32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func @powf_static
+func.func @powf_static(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf32>, %arg2: memref<4x8x16xf32>) {
+ // CHECK: linalg.powf
+ // CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<4x8x16xf32>, memref<4x8x16xf32>)
+ // CHECK-SAME: outs(%{{.+}} : memref<4x8x16xf32>)
+ linalg.powf ins(%arg0, %arg1 : memref<4x8x16xf32>, memref<4x8x16xf32>) outs(%arg2: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func @powf_tensor
+func.func @powf_tensor(%arg0: tensor<4x8x16xf32>, %arg1: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> {
+ %0 = tensor.empty() : tensor<4x8x16xf32>
+ // CHECK: linalg.powf
+ // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<4x8x16xf32>, tensor<4x8x16xf32>)
+ // CHECK-SAME: outs(%{{.+}} : tensor<4x8x16xf32>)
+ %1 = linalg.powf ins(%arg0, %arg1 : tensor<4x8x16xf32>, tensor<4x8x16xf32>) outs(%0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32>
+ return %1 : tensor<4x8x16xf32>
+}
+
+// -----
+
// CHECK-LABEL: func @fill_tensor
func.func @fill_tensor(%f: f32, %v: vector<2x4xf32>) -> (tensor<f32>, tensor<vector<2x4xf32>>) {
%e0 = tensor.empty() : tensor<f32>
diff --git a/mlir/unittests/CMakeLists.txt b/mlir/unittests/CMakeLists.txt
index 6fad249a0b2f..6d8aa290e82f 100644
--- a/mlir/unittests/CMakeLists.txt
+++ b/mlir/unittests/CMakeLists.txt
@@ -20,6 +20,7 @@ add_subdirectory(Support)
add_subdirectory(Rewrite)
add_subdirectory(TableGen)
add_subdirectory(Target)
+add_subdirectory(Tools)
add_subdirectory(Transforms)
if(MLIR_ENABLE_EXECUTION_ENGINE)
diff --git a/mlir/unittests/Tools/CMakeLists.txt b/mlir/unittests/Tools/CMakeLists.txt
new file mode 100644
index 000000000000..a97588d92866
--- /dev/null
+++ b/mlir/unittests/Tools/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(lsp-server-support)
diff --git a/mlir/unittests/Tools/lsp-server-support/CMakeLists.txt b/mlir/unittests/Tools/lsp-server-support/CMakeLists.txt
new file mode 100644
index 000000000000..3aa8b9c4bc77
--- /dev/null
+++ b/mlir/unittests/Tools/lsp-server-support/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_mlir_unittest(MLIRLspServerSupportTests
+ Transport.cpp
+)
+target_link_libraries(MLIRLspServerSupportTests
+ PRIVATE
+ MLIRLspServerSupportLib)
diff --git a/mlir/unittests/Tools/lsp-server-support/Transport.cpp b/mlir/unittests/Tools/lsp-server-support/Transport.cpp
new file mode 100644
index 000000000000..fee218405952
--- /dev/null
+++ b/mlir/unittests/Tools/lsp-server-support/Transport.cpp
@@ -0,0 +1,173 @@
+//===- Transport.cpp - LSP JSON transport unit tests ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Tools/lsp-server-support/Transport.h"
+#include "mlir/Tools/lsp-server-support/Logging.h"
+#include "mlir/Tools/lsp-server-support/Protocol.h"
+#include "llvm/Support/FileSystem.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using namespace mlir;
+using namespace mlir::lsp;
+using namespace testing;
+
+namespace {
+
+TEST(TransportTest, SendReply) {
+ std::string out;
+ llvm::raw_string_ostream os(out);
+ JSONTransport transport(nullptr, os);
+ MessageHandler handler(transport);
+
+ transport.reply(1989, nullptr);
+ EXPECT_THAT(out, HasSubstr("\"id\":1989"));
+ EXPECT_THAT(out, HasSubstr("\"result\":null"));
+}
+
+class TransportInputTest : public Test {
+ llvm::SmallVector<char> inputPath;
+ std::FILE *in = nullptr;
+  std::string output;
+ llvm::raw_string_ostream os;
+ std::optional<JSONTransport> transport = std::nullopt;
+ std::optional<MessageHandler> messageHandler = std::nullopt;
+
+protected:
+ TransportInputTest() : os(output) {}
+
+ void SetUp() override {
+ std::error_code ec =
+ llvm::sys::fs::createTemporaryFile("lsp-unittest", "json", inputPath);
+ ASSERT_FALSE(ec) << "Could not create temporary file: " << ec.message();
+
+ in = std::fopen(inputPath.data(), "r");
+ ASSERT_TRUE(in) << "Could not open temporary file: "
+ << std::strerror(errno);
+ transport.emplace(in, os, JSONStreamStyle::Delimited);
+ messageHandler.emplace(*transport);
+ }
+
+ void TearDown() override {
+ EXPECT_EQ(std::fclose(in), 0)
+ << "Could not close temporary file FD: " << std::strerror(errno);
+ std::error_code ec =
+ llvm::sys::fs::remove(inputPath, /*IgnoreNonExisting=*/false);
+ EXPECT_FALSE(ec) << "Could not remove temporary file '" << inputPath.data()
+ << "': " << ec.message();
+ }
+
+ void writeInput(StringRef buffer) {
+ std::error_code ec;
+ llvm::raw_fd_ostream os(inputPath.data(), ec);
+ ASSERT_FALSE(ec) << "Could not write to '" << inputPath.data()
+ << "': " << ec.message();
+ os << buffer;
+ os.close();
+ }
+
+ StringRef getOutput() const { return output; }
+ MessageHandler &getMessageHandler() { return *messageHandler; }
+
+ void runTransport() {
+ bool gotEOF = false;
+ llvm::Error err = llvm::handleErrors(
+ transport->run(*messageHandler), [&](const llvm::ECError &ecErr) {
+ gotEOF = ecErr.convertToErrorCode() == std::errc::io_error;
+ });
+ llvm::consumeError(std::move(err));
+ EXPECT_TRUE(gotEOF);
+ }
+};
+
+TEST_F(TransportInputTest, RequestWithInvalidParams) {
+ struct Handler {
+ void onMethod(const TextDocumentItem &params,
+ mlir::lsp::Callback<TextDocumentIdentifier> callback) {}
+ } handler;
+ getMessageHandler().method("invalid-params-request", &handler,
+ &Handler::onMethod);
+
+ writeInput("{\"jsonrpc\":\"2.0\",\"id\":92,"
+ "\"method\":\"invalid-params-request\",\"params\":{}}\n");
+ runTransport();
+ EXPECT_THAT(getOutput(), HasSubstr("error"));
+ EXPECT_THAT(getOutput(), HasSubstr("missing value at (root).uri"));
+}
+
+TEST_F(TransportInputTest, NotificationWithInvalidParams) {
+  // JSON parsing errors are only reported via error logging, so this test
+  // can't make expectations on the transport output; it still exercises the
+  // error path, and the logged error is visible when running manually.
+ Logger::setLogLevel(Logger::Level::Error);
+
+ struct Handler {
+ void onNotification(const TextDocumentItem &params) {}
+ } handler;
+ getMessageHandler().notification("invalid-params-notification", &handler,
+ &Handler::onNotification);
+
+ writeInput("{\"jsonrpc\":\"2.0\",\"method\":\"invalid-params-notification\","
+ "\"params\":{}}\n");
+ runTransport();
+}
+
+TEST_F(TransportInputTest, MethodNotFound) {
+ writeInput("{\"jsonrpc\":\"2.0\",\"id\":29,\"method\":\"ack\"}\n");
+ runTransport();
+ EXPECT_THAT(getOutput(), HasSubstr("\"id\":29"));
+ EXPECT_THAT(getOutput(), HasSubstr("\"error\""));
+ EXPECT_THAT(getOutput(), HasSubstr("\"message\":\"method not found: ack\""));
+}
+
+TEST_F(TransportInputTest, OutgoingNotification) {
+ auto notifyFn = getMessageHandler().outgoingNotification<CompletionList>(
+ "outgoing-notification");
+ notifyFn(CompletionList{});
+ EXPECT_THAT(getOutput(), HasSubstr("\"method\":\"outgoing-notification\""));
+}
+
+TEST_F(TransportInputTest, ResponseHandlerNotFound) {
+  // Unhandled responses are only reported via error logging, so this test
+  // can't make expectations on the transport output; it still exercises that
+  // code path, and the logged error is visible when running manually.
+ Logger::setLogLevel(Logger::Level::Error);
+ writeInput("{\"jsonrpc\":\"2.0\",\"id\":81,\"result\":null}\n");
+ runTransport();
+}
+
+TEST_F(TransportInputTest, OutgoingRequest) {
+ // Make some outgoing requests.
+ int responseCallbackInvoked = 0;
+ auto callFn = getMessageHandler().outgoingRequest<CompletionList>(
+ "outgoing-request",
+ [&responseCallbackInvoked](llvm::json::Value id,
+ llvm::Expected<llvm::json::Value> value) {
+ // Make expectations on the expected response.
+ EXPECT_EQ(id, 83);
+ ASSERT_TRUE((bool)value);
+ EXPECT_EQ(debugString(*value), "{\"foo\":6}");
+ responseCallbackInvoked += 1;
+ });
+ callFn({}, 82);
+ callFn({}, 83);
+ callFn({}, 84);
+ EXPECT_THAT(getOutput(), HasSubstr("\"method\":\"outgoing-request\""));
+ EXPECT_EQ(responseCallbackInvoked, 0);
+
+ // One of the requests receives a response. The message handler handles this
+ // response by invoking the callback from above. Subsequent responses with the
+ // same ID are ignored.
+ writeInput("{\"jsonrpc\":\"2.0\",\"id\":83,\"result\":{\"foo\":6}}\n"
+ "// -----\n"
+ "{\"jsonrpc\":\"2.0\",\"id\":83,\"result\":{\"bar\":8}}\n");
+ runTransport();
+ EXPECT_EQ(responseCallbackInvoked, 1);
+}
+} // namespace
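
The tests above drive the transport through a temporary file and a string stream. In a real server the same two classes are wired to actual streams; the following is a minimal sketch (not part of this change) that assumes the default Content-Length framed stream style rather than the test-only JSONStreamStyle::Delimited, with handler registration elided:

  #include "mlir/Tools/lsp-server-support/Transport.h"
  #include "llvm/Support/Error.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace mlir::lsp;

  int main() {
    // Read LSP messages from stdin, write framed replies to stdout.
    JSONTransport transport(stdin, llvm::outs());
    MessageHandler handler(transport);
    // handler.method(...) / handler.notification(...) registrations go here.

    // Run until the input closes; as the runTransport() helper above shows,
    // end-of-input itself surfaces as an error code.
    if (llvm::Error err = transport.run(handler)) {
      llvm::logAllUnhandledErrors(std::move(err), llvm::errs(), "lsp: ");
      return 1;
    }
    return 0;
  }
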
diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index abc8baa0805f..a416ac29873f 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -151,6 +151,25 @@ if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS)
message(FATAL_ERROR "Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS")
endif()
+set(LIBOMPTARGET_ALL_PLUGIN_TARGETS amdgpu cuda host)
+set(LIBOMPTARGET_PLUGINS_TO_BUILD "all" CACHE STRING
+ "Semicolon-separated list of plugins to use: cuda, amdgpu, host or \"all\".")
+
+if(LIBOMPTARGET_PLUGINS_TO_BUILD STREQUAL "all")
+ set(LIBOMPTARGET_PLUGINS_TO_BUILD ${LIBOMPTARGET_ALL_PLUGIN_TARGETS})
+endif()
+
+set(LIBOMPTARGET_ENUM_PLUGIN_TARGETS "")
+foreach(plugin IN LISTS LIBOMPTARGET_PLUGINS_TO_BUILD)
+ set(LIBOMPTARGET_ENUM_PLUGIN_TARGETS
+ "${LIBOMPTARGET_ENUM_PLUGIN_TARGETS}PLUGIN_TARGET(${plugin})\n")
+endforeach()
+string(STRIP ${LIBOMPTARGET_ENUM_PLUGIN_TARGETS} LIBOMPTARGET_ENUM_PLUGIN_TARGETS)
+configure_file(
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/Shared/Targets.def.in
+ ${CMAKE_CURRENT_BINARY_DIR}/include/Shared/Targets.def
+)
+
include_directories(${LIBOMPTARGET_LLVM_INCLUDE_DIRS})
# This is a list of all the targets that are supported/tested right now.
@@ -288,6 +307,7 @@ set(LIBOMPTARGET_GPU_LIBC_SUPPORT ${LLVM_LIBC_GPU_BUILD} CACHE BOOL
pythonize_bool(LIBOMPTARGET_GPU_LIBC_SUPPORT)
set(LIBOMPTARGET_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
+set(LIBOMPTARGET_BINARY_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include)
message(STATUS "OpenMP tools dir in libomptarget: ${LIBOMP_OMP_TOOLS_INCLUDE_DIR}")
if(LIBOMP_OMP_TOOLS_INCLUDE_DIR)
include_directories(${LIBOMP_OMP_TOOLS_INCLUDE_DIR})
diff --git a/offload/include/Shared/Targets.def.in b/offload/include/Shared/Targets.def.in
new file mode 100644
index 000000000000..f34b523b4542
--- /dev/null
+++ b/offload/include/Shared/Targets.def.in
@@ -0,0 +1,20 @@
+//===-- Shared/Targets.def - Target plugin enumerator -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Enumerates all of the supported target plugins that are available to the
+// offloading library.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PLUGIN_TARGET
+# error Please define the macro PLUGIN_TARGET(TargetName)
+#endif
+
+@LIBOMPTARGET_ENUM_PLUGIN_TARGETS@
+
+#undef PLUGIN_TARGET
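
For orientation: with the default LIBOMPTARGET_PLUGINS_TO_BUILD of "all" (amdgpu;cuda;host), the configure_file step in offload/CMakeLists.txt above substitutes @LIBOMPTARGET_ENUM_PLUGIN_TARGETS@ so that the generated Targets.def would read roughly:

  // Sketch of the generated include/Shared/Targets.def for the default list.
  PLUGIN_TARGET(amdgpu)
  PLUGIN_TARGET(cuda)
  PLUGIN_TARGET(host)

Configuring with a subset, e.g. -DLIBOMPTARGET_PLUGINS_TO_BUILD="cuda;host", emits only those entries, and the foreach loop below adds only those plugin subdirectories.
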
diff --git a/offload/plugins-nextgen/CMakeLists.txt b/offload/plugins-nextgen/CMakeLists.txt
index dbd82ac94517..df625e97c7eb 100644
--- a/offload/plugins-nextgen/CMakeLists.txt
+++ b/offload/plugins-nextgen/CMakeLists.txt
@@ -69,9 +69,12 @@ function(add_target_library target_name lib_name)
set_target_properties(${target_name} PROPERTIES CXX_VISIBILITY_PRESET protected)
endfunction()
-add_subdirectory(amdgpu)
-add_subdirectory(cuda)
-add_subdirectory(host)
+foreach(plugin IN LISTS LIBOMPTARGET_PLUGINS_TO_BUILD)
+ if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${plugin})
+ message(FATAL_ERROR "Unknown plugin target '${plugin}'")
+ endif()
+ add_subdirectory(${plugin})
+endforeach()
# Make sure the parent scope can see the plugins that will be created.
set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
diff --git a/offload/plugins-nextgen/common/CMakeLists.txt b/offload/plugins-nextgen/common/CMakeLists.txt
index a7350e662a7c..acf0af63f050 100644
--- a/offload/plugins-nextgen/common/CMakeLists.txt
+++ b/offload/plugins-nextgen/common/CMakeLists.txt
@@ -62,6 +62,7 @@ target_link_options(PluginCommon PUBLIC ${offload_link_flags})
target_include_directories(PluginCommon PUBLIC
${CMAKE_CURRENT_SOURCE_DIR}/include
${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
+ ${LIBOMPTARGET_BINARY_INCLUDE_DIR}
${LIBOMPTARGET_INCLUDE_DIR}
)
diff --git a/offload/src/CMakeLists.txt b/offload/src/CMakeLists.txt
index 8b7be98771a9..eda5a85ff1ab 100644
--- a/offload/src/CMakeLists.txt
+++ b/offload/src/CMakeLists.txt
@@ -37,6 +37,7 @@ add_llvm_library(omptarget
ADDITIONAL_HEADER_DIRS
${LIBOMPTARGET_INCLUDE_DIR}
+ ${LIBOMPTARGET_BINARY_INCLUDE_DIR}
LINK_COMPONENTS
Support
@@ -49,7 +50,9 @@ add_llvm_library(omptarget
NO_INSTALL_RPATH
BUILDTREE_ONLY
)
-target_include_directories(omptarget PRIVATE ${LIBOMPTARGET_INCLUDE_DIR})
+target_include_directories(omptarget PRIVATE
+ ${LIBOMPTARGET_INCLUDE_DIR} ${LIBOMPTARGET_BINARY_INCLUDE_DIR}
+)
if (LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
target_link_libraries(omptarget PRIVATE
@@ -65,20 +68,6 @@ target_compile_definitions(omptarget PRIVATE
target_compile_options(omptarget PUBLIC ${offload_compile_flags})
target_link_options(omptarget PUBLIC ${offload_link_flags})
-macro(check_plugin_target target)
-if (TARGET omptarget.rtl.${target})
- list(APPEND LIBOMPTARGET_PLUGINS_TO_LOAD ${target})
-endif()
-endmacro()
-
-set(LIBOMPTARGET_PLUGINS_TO_LOAD "" CACHE STRING
- "Comma separated list of plugin names to look for at runtime")
-if (NOT LIBOMPTARGET_PLUGINS_TO_LOAD)
- check_plugin_target(cuda)
- check_plugin_target(amdgpu)
- check_plugin_target(host)
-endif()
-
list(TRANSFORM LIBOMPTARGET_PLUGINS_TO_LOAD PREPEND "\"libomptarget.rtl.")
list(TRANSFORM LIBOMPTARGET_PLUGINS_TO_LOAD APPEND "\"")
list(JOIN LIBOMPTARGET_PLUGINS_TO_LOAD "," ENABLED_OFFLOAD_PLUGINS)
diff --git a/offload/src/PluginManager.cpp b/offload/src/PluginManager.cpp
index 792cae3e3dd5..dbb556c179e5 100644
--- a/offload/src/PluginManager.cpp
+++ b/offload/src/PluginManager.cpp
@@ -23,9 +23,6 @@ using namespace llvm::sys;
PluginManager *PM = nullptr;
-// List of all plugins that can support offloading.
-static const char *RTLNames[] = {ENABLED_OFFLOAD_PLUGINS};
-
Expected<std::unique_ptr<PluginAdaptorTy>>
PluginAdaptorTy::create(const std::string &Name) {
DP("Attempting to load library '%s'...\n", Name.c_str());
@@ -95,17 +92,19 @@ void PluginManager::init() {
// Attempt to open all the plugins and, if they exist, check if the interface
// is correct and if they are supporting any devices.
- for (const char *Name : RTLNames) {
- auto PluginAdaptorOrErr =
- PluginAdaptorTy::create(std::string(Name) + ".so");
- if (!PluginAdaptorOrErr) {
- [[maybe_unused]] std::string InfoMsg =
- toString(PluginAdaptorOrErr.takeError());
- DP("%s", InfoMsg.c_str());
- } else {
- PluginAdaptors.push_back(std::move(*PluginAdaptorOrErr));
- }
- }
+#define PLUGIN_TARGET(Name) \
+ do { \
+ auto PluginAdaptorOrErr = \
+ PluginAdaptorTy::create("libomptarget.rtl." #Name ".so"); \
+ if (!PluginAdaptorOrErr) { \
+ [[maybe_unused]] std::string InfoMsg = \
+ toString(PluginAdaptorOrErr.takeError()); \
+ DP("%s", InfoMsg.c_str()); \
+ } else { \
+ PluginAdaptors.push_back(std::move(*PluginAdaptorOrErr)); \
+ } \
+ } while (false);
+#include "Shared/Targets.def"
DP("RTLs loaded!\n");
}
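
Each PLUGIN_TARGET entry in the generated Targets.def instantiates the macro above once, so for example PLUGIN_TARGET(host) expands to roughly the following (a sketch of the preprocessed code):

  do {
    auto PluginAdaptorOrErr =
        PluginAdaptorTy::create("libomptarget.rtl.host.so");
    if (!PluginAdaptorOrErr) {
      [[maybe_unused]] std::string InfoMsg =
          toString(PluginAdaptorOrErr.takeError());
      DP("%s", InfoMsg.c_str());
    } else {
      PluginAdaptors.push_back(std::move(*PluginAdaptorOrErr));
    }
  } while (false);

This makes the candidate plugin set a build-time property derived from LIBOMPTARGET_PLUGINS_TO_BUILD, replacing the hard-coded RTLNames array removed above.
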
diff --git a/utils/bazel/llvm-project-overlay/llvm/driver.bzl b/utils/bazel/llvm-project-overlay/llvm/driver.bzl
index 888626d7cf84..10796d919834 100644
--- a/utils/bazel/llvm-project-overlay/llvm/driver.bzl
+++ b/utils/bazel/llvm-project-overlay/llvm/driver.bzl
@@ -37,7 +37,7 @@ _TOOLS = {
# aliases for a given tool.
_EXTRA_ALIASES = {
"clang": ["clang++", "clang-cl", "clang-cpp"],
- "lld": ["lld-link", "ld.lld", "ld64.lld", "wasm-ld"],
+ "lld": ["ld", "lld-link", "ld.lld", "ld64.lld", "wasm-ld"],
"llvm-ar": ["ranlib", "lib", "dlltool"],
"llvm-objcopy": ["bitcode-strip", "install-name-tool", "strip"],
"llvm-objdump": ["otool"],
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 6a6f8fc13410..52c874c344c5 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -6697,6 +6697,7 @@ cc_library(
]),
includes = ["include"],
deps = [
+ ":ArithDialect",
":IR",
":InferTypeOpInterface",
":PolynomialAttributesIncGen",